Home | History | Annotate | Download | only in etree
      1 #
      2 # ElementTree
      3 # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
      4 #
      5 # limited xpath support for element trees
      6 #
      7 # history:
      8 # 2003-05-23 fl   created
      9 # 2003-05-28 fl   added support for // etc
     10 # 2003-08-27 fl   fixed parsing of periods in element names
     11 # 2007-09-10 fl   new selection engine
     12 # 2007-09-12 fl   fixed parent selector
     13 # 2007-09-13 fl   added iterfind; changed findall to return a list
     14 # 2007-11-30 fl   added namespaces support
     15 # 2009-10-30 fl   added child element value filter
     16 #
     17 # Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
     18 #
     19 # fredrik@pythonware.com
     20 # http://www.pythonware.com
     21 #
     22 # --------------------------------------------------------------------
     23 # The ElementTree toolkit is
     24 #
     25 # Copyright (c) 1999-2009 by Fredrik Lundh
     26 #
     27 # By obtaining, using, and/or copying this software and/or its
     28 # associated documentation, you agree that you have read, understood,
     29 # and will comply with the following terms and conditions:
     30 #
     31 # Permission to use, copy, modify, and distribute this software and
     32 # its associated documentation for any purpose and without fee is
     33 # hereby granted, provided that the above copyright notice appears in
     34 # all copies, and that both that copyright notice and this permission
     35 # notice appear in supporting documentation, and that the name of
     36 # Secret Labs AB or the author not be used in advertising or publicity
     37 # pertaining to distribution of the software without specific, written
     38 # prior permission.
     39 #
     40 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
     41 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
     42 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
     43 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
     44 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
     45 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
     46 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
     47 # OF THIS SOFTWARE.
     48 # --------------------------------------------------------------------
     49 
     50 # Licensed to PSF under a Contributor Agreement.
     51 # See http://www.python.org/psf/license for licensing details.
     52 
     53 ##
     54 # Implementation module for XPath support.  There's usually no reason
     55 # to import this module directly; the <b>ElementTree</b> does this for
     56 # you, if needed.
     57 ##
     58 
     59 import re
     60 
     61 xpath_tokenizer_re = re.compile(
     62     "("
     63     "'[^']*'|\"[^\"]*\"|"
     64     "::|"
     65     "//?|"
     66     "\.\.|"
     67     "\(\)|"
     68     "[/.*:\[\]\(\)@=])|"
     69     "((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
     70     "\s+"
     71     )
     72 
     73 def xpath_tokenizer(pattern, namespaces=None):
     74     for token in xpath_tokenizer_re.findall(pattern):
     75         tag = token[1]
     76         if tag and tag[0] != "{" and ":" in tag:
     77             try:
     78                 prefix, uri = tag.split(":", 1)
     79                 if not namespaces:
     80                     raise KeyError
     81                 yield token[0], "{%s}%s" % (namespaces[prefix], uri)
     82             except KeyError:
     83                 raise SyntaxError("prefix %r not found in prefix map" % prefix)
     84         else:
     85             yield token
     86 
     87 def get_parent_map(context):
     88     parent_map = context.parent_map
     89     if parent_map is None:
     90         context.parent_map = parent_map = {}
     91         for p in context.root.iter():
     92             for e in p:
     93                 parent_map[e] = p
     94     return parent_map
     95 
     96 def prepare_child(next, token):
     97     tag = token[1]
     98     def select(context, result):
     99         for elem in result:
    100             for e in elem:
    101                 if e.tag == tag:
    102                     yield e
    103     return select
    104 
    105 def prepare_star(next, token):
    106     def select(context, result):
    107         for elem in result:
    108             for e in elem:
    109                 yield e
    110     return select
    111 
    112 def prepare_self(next, token):
    113     def select(context, result):
    114         for elem in result:
    115             yield elem
    116     return select
    117 
    118 def prepare_descendant(next, token):
    119     token = next()
    120     if token[0] == "*":
    121         tag = "*"
    122     elif not token[0]:
    123         tag = token[1]
    124     else:
    125         raise SyntaxError("invalid descendant")
    126     def select(context, result):
    127         for elem in result:
    128             for e in elem.iter(tag):
    129                 if e is not elem:
    130                     yield e
    131     return select
    132 
    133 def prepare_parent(next, token):
    134     def select(context, result):
    135         # FIXME: raise error if .. is applied at toplevel?
    136         parent_map = get_parent_map(context)
    137         result_map = {}
    138         for elem in result:
    139             if elem in parent_map:
    140                 parent = parent_map[elem]
    141                 if parent not in result_map:
    142                     result_map[parent] = None
    143                     yield parent
    144     return select
    145 
    146 def prepare_predicate(next, token):
    147     # FIXME: replace with real parser!!! refs:
    148     # http://effbot.org/zone/simple-iterator-parser.htm
    149     # http://javascript.crockford.com/tdop/tdop.html
    150     signature = []
    151     predicate = []
    152     while 1:
    153         token = next()
    154         if token[0] == "]":
    155             break
    156         if token[0] and token[0][:1] in "'\"":
    157             token = "'", token[0][1:-1]
    158         signature.append(token[0] or "-")
    159         predicate.append(token[1])
    160     signature = "".join(signature)
    161     # use signature to determine predicate type
    162     if signature == "@-":
    163         # [@attribute] predicate
    164         key = predicate[1]
    165         def select(context, result):
    166             for elem in result:
    167                 if elem.get(key) is not None:
    168                     yield elem
    169         return select
    170     if signature == "@-='":
    171         # [@attribute='value']
    172         key = predicate[1]
    173         value = predicate[-1]
    174         def select(context, result):
    175             for elem in result:
    176                 if elem.get(key) == value:
    177                     yield elem
    178         return select
    179     if signature == "-" and not re.match("\d+$", predicate[0]):
    180         # [tag]
    181         tag = predicate[0]
    182         def select(context, result):
    183             for elem in result:
    184                 if elem.find(tag) is not None:
    185                     yield elem
    186         return select
    187     if signature == "-='" and not re.match("\d+$", predicate[0]):
    188         # [tag='value']
    189         tag = predicate[0]
    190         value = predicate[-1]
    191         def select(context, result):
    192             for elem in result:
    193                 for e in elem.findall(tag):
    194                     if "".join(e.itertext()) == value:
    195                         yield elem
    196                         break
    197         return select
    198     if signature == "-" or signature == "-()" or signature == "-()-":
    199         # [index] or [last()] or [last()-index]
    200         if signature == "-":
    201             index = int(predicate[0]) - 1
    202         else:
    203             if predicate[0] != "last":
    204                 raise SyntaxError("unsupported function")
    205             if signature == "-()-":
    206                 try:
    207                     index = int(predicate[2]) - 1
    208                 except ValueError:
    209                     raise SyntaxError("unsupported expression")
    210             else:
    211                 index = -1
    212         def select(context, result):
    213             parent_map = get_parent_map(context)
    214             for elem in result:
    215                 try:
    216                     parent = parent_map[elem]
    217                     # FIXME: what if the selector is "*" ?
    218                     elems = list(parent.findall(elem.tag))
    219                     if elems[index] is elem:
    220                         yield elem
    221                 except (IndexError, KeyError):
    222                     pass
    223         return select
    224     raise SyntaxError("invalid predicate")
    225 
    226 ops = {
    227     "": prepare_child,
    228     "*": prepare_star,
    229     ".": prepare_self,
    230     "..": prepare_parent,
    231     "//": prepare_descendant,
    232     "[": prepare_predicate,
    233     }
    234 
    235 _cache = {}
    236 
    237 class _SelectorContext:
    238     parent_map = None
    239     def __init__(self, root):
    240         self.root = root
    241 
    242 # --------------------------------------------------------------------
    243 
    244 ##
    245 # Generate all matching objects.
    246 
    247 def iterfind(elem, path, namespaces=None):
    248     # compile selector pattern
    249     if path[-1:] == "/":
    250         path = path + "*" # implicit all (FIXME: keep this?)
    251     try:
    252         selector = _cache[path]
    253     except KeyError:
    254         if len(_cache) > 100:
    255             _cache.clear()
    256         if path[:1] == "/":
    257             raise SyntaxError("cannot use absolute path on element")
    258         next = iter(xpath_tokenizer(path, namespaces)).next
    259         token = next()
    260         selector = []
    261         while 1:
    262             try:
    263                 selector.append(ops[token[0]](next, token))
    264             except StopIteration:
    265                 raise SyntaxError("invalid path")
    266             try:
    267                 token = next()
    268                 if token[0] == "/":
    269                     token = next()
    270             except StopIteration:
    271                 break
    272         _cache[path] = selector
    273     # execute selector pattern
    274     result = [elem]
    275     context = _SelectorContext(elem)
    276     for select in selector:
    277         result = select(context, result)
    278     return result
    279 
    280 ##
    281 # Find first matching object.
    282 
    283 def find(elem, path, namespaces=None):
    284     try:
    285         return iterfind(elem, path, namespaces).next()
    286     except StopIteration:
    287         return None
    288 
    289 ##
    290 # Find all matching objects.
    291 
    292 def findall(elem, path, namespaces=None):
    293     return list(iterfind(elem, path, namespaces))
    294 
    295 ##
    296 # Find text for first matching object.
    297 
    298 def findtext(elem, path, default=None, namespaces=None):
    299     try:
    300         elem = iterfind(elem, path, namespaces).next()
    301         return elem.text or ""
    302     except StopIteration:
    303         return default
    304