Home | History | Annotate | Download | only in Lib
      1 #

      2 # Secret Labs' Regular Expression Engine

      3 #

      4 # convert re-style regular expression to sre pattern

      5 #

      6 # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.

      7 #

      8 # See the sre.py file for information on usage and redistribution.

      9 #

     10 
     11 """Internal support module for sre"""
     12 
     13 # XXX: show string offset and offending character for all errors

     14 
     15 import sys
     16 
     17 from sre_constants import *
     18 
     19 SPECIAL_CHARS = ".\\[{()*+?^$|"
     20 REPEAT_CHARS = "*+?{"
     21 
     22 DIGITS = set("0123456789")
     23 
     24 OCTDIGITS = set("01234567")
     25 HEXDIGITS = set("0123456789abcdefABCDEF")
     26 
     27 WHITESPACE = set(" \t\n\r\v\f")
     28 
     29 ESCAPES = {
     30     r"\a": (LITERAL, ord("\a")),
     31     r"\b": (LITERAL, ord("\b")),
     32     r"\f": (LITERAL, ord("\f")),
     33     r"\n": (LITERAL, ord("\n")),
     34     r"\r": (LITERAL, ord("\r")),
     35     r"\t": (LITERAL, ord("\t")),
     36     r"\v": (LITERAL, ord("\v")),
     37     r"\\": (LITERAL, ord("\\"))
     38 }
     39 
     40 CATEGORIES = {
     41     r"\A": (AT, AT_BEGINNING_STRING), # start of string

     42     r"\b": (AT, AT_BOUNDARY),
     43     r"\B": (AT, AT_NON_BOUNDARY),
     44     r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
     45     r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
     46     r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
     47     r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
     48     r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
     49     r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
     50     r"\Z": (AT, AT_END_STRING), # end of string

     51 }
     52 
     53 FLAGS = {
     54     # standard flags

     55     "i": SRE_FLAG_IGNORECASE,
     56     "L": SRE_FLAG_LOCALE,
     57     "m": SRE_FLAG_MULTILINE,
     58     "s": SRE_FLAG_DOTALL,
     59     "x": SRE_FLAG_VERBOSE,
     60     # extensions

     61     "t": SRE_FLAG_TEMPLATE,
     62     "u": SRE_FLAG_UNICODE,
     63 }
     64 
     65 class Pattern:
     66     # master pattern object.  keeps track of global attributes

     67     def __init__(self):
     68         self.flags = 0
     69         self.open = []
     70         self.groups = 1
     71         self.groupdict = {}
     72         self.lookbehind = 0
     73 
     74     def opengroup(self, name=None):
     75         gid = self.groups
     76         self.groups = gid + 1
     77         if name is not None:
     78             ogid = self.groupdict.get(name, None)
     79             if ogid is not None:
     80                 raise error, ("redefinition of group name %s as group %d; "
     81                               "was group %d" % (repr(name), gid,  ogid))
     82             self.groupdict[name] = gid
     83         self.open.append(gid)
     84         return gid
     85     def closegroup(self, gid):
     86         self.open.remove(gid)
     87     def checkgroup(self, gid):
     88         return gid < self.groups and gid not in self.open
     89 
     90 class SubPattern:
     91     # a subpattern, in intermediate form

     92     def __init__(self, pattern, data=None):
     93         self.pattern = pattern
     94         if data is None:
     95             data = []
     96         self.data = data
     97         self.width = None
     98     def dump(self, level=0):
     99         seqtypes = (tuple, list)
    100         for op, av in self.data:
    101             print level*"  " + op,
    102             if op == IN:
    103                 # member sublanguage

    104                 print
    105                 for op, a in av:
    106                     print (level+1)*"  " + op, a
    107             elif op == BRANCH:
    108                 print
    109                 for i, a in enumerate(av[1]):
    110                     if i:
    111                         print level*"  " + "or"
    112                     a.dump(level+1)
    113             elif op == GROUPREF_EXISTS:
    114                 condgroup, item_yes, item_no = av
    115                 print condgroup
    116                 item_yes.dump(level+1)
    117                 if item_no:
    118                     print level*"  " + "else"
    119                     item_no.dump(level+1)
    120             elif isinstance(av, seqtypes):
    121                 nl = 0
    122                 for a in av:
    123                     if isinstance(a, SubPattern):
    124                         if not nl:
    125                             print
    126                         a.dump(level+1)
    127                         nl = 1
    128                     else:
    129                         print a,
    130                         nl = 0
    131                 if not nl:
    132                     print
    133             else:
    134                 print av
    135     def __repr__(self):
    136         return repr(self.data)
    137     def __len__(self):
    138         return len(self.data)
    139     def __delitem__(self, index):
    140         del self.data[index]
    141     def __getitem__(self, index):
    142         if isinstance(index, slice):
    143             return SubPattern(self.pattern, self.data[index])
    144         return self.data[index]
    145     def __setitem__(self, index, code):
    146         self.data[index] = code
    147     def insert(self, index, code):
    148         self.data.insert(index, code)
    149     def append(self, code):
    150         self.data.append(code)
    151     def getwidth(self):
    152         # determine the width (min, max) for this subpattern

    153         if self.width:
    154             return self.width
    155         lo = hi = 0
    156         UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
    157         REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
    158         for op, av in self.data:
    159             if op is BRANCH:
    160                 i = MAXREPEAT - 1
    161                 j = 0
    162                 for av in av[1]:
    163                     l, h = av.getwidth()
    164                     i = min(i, l)
    165                     j = max(j, h)
    166                 lo = lo + i
    167                 hi = hi + j
    168             elif op is CALL:
    169                 i, j = av.getwidth()
    170                 lo = lo + i
    171                 hi = hi + j
    172             elif op is SUBPATTERN:
    173                 i, j = av[1].getwidth()
    174                 lo = lo + i
    175                 hi = hi + j
    176             elif op in REPEATCODES:
    177                 i, j = av[2].getwidth()
    178                 lo = lo + i * av[0]
    179                 hi = hi + j * av[1]
    180             elif op in UNITCODES:
    181                 lo = lo + 1
    182                 hi = hi + 1
    183             elif op == SUCCESS:
    184                 break
    185         self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
    186         return self.width
    187 
    188 class Tokenizer:
    189     def __init__(self, string):
    190         self.string = string
    191         self.index = 0
    192         self.__next()
    193     def __next(self):
    194         if self.index >= len(self.string):
    195             self.next = None
    196             return
    197         char = self.string[self.index]
    198         if char[0] == "\\":
    199             try:
    200                 c = self.string[self.index + 1]
    201             except IndexError:
    202                 raise error, "bogus escape (end of line)"
    203             char = char + c
    204         self.index = self.index + len(char)
    205         self.next = char
    206     def match(self, char, skip=1):
    207         if char == self.next:
    208             if skip:
    209                 self.__next()
    210             return 1
    211         return 0
    212     def get(self):
    213         this = self.next
    214         self.__next()
    215         return this
    216     def tell(self):
    217         return self.index, self.next
    218     def seek(self, index):
    219         self.index, self.next = index
    220 
    221 def isident(char):
    222     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
    223 
    224 def isdigit(char):
    225     return "0" <= char <= "9"
    226 
    227 def isname(name):
    228     # check that group name is a valid string

    229     if not isident(name[0]):
    230         return False
    231     for char in name[1:]:
    232         if not isident(char) and not isdigit(char):
    233             return False
    234     return True
    235 
    236 def _class_escape(source, escape):
    237     # handle escape code inside character class

    238     code = ESCAPES.get(escape)
    239     if code:
    240         return code
    241     code = CATEGORIES.get(escape)
    242     if code and code[0] == IN:
    243         return code
    244     try:
    245         c = escape[1:2]
    246         if c == "x":
    247             # hexadecimal escape (exactly two digits)

    248             while source.next in HEXDIGITS and len(escape) < 4:
    249                 escape = escape + source.get()
    250             escape = escape[2:]
    251             if len(escape) != 2:
    252                 raise error, "bogus escape: %s" % repr("\\" + escape)
    253             return LITERAL, int(escape, 16) & 0xff
    254         elif c in OCTDIGITS:
    255             # octal escape (up to three digits)

    256             while source.next in OCTDIGITS and len(escape) < 4:
    257                 escape = escape + source.get()
    258             escape = escape[1:]
    259             return LITERAL, int(escape, 8) & 0xff
    260         elif c in DIGITS:
    261             raise error, "bogus escape: %s" % repr(escape)
    262         if len(escape) == 2:
    263             return LITERAL, ord(escape[1])
    264     except ValueError:
    265         pass
    266     raise error, "bogus escape: %s" % repr(escape)
    267 
    268 def _escape(source, escape, state):
    269     # handle escape code in expression

    270     code = CATEGORIES.get(escape)
    271     if code:
    272         return code
    273     code = ESCAPES.get(escape)
    274     if code:
    275         return code
    276     try:
    277         c = escape[1:2]
    278         if c == "x":
    279             # hexadecimal escape

    280             while source.next in HEXDIGITS and len(escape) < 4:
    281                 escape = escape + source.get()
    282             if len(escape) != 4:
    283                 raise ValueError
    284             return LITERAL, int(escape[2:], 16) & 0xff
    285         elif c == "0":
    286             # octal escape

    287             while source.next in OCTDIGITS and len(escape) < 4:
    288                 escape = escape + source.get()
    289             return LITERAL, int(escape[1:], 8) & 0xff
    290         elif c in DIGITS:
    291             # octal escape *or* decimal group reference (sigh)

    292             if source.next in DIGITS:
    293                 escape = escape + source.get()
    294                 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
    295                     source.next in OCTDIGITS):
    296                     # got three octal digits; this is an octal escape

    297                     escape = escape + source.get()
    298                     return LITERAL, int(escape[1:], 8) & 0xff
    299             # not an octal escape, so this is a group reference

    300             group = int(escape[1:])
    301             if group < state.groups:
    302                 if not state.checkgroup(group):
    303                     raise error, "cannot refer to open group"
    304                 if state.lookbehind:
    305                     import warnings
    306                     warnings.warn('group references in lookbehind '
    307                                   'assertions are not supported',
    308                                   RuntimeWarning)
    309                 return GROUPREF, group
    310             raise ValueError
    311         if len(escape) == 2:
    312             return LITERAL, ord(escape[1])
    313     except ValueError:
    314         pass
    315     raise error, "bogus escape: %s" % repr(escape)
    316 
    317 def _parse_sub(source, state, nested=1):
    318     # parse an alternation: a|b|c

    319 
    320     items = []
    321     itemsappend = items.append
    322     sourcematch = source.match
    323     while 1:
    324         itemsappend(_parse(source, state))
    325         if sourcematch("|"):
    326             continue
    327         if not nested:
    328             break
    329         if not source.next or sourcematch(")", 0):
    330             break
    331         else:
    332             raise error, "pattern not properly closed"
    333 
    334     if len(items) == 1:
    335         return items[0]
    336 
    337     subpattern = SubPattern(state)
    338     subpatternappend = subpattern.append
    339 
    340     # check if all items share a common prefix

    341     while 1:
    342         prefix = None
    343         for item in items:
    344             if not item:
    345                 break
    346             if prefix is None:
    347                 prefix = item[0]
    348             elif item[0] != prefix:
    349                 break
    350         else:
    351             # all subitems start with a common "prefix".

    352             # move it out of the branch

    353             for item in items:
    354                 del item[0]
    355             subpatternappend(prefix)
    356             continue # check next one

    357         break
    358 
    359     # check if the branch can be replaced by a character set

    360     for item in items:
    361         if len(item) != 1 or item[0][0] != LITERAL:
    362             break
    363     else:
    364         # we can store this as a character set instead of a

    365         # branch (the compiler may optimize this even more)

    366         set = []
    367         setappend = set.append
    368         for item in items:
    369             setappend(item[0])
    370         subpatternappend((IN, set))
    371         return subpattern
    372 
    373     subpattern.append((BRANCH, (None, items)))
    374     return subpattern
    375 
    376 def _parse_sub_cond(source, state, condgroup):
    377     item_yes = _parse(source, state)
    378     if source.match("|"):
    379         item_no = _parse(source, state)
    380         if source.match("|"):
    381             raise error, "conditional backref with more than two branches"
    382     else:
    383         item_no = None
    384     if source.next and not source.match(")", 0):
    385         raise error, "pattern not properly closed"
    386     subpattern = SubPattern(state)
    387     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    388     return subpattern
    389 
    390 _PATTERNENDERS = set("|)")
    391 _ASSERTCHARS = set("=!<")
    392 _LOOKBEHINDASSERTCHARS = set("=!")
    393 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
    394 
    395 def _parse(source, state):
    396     # parse a simple pattern

    397     subpattern = SubPattern(state)
    398 
    399     # precompute constants into local variables

    400     subpatternappend = subpattern.append
    401     sourceget = source.get
    402     sourcematch = source.match
    403     _len = len
    404     PATTERNENDERS = _PATTERNENDERS
    405     ASSERTCHARS = _ASSERTCHARS
    406     LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    407     REPEATCODES = _REPEATCODES
    408 
    409     while 1:
    410 
    411         if source.next in PATTERNENDERS:
    412             break # end of subpattern

    413         this = sourceget()
    414         if this is None:
    415             break # end of pattern

    416 
    417         if state.flags & SRE_FLAG_VERBOSE:
    418             # skip whitespace and comments

    419             if this in WHITESPACE:
    420                 continue
    421             if this == "#":
    422                 while 1:
    423                     this = sourceget()
    424                     if this in (None, "\n"):
    425                         break
    426                 continue
    427 
    428         if this and this[0] not in SPECIAL_CHARS:
    429             subpatternappend((LITERAL, ord(this)))
    430 
    431         elif this == "[":
    432             # character set

    433             set = []
    434             setappend = set.append
    435 ##          if sourcematch(":"):

    436 ##              pass # handle character classes

    437             if sourcematch("^"):
    438                 setappend((NEGATE, None))
    439             # check remaining characters

    440             start = set[:]
    441             while 1:
    442                 this = sourceget()
    443                 if this == "]" and set != start:
    444                     break
    445                 elif this and this[0] == "\\":
    446                     code1 = _class_escape(source, this)
    447                 elif this:
    448                     code1 = LITERAL, ord(this)
    449                 else:
    450                     raise error, "unexpected end of regular expression"
    451                 if sourcematch("-"):
    452                     # potential range

    453                     this = sourceget()
    454                     if this == "]":
    455                         if code1[0] is IN:
    456                             code1 = code1[1][0]
    457                         setappend(code1)
    458                         setappend((LITERAL, ord("-")))
    459                         break
    460                     elif this:
    461                         if this[0] == "\\":
    462                             code2 = _class_escape(source, this)
    463                         else:
    464                             code2 = LITERAL, ord(this)
    465                         if code1[0] != LITERAL or code2[0] != LITERAL:
    466                             raise error, "bad character range"
    467                         lo = code1[1]
    468                         hi = code2[1]
    469                         if hi < lo:
    470                             raise error, "bad character range"
    471                         setappend((RANGE, (lo, hi)))
    472                     else:
    473                         raise error, "unexpected end of regular expression"
    474                 else:
    475                     if code1[0] is IN:
    476                         code1 = code1[1][0]
    477                     setappend(code1)
    478 
    479             # XXX: <fl> should move set optimization to compiler!

    480             if _len(set)==1 and set[0][0] is LITERAL:
    481                 subpatternappend(set[0]) # optimization

    482             elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
    483                 subpatternappend((NOT_LITERAL, set[1][1])) # optimization

    484             else:
    485                 # XXX: <fl> should add charmap optimization here

    486                 subpatternappend((IN, set))
    487 
    488         elif this and this[0] in REPEAT_CHARS:
    489             # repeat previous item

    490             if this == "?":
    491                 min, max = 0, 1
    492             elif this == "*":
    493                 min, max = 0, MAXREPEAT
    494 
    495             elif this == "+":
    496                 min, max = 1, MAXREPEAT
    497             elif this == "{":
    498                 if source.next == "}":
    499                     subpatternappend((LITERAL, ord(this)))
    500                     continue
    501                 here = source.tell()
    502                 min, max = 0, MAXREPEAT
    503                 lo = hi = ""
    504                 while source.next in DIGITS:
    505                     lo = lo + source.get()
    506                 if sourcematch(","):
    507                     while source.next in DIGITS:
    508                         hi = hi + sourceget()
    509                 else:
    510                     hi = lo
    511                 if not sourcematch("}"):
    512                     subpatternappend((LITERAL, ord(this)))
    513                     source.seek(here)
    514                     continue
    515                 if lo:
    516                     min = int(lo)
    517                     if min >= MAXREPEAT:
    518                         raise OverflowError("the repetition number is too large")
    519                 if hi:
    520                     max = int(hi)
    521                     if max >= MAXREPEAT:
    522                         raise OverflowError("the repetition number is too large")
    523                     if max < min:
    524                         raise error("bad repeat interval")
    525             else:
    526                 raise error, "not supported"
    527             # figure out which item to repeat

    528             if subpattern:
    529                 item = subpattern[-1:]
    530             else:
    531                 item = None
    532             if not item or (_len(item) == 1 and item[0][0] == AT):
    533                 raise error, "nothing to repeat"
    534             if item[0][0] in REPEATCODES:
    535                 raise error, "multiple repeat"
    536             if sourcematch("?"):
    537                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
    538             else:
    539                 subpattern[-1] = (MAX_REPEAT, (min, max, item))
    540 
    541         elif this == ".":
    542             subpatternappend((ANY, None))
    543 
    544         elif this == "(":
    545             group = 1
    546             name = None
    547             condgroup = None
    548             if sourcematch("?"):
    549                 group = 0
    550                 # options

    551                 if sourcematch("P"):
    552                     # python extensions

    553                     if sourcematch("<"):
    554                         # named group: skip forward to end of name

    555                         name = ""
    556                         while 1:
    557                             char = sourceget()
    558                             if char is None:
    559                                 raise error, "unterminated name"
    560                             if char == ">":
    561                                 break
    562                             name = name + char
    563                         group = 1
    564                         if not name:
    565                             raise error("missing group name")
    566                         if not isname(name):
    567                             raise error("bad character in group name %r" %
    568                                         name)
    569                     elif sourcematch("="):
    570                         # named backreference

    571                         name = ""
    572                         while 1:
    573                             char = sourceget()
    574                             if char is None:
    575                                 raise error, "unterminated name"
    576                             if char == ")":
    577                                 break
    578                             name = name + char
    579                         if not name:
    580                             raise error("missing group name")
    581                         if not isname(name):
    582                             raise error("bad character in backref group name "
    583                                         "%r" % name)
    584                         gid = state.groupdict.get(name)
    585                         if gid is None:
    586                             msg = "unknown group name: {0!r}".format(name)
    587                             raise error(msg)
    588                         if state.lookbehind:
    589                             import warnings
    590                             warnings.warn('group references in lookbehind '
    591                                           'assertions are not supported',
    592                                           RuntimeWarning)
    593                         subpatternappend((GROUPREF, gid))
    594                         continue
    595                     else:
    596                         char = sourceget()
    597                         if char is None:
    598                             raise error, "unexpected end of pattern"
    599                         raise error, "unknown specifier: ?P%s" % char
    600                 elif sourcematch(":"):
    601                     # non-capturing group

    602                     group = 2
    603                 elif sourcematch("#"):
    604                     # comment

    605                     while 1:
    606                         if source.next is None or source.next == ")":
    607                             break
    608                         sourceget()
    609                     if not sourcematch(")"):
    610                         raise error, "unbalanced parenthesis"
    611                     continue
    612                 elif source.next in ASSERTCHARS:
    613                     # lookahead assertions

    614                     char = sourceget()
    615                     dir = 1
    616                     if char == "<":
    617                         if source.next not in LOOKBEHINDASSERTCHARS:
    618                             raise error, "syntax error"
    619                         dir = -1 # lookbehind

    620                         char = sourceget()
    621                         state.lookbehind += 1
    622                     p = _parse_sub(source, state)
    623                     if dir < 0:
    624                         state.lookbehind -= 1
    625                     if not sourcematch(")"):
    626                         raise error, "unbalanced parenthesis"
    627                     if char == "=":
    628                         subpatternappend((ASSERT, (dir, p)))
    629                     else:
    630                         subpatternappend((ASSERT_NOT, (dir, p)))
    631                     continue
    632                 elif sourcematch("("):
    633                     # conditional backreference group

    634                     condname = ""
    635                     while 1:
    636                         char = sourceget()
    637                         if char is None:
    638                             raise error, "unterminated name"
    639                         if char == ")":
    640                             break
    641                         condname = condname + char
    642                     group = 2
    643                     if not condname:
    644                         raise error("missing group name")
    645                     if isname(condname):
    646                         condgroup = state.groupdict.get(condname)
    647                         if condgroup is None:
    648                             msg = "unknown group name: {0!r}".format(condname)
    649                             raise error(msg)
    650                     else:
    651                         try:
    652                             condgroup = int(condname)
    653                         except ValueError:
    654                             raise error, "bad character in group name"
    655                     if state.lookbehind:
    656                         import warnings
    657                         warnings.warn('group references in lookbehind '
    658                                       'assertions are not supported',
    659                                       RuntimeWarning)
    660                 else:
    661                     # flags

    662                     if not source.next in FLAGS:
    663                         raise error, "unexpected end of pattern"
    664                     while source.next in FLAGS:
    665                         state.flags = state.flags | FLAGS[sourceget()]
    666             if group:
    667                 # parse group contents

    668                 if group == 2:
    669                     # anonymous group

    670                     group = None
    671                 else:
    672                     group = state.opengroup(name)
    673                 if condgroup:
    674                     p = _parse_sub_cond(source, state, condgroup)
    675                 else:
    676                     p = _parse_sub(source, state)
    677                 if not sourcematch(")"):
    678                     raise error, "unbalanced parenthesis"
    679                 if group is not None:
    680                     state.closegroup(group)
    681                 subpatternappend((SUBPATTERN, (group, p)))
    682             else:
    683                 while 1:
    684                     char = sourceget()
    685                     if char is None:
    686                         raise error, "unexpected end of pattern"
    687                     if char == ")":
    688                         break
    689                     raise error, "unknown extension"
    690 
    691         elif this == "^":
    692             subpatternappend((AT, AT_BEGINNING))
    693 
    694         elif this == "$":
    695             subpattern.append((AT, AT_END))
    696 
    697         elif this and this[0] == "\\":
    698             code = _escape(source, this, state)
    699             subpatternappend(code)
    700 
    701         else:
    702             raise error, "parser error"
    703 
    704     return subpattern
    705 
    706 def parse(str, flags=0, pattern=None):
    707     # parse 're' pattern into list of (opcode, argument) tuples

    708 
    709     source = Tokenizer(str)
    710 
    711     if pattern is None:
    712         pattern = Pattern()
    713     pattern.flags = flags
    714     pattern.str = str
    715 
    716     p = _parse_sub(source, pattern, 0)
    717 
    718     tail = source.get()
    719     if tail == ")":
    720         raise error, "unbalanced parenthesis"
    721     elif tail:
    722         raise error, "bogus characters at end of regular expression"
    723 
    724     if flags & SRE_FLAG_DEBUG:
    725         p.dump()
    726 
    727     if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
    728         # the VERBOSE flag was switched on inside the pattern.  to be

    729         # on the safe side, we'll parse the whole thing again...

    730         return parse(str, p.pattern.flags)
    731 
    732     return p
    733 
    734 def parse_template(source, pattern):
    735     # parse 're' replacement string into list of literals and

    736     # group references

    737     s = Tokenizer(source)
    738     sget = s.get
    739     p = []
    740     a = p.append
    741     def literal(literal, p=p, pappend=a):
    742         if p and p[-1][0] is LITERAL:
    743             p[-1] = LITERAL, p[-1][1] + literal
    744         else:
    745             pappend((LITERAL, literal))
    746     sep = source[:0]
    747     if type(sep) is type(""):
    748         makechar = chr
    749     else:
    750         makechar = unichr
    751     while 1:
    752         this = sget()
    753         if this is None:
    754             break # end of replacement string

    755         if this and this[0] == "\\":
    756             # group

    757             c = this[1:2]
    758             if c == "g":
    759                 name = ""
    760                 if s.match("<"):
    761                     while 1:
    762                         char = sget()
    763                         if char is None:
    764                             raise error, "unterminated group name"
    765                         if char == ">":
    766                             break
    767                         name = name + char
    768                 if not name:
    769                     raise error, "missing group name"
    770                 try:
    771                     index = int(name)
    772                     if index < 0:
    773                         raise error, "negative group number"
    774                 except ValueError:
    775                     if not isname(name):
    776                         raise error, "bad character in group name"
    777                     try:
    778                         index = pattern.groupindex[name]
    779                     except KeyError:
    780                         msg = "unknown group name: {0!r}".format(name)
    781                         raise IndexError(msg)
    782                 a((MARK, index))
    783             elif c == "0":
    784                 if s.next in OCTDIGITS:
    785                     this = this + sget()
    786                     if s.next in OCTDIGITS:
    787                         this = this + sget()
    788                 literal(makechar(int(this[1:], 8) & 0xff))
    789             elif c in DIGITS:
    790                 isoctal = False
    791                 if s.next in DIGITS:
    792                     this = this + sget()
    793                     if (c in OCTDIGITS and this[2] in OCTDIGITS and
    794                         s.next in OCTDIGITS):
    795                         this = this + sget()
    796                         isoctal = True
    797                         literal(makechar(int(this[1:], 8) & 0xff))
    798                 if not isoctal:
    799                     a((MARK, int(this[1:])))
    800             else:
    801                 try:
    802                     this = makechar(ESCAPES[this][1])
    803                 except KeyError:
    804                     pass
    805                 literal(this)
    806         else:
    807             literal(this)
    808     # convert template to groups and literals lists

    809     i = 0
    810     groups = []
    811     groupsappend = groups.append
    812     literals = [None] * len(p)
    813     for c, s in p:
    814         if c is MARK:
    815             groupsappend((i, s))
    816             # literal[i] is already None

    817         else:
    818             literals[i] = s
    819         i = i + 1
    820     return groups, literals
    821 
    822 def expand_template(template, match):
    823     g = match.group
    824     sep = match.string[:0]
    825     groups, literals = template
    826     literals = literals[:]
    827     try:
    828         for index, group in groups:
    829             literals[index] = s = g(group)
    830             if s is None:
    831                 raise error, "unmatched group"
    832     except IndexError:
    833         raise error, "invalid group reference"
    834     return sep.join(literals)
    835