Home | History | Annotate | Download | only in Lib
      1 #
      2 # Secret Labs' Regular Expression Engine
      3 #
      4 # convert re-style regular expression to sre pattern
      5 #
      6 # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
      7 #
      8 # See the sre.py file for information on usage and redistribution.
      9 #
     10 
     11 """Internal support module for sre"""
     12 
     13 # XXX: show string offset and offending character for all errors
     14 
     15 from sre_constants import *
     16 
# Characters that _parse() does not treat as plain literals, and the subset
# of them that it handles as repetition operators.
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# Digit sets used when reading numeric escapes, group numbers and repeat
# counts.
DIGITS = frozenset("0123456789")

OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF")
# A backslash followed by one of these letters, when it is not a known
# escape, is rejected as a "bad escape" instead of being taken literally.
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Characters skipped by _parse() when it is called with verbose set.
WHITESPACE = frozenset(" \t\n\r\v\f")

# Opcode groups consulted by SubPattern.getwidth(): repeat operators (width
# is the body width scaled by the repeat bounds) and operators counted as
# exactly one character wide.
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})

# Two-character escapes that stand for one literal character.
# Note that r"\b" appears both here (backspace) and in CATEGORIES (word
# boundary): _class_escape() consults ESCAPES first, _escape() consults
# CATEGORIES first.
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# Two-character escapes that stand for a position assertion (AT) or a
# character category wrapped in a one-element IN set.  Only the IN entries
# are accepted inside a character class (see _class_escape()).
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# Flag letters accepted after "(?" in a pattern, mapped to their flag bits.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

# NOTE(review): the code that consults this mask is outside the visible part
# of the file; presumably these are the flags that can only apply to the
# whole pattern rather than to a single group -- confirm against the user.
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
                SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
     70 
class Verbose(Exception):
    # Marker exception with no payload of its own.
    # NOTE(review): nothing in the visible part of this module raises or
    # catches it; presumably it signals that verbose mode was switched on
    # part-way through parsing -- confirm against the code that uses it.
    pass
     73 
     74 class Pattern:
     75     # master pattern object.  keeps track of global attributes
     76     def __init__(self):
     77         self.flags = 0
     78         self.groupdict = {}
     79         self.groupwidths = [None]  # group 0
     80         self.lookbehindgroups = None
     81     @property
     82     def groups(self):
     83         return len(self.groupwidths)
     84     def opengroup(self, name=None):
     85         gid = self.groups
     86         self.groupwidths.append(None)
     87         if self.groups > MAXGROUPS:
     88             raise error("too many groups")
     89         if name is not None:
     90             ogid = self.groupdict.get(name, None)
     91             if ogid is not None:
     92                 raise error("redefinition of group name %r as group %d; "
     93                             "was group %d" % (name, gid,  ogid))
     94             self.groupdict[name] = gid
     95         return gid
     96     def closegroup(self, gid, p):
     97         self.groupwidths[gid] = p.getwidth()
     98     def checkgroup(self, gid):
     99         return gid < self.groups and self.groupwidths[gid] is not None
    100 
    101     def checklookbehindgroup(self, gid, source):
    102         if self.lookbehindgroups is not None:
    103             if not self.checkgroup(gid):
    104                 raise source.error('cannot refer to an open group')
    105             if gid >= self.lookbehindgroups:
    106                 raise source.error('cannot refer to group defined in the same '
    107                                    'lookbehind subpattern')
    108 
    109 class SubPattern:
    110     # a subpattern, in intermediate form
    111     def __init__(self, pattern, data=None):
    112         self.pattern = pattern
    113         if data is None:
    114             data = []
    115         self.data = data
    116         self.width = None
    117     def dump(self, level=0):
    118         nl = True
    119         seqtypes = (tuple, list)
    120         for op, av in self.data:
    121             print(level*"  " + str(op), end='')
    122             if op is IN:
    123                 # member sublanguage
    124                 print()
    125                 for op, a in av:
    126                     print((level+1)*"  " + str(op), a)
    127             elif op is BRANCH:
    128                 print()
    129                 for i, a in enumerate(av[1]):
    130                     if i:
    131                         print(level*"  " + "OR")
    132                     a.dump(level+1)
    133             elif op is GROUPREF_EXISTS:
    134                 condgroup, item_yes, item_no = av
    135                 print('', condgroup)
    136                 item_yes.dump(level+1)
    137                 if item_no:
    138                     print(level*"  " + "ELSE")
    139                     item_no.dump(level+1)
    140             elif isinstance(av, seqtypes):
    141                 nl = False
    142                 for a in av:
    143                     if isinstance(a, SubPattern):
    144                         if not nl:
    145                             print()
    146                         a.dump(level+1)
    147                         nl = True
    148                     else:
    149                         if not nl:
    150                             print(' ', end='')
    151                         print(a, end='')
    152                         nl = False
    153                 if not nl:
    154                     print()
    155             else:
    156                 print('', av)
    157     def __repr__(self):
    158         return repr(self.data)
    159     def __len__(self):
    160         return len(self.data)
    161     def __delitem__(self, index):
    162         del self.data[index]
    163     def __getitem__(self, index):
    164         if isinstance(index, slice):
    165             return SubPattern(self.pattern, self.data[index])
    166         return self.data[index]
    167     def __setitem__(self, index, code):
    168         self.data[index] = code
    169     def insert(self, index, code):
    170         self.data.insert(index, code)
    171     def append(self, code):
    172         self.data.append(code)
    173     def getwidth(self):
    174         # determine the width (min, max) for this subpattern
    175         if self.width is not None:
    176             return self.width
    177         lo = hi = 0
    178         for op, av in self.data:
    179             if op is BRANCH:
    180                 i = MAXREPEAT - 1
    181                 j = 0
    182                 for av in av[1]:
    183                     l, h = av.getwidth()
    184                     i = min(i, l)
    185                     j = max(j, h)
    186                 lo = lo + i
    187                 hi = hi + j
    188             elif op is CALL:
    189                 i, j = av.getwidth()
    190                 lo = lo + i
    191                 hi = hi + j
    192             elif op is SUBPATTERN:
    193                 i, j = av[-1].getwidth()
    194                 lo = lo + i
    195                 hi = hi + j
    196             elif op in _REPEATCODES:
    197                 i, j = av[2].getwidth()
    198                 lo = lo + i * av[0]
    199                 hi = hi + j * av[1]
    200             elif op in _UNITCODES:
    201                 lo = lo + 1
    202                 hi = hi + 1
    203             elif op is GROUPREF:
    204                 i, j = self.pattern.groupwidths[av]
    205                 lo = lo + i
    206                 hi = hi + j
    207             elif op is GROUPREF_EXISTS:
    208                 i, j = av[1].getwidth()
    209                 if av[2] is not None:
    210                     l, h = av[2].getwidth()
    211                     i = min(i, l)
    212                     j = max(j, h)
    213                 else:
    214                     i = 0
    215                 lo = lo + i
    216                 hi = hi + j
    217             elif op is SUCCESS:
    218                 break
    219         self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
    220         return self.width
    221 
    222 class Tokenizer:
    223     def __init__(self, string):
    224         self.istext = isinstance(string, str)
    225         self.string = string
    226         if not self.istext:
    227             string = str(string, 'latin1')
    228         self.decoded_string = string
    229         self.index = 0
    230         self.next = None
    231         self.__next()
    232     def __next(self):
    233         index = self.index
    234         try:
    235             char = self.decoded_string[index]
    236         except IndexError:
    237             self.next = None
    238             return
    239         if char == "\\":
    240             index += 1
    241             try:
    242                 char += self.decoded_string[index]
    243             except IndexError:
    244                 raise error("bad escape (end of pattern)",
    245                             self.string, len(self.string) - 1) from None
    246         self.index = index + 1
    247         self.next = char
    248     def match(self, char):
    249         if char == self.next:
    250             self.__next()
    251             return True
    252         return False
    253     def get(self):
    254         this = self.next
    255         self.__next()
    256         return this
    257     def getwhile(self, n, charset):
    258         result = ''
    259         for _ in range(n):
    260             c = self.next
    261             if c not in charset:
    262                 break
    263             result += c
    264             self.__next()
    265         return result
    266     def getuntil(self, terminator):
    267         result = ''
    268         while True:
    269             c = self.next
    270             self.__next()
    271             if c is None:
    272                 if not result:
    273                     raise self.error("missing group name")
    274                 raise self.error("missing %s, unterminated name" % terminator,
    275                                  len(result))
    276             if c == terminator:
    277                 if not result:
    278                     raise self.error("missing group name", 1)
    279                 break
    280             result += c
    281         return result
    282     @property
    283     def pos(self):
    284         return self.index - len(self.next or '')
    285     def tell(self):
    286         return self.index - len(self.next or '')
    287     def seek(self, index):
    288         self.index = index
    289         self.__next()
    290 
    291     def error(self, msg, offset=0):
    292         return error(msg, self.string, self.tell() - offset)
    293 
    294 def _class_escape(source, escape):
    295     # handle escape code inside character class
    296     code = ESCAPES.get(escape)
    297     if code:
    298         return code
    299     code = CATEGORIES.get(escape)
    300     if code and code[0] is IN:
    301         return code
    302     try:
    303         c = escape[1:2]
    304         if c == "x":
    305             # hexadecimal escape (exactly two digits)
    306             escape += source.getwhile(2, HEXDIGITS)
    307             if len(escape) != 4:
    308                 raise source.error("incomplete escape %s" % escape, len(escape))
    309             return LITERAL, int(escape[2:], 16)
    310         elif c == "u" and source.istext:
    311             # unicode escape (exactly four digits)
    312             escape += source.getwhile(4, HEXDIGITS)
    313             if len(escape) != 6:
    314                 raise source.error("incomplete escape %s" % escape, len(escape))
    315             return LITERAL, int(escape[2:], 16)
    316         elif c == "U" and source.istext:
    317             # unicode escape (exactly eight digits)
    318             escape += source.getwhile(8, HEXDIGITS)
    319             if len(escape) != 10:
    320                 raise source.error("incomplete escape %s" % escape, len(escape))
    321             c = int(escape[2:], 16)
    322             chr(c) # raise ValueError for invalid code
    323             return LITERAL, c
    324         elif c in OCTDIGITS:
    325             # octal escape (up to three digits)
    326             escape += source.getwhile(2, OCTDIGITS)
    327             c = int(escape[1:], 8)
    328             if c > 0o377:
    329                 raise source.error('octal escape value %s outside of '
    330                                    'range 0-0o377' % escape, len(escape))
    331             return LITERAL, c
    332         elif c in DIGITS:
    333             raise ValueError
    334         if len(escape) == 2:
    335             if c in ASCIILETTERS:
    336                 raise source.error('bad escape %s' % escape, len(escape))
    337             return LITERAL, ord(escape[1])
    338     except ValueError:
    339         pass
    340     raise source.error("bad escape %s" % escape, len(escape))
    341 
    342 def _escape(source, escape, state):
    343     # handle escape code in expression
    344     code = CATEGORIES.get(escape)
    345     if code:
    346         return code
    347     code = ESCAPES.get(escape)
    348     if code:
    349         return code
    350     try:
    351         c = escape[1:2]
    352         if c == "x":
    353             # hexadecimal escape
    354             escape += source.getwhile(2, HEXDIGITS)
    355             if len(escape) != 4:
    356                 raise source.error("incomplete escape %s" % escape, len(escape))
    357             return LITERAL, int(escape[2:], 16)
    358         elif c == "u" and source.istext:
    359             # unicode escape (exactly four digits)
    360             escape += source.getwhile(4, HEXDIGITS)
    361             if len(escape) != 6:
    362                 raise source.error("incomplete escape %s" % escape, len(escape))
    363             return LITERAL, int(escape[2:], 16)
    364         elif c == "U" and source.istext:
    365             # unicode escape (exactly eight digits)
    366             escape += source.getwhile(8, HEXDIGITS)
    367             if len(escape) != 10:
    368                 raise source.error("incomplete escape %s" % escape, len(escape))
    369             c = int(escape[2:], 16)
    370             chr(c) # raise ValueError for invalid code
    371             return LITERAL, c
    372         elif c == "0":
    373             # octal escape
    374             escape += source.getwhile(2, OCTDIGITS)
    375             return LITERAL, int(escape[1:], 8)
    376         elif c in DIGITS:
    377             # octal escape *or* decimal group reference (sigh)
    378             if source.next in DIGITS:
    379                 escape += source.get()
    380                 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
    381                     source.next in OCTDIGITS):
    382                     # got three octal digits; this is an octal escape
    383                     escape += source.get()
    384                     c = int(escape[1:], 8)
    385                     if c > 0o377:
    386                         raise source.error('octal escape value %s outside of '
    387                                            'range 0-0o377' % escape,
    388                                            len(escape))
    389                     return LITERAL, c
    390             # not an octal escape, so this is a group reference
    391             group = int(escape[1:])
    392             if group < state.groups:
    393                 if not state.checkgroup(group):
    394                     raise source.error("cannot refer to an open group",
    395                                        len(escape))
    396                 state.checklookbehindgroup(group, source)
    397                 return GROUPREF, group
    398             raise source.error("invalid group reference %d" % group, len(escape) - 1)
    399         if len(escape) == 2:
    400             if c in ASCIILETTERS:
    401                 raise source.error("bad escape %s" % escape, len(escape))
    402             return LITERAL, ord(escape[1])
    403     except ValueError:
    404         pass
    405     raise source.error("bad escape %s" % escape, len(escape))
    406 
    407 def _parse_sub(source, state, verbose, nested=True):
    408     # parse an alternation: a|b|c
    409 
    410     items = []
    411     itemsappend = items.append
    412     sourcematch = source.match
    413     start = source.tell()
    414     while True:
    415         itemsappend(_parse(source, state, verbose))
    416         if not sourcematch("|"):
    417             break
    418 
    419     if len(items) == 1:
    420         return items[0]
    421 
    422     subpattern = SubPattern(state)
    423     subpatternappend = subpattern.append
    424 
    425     # check if all items share a common prefix
    426     while True:
    427         prefix = None
    428         for item in items:
    429             if not item:
    430                 break
    431             if prefix is None:
    432                 prefix = item[0]
    433             elif item[0] != prefix:
    434                 break
    435         else:
    436             # all subitems start with a common "prefix".
    437             # move it out of the branch
    438             for item in items:
    439                 del item[0]
    440             subpatternappend(prefix)
    441             continue # check next one
    442         break
    443 
    444     # check if the branch can be replaced by a character set
    445     for item in items:
    446         if len(item) != 1 or item[0][0] is not LITERAL:
    447             break
    448     else:
    449         # we can store this as a character set instead of a
    450         # branch (the compiler may optimize this even more)
    451         subpatternappend((IN, [item[0] for item in items]))
    452         return subpattern
    453 
    454     subpattern.append((BRANCH, (None, items)))
    455     return subpattern
    456 
    457 def _parse_sub_cond(source, state, condgroup, verbose):
    458     item_yes = _parse(source, state, verbose)
    459     if source.match("|"):
    460         item_no = _parse(source, state, verbose)
    461         if source.next == "|":
    462             raise source.error("conditional backref with more than two branches")
    463     else:
    464         item_no = None
    465     subpattern = SubPattern(state)
    466     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    467     return subpattern
    468 
    469 def _parse(source, state, verbose):
    470     # parse a simple pattern
    471     subpattern = SubPattern(state)
    472 
    473     # precompute constants into local variables
    474     subpatternappend = subpattern.append
    475     sourceget = source.get
    476     sourcematch = source.match
    477     _len = len
    478     _ord = ord
    479 
    480     while True:
    481 
    482         this = source.next
    483         if this is None:
    484             break # end of pattern
    485         if this in "|)":
    486             break # end of subpattern
    487         sourceget()
    488 
    489         if verbose:
    490             # skip whitespace and comments
    491             if this in WHITESPACE:
    492                 continue
    493             if this == "#":
    494                 while True:
    495                     this = sourceget()
    496                     if this is None or this == "\n":
    497                         break
    498                 continue
    499 
    500         if this[0] == "\\":
    501             code = _escape(source, this, state)
    502             subpatternappend(code)
    503 
    504         elif this not in SPECIAL_CHARS:
    505             subpatternappend((LITERAL, _ord(this)))
    506 
    507         elif this == "[":
    508             here = source.tell() - 1
    509             # character set
    510             set = []
    511             setappend = set.append
    512 ##          if sourcematch(":"):
    513 ##              pass # handle character classes
    514             if sourcematch("^"):
    515                 setappend((NEGATE, None))
    516             # check remaining characters
    517             start = set[:]
    518             while True:
    519                 this = sourceget()
    520                 if this is None:
    521                     raise source.error("unterminated character set",
    522                                        source.tell() - here)
    523                 if this == "]" and set != start:
    524                     break
    525                 elif this[0] == "\\":
    526                     code1 = _class_escape(source, this)
    527                 else:
    528                     code1 = LITERAL, _ord(this)
    529                 if sourcematch("-"):
    530                     # potential range
    531                     that = sourceget()
    532                     if that is None:
    533                         raise source.error("unterminated character set",
    534                                            source.tell() - here)
    535                     if that == "]":
    536                         if code1[0] is IN:
    537                             code1 = code1[1][0]
    538                         setappend(code1)
    539                         setappend((LITERAL, _ord("-")))
    540                         break
    541                     if that[0] == "\\":
    542                         code2 = _class_escape(source, that)
    543                     else:
    544                         code2 = LITERAL, _ord(that)
    545                     if code1[0] != LITERAL or code2[0] != LITERAL:
    546                         msg = "bad character range %s-%s" % (this, that)
    547                         raise source.error(msg, len(this) + 1 + len(that))
    548                     lo = code1[1]
    549                     hi = code2[1]
    550                     if hi < lo:
    551                         msg = "bad character range %s-%s" % (this, that)
    552                         raise source.error(msg, len(this) + 1 + len(that))
    553                     setappend((RANGE, (lo, hi)))
    554                 else:
    555                     if code1[0] is IN:
    556                         code1 = code1[1][0]
    557                     setappend(code1)
    558 
    559             # XXX: <fl> should move set optimization to compiler!
    560             if _len(set)==1 and set[0][0] is LITERAL:
    561                 subpatternappend(set[0]) # optimization
    562             elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
    563                 subpatternappend((NOT_LITERAL, set[1][1])) # optimization
    564             else:
    565                 # XXX: <fl> should add charmap optimization here
    566                 subpatternappend((IN, set))
    567 
    568         elif this in REPEAT_CHARS:
    569             # repeat previous item
    570             here = source.tell()
    571             if this == "?":
    572                 min, max = 0, 1
    573             elif this == "*":
    574                 min, max = 0, MAXREPEAT
    575 
    576             elif this == "+":
    577                 min, max = 1, MAXREPEAT
    578             elif this == "{":
    579                 if source.next == "}":
    580                     subpatternappend((LITERAL, _ord(this)))
    581                     continue
    582                 min, max = 0, MAXREPEAT
    583                 lo = hi = ""
    584                 while source.next in DIGITS:
    585                     lo += sourceget()
    586                 if sourcematch(","):
    587                     while source.next in DIGITS:
    588                         hi += sourceget()
    589                 else:
    590                     hi = lo
    591                 if not sourcematch("}"):
    592                     subpatternappend((LITERAL, _ord(this)))
    593                     source.seek(here)
    594                     continue
    595                 if lo:
    596                     min = int(lo)
    597                     if min >= MAXREPEAT:
    598                         raise OverflowError("the repetition number is too large")
    599                 if hi:
    600                     max = int(hi)
    601                     if max >= MAXREPEAT:
    602                         raise OverflowError("the repetition number is too large")
    603                     if max < min:
    604                         raise source.error("min repeat greater than max repeat",
    605                                            source.tell() - here)
    606             else:
    607                 raise AssertionError("unsupported quantifier %r" % (char,))
    608             # figure out which item to repeat
    609             if subpattern:
    610                 item = subpattern[-1:]
    611             else:
    612                 item = None
    613             if not item or (_len(item) == 1 and item[0][0] is AT):
    614                 raise source.error("nothing to repeat",
    615                                    source.tell() - here + len(this))
    616             if item[0][0] in _REPEATCODES:
    617                 raise source.error("multiple repeat",
    618                                    source.tell() - here + len(this))
    619             if sourcematch("?"):
    620                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
    621             else:
    622                 subpattern[-1] = (MAX_REPEAT, (min, max, item))
    623 
    624         elif this == ".":
    625             subpatternappend((ANY, None))
    626 
    627         elif this == "(":
    628             start = source.tell() - 1
    629             group = True
    630             name = None
    631             condgroup = None
    632             add_flags = 0
    633             del_flags = 0
    634             if sourcematch("?"):
    635                 # options
    636                 char = sourceget()
    637                 if char is None:
    638                     raise source.error("unexpected end of pattern")
    639                 if char == "P":
    640                     # python extensions
    641                     if sourcematch("<"):
    642                         # named group: skip forward to end of name
    643                         name = source.getuntil(">")
    644                         if not name.isidentifier():
    645                             msg = "bad character in group name %r" % name
    646                             raise source.error(msg, len(name) + 1)
    647                     elif sourcematch("="):
    648                         # named backreference
    649                         name = source.getuntil(")")
    650                         if not name.isidentifier():
    651                             msg = "bad character in group name %r" % name
    652                             raise source.error(msg, len(name) + 1)
    653                         gid = state.groupdict.get(name)
    654                         if gid is None:
    655                             msg = "unknown group name %r" % name
    656                             raise source.error(msg, len(name) + 1)
    657                         if not state.checkgroup(gid):
    658                             raise source.error("cannot refer to an open group",
    659                                                len(name) + 1)
    660                         state.checklookbehindgroup(gid, source)
    661                         subpatternappend((GROUPREF, gid))
    662                         continue
    663                     else:
    664                         char = sourceget()
    665                         if char is None:
    666                             raise source.error("unexpected end of pattern")
    667                         raise source.error("unknown extension ?P" + char,
    668                                            len(char) + 2)
    669                 elif char == ":":
    670                     # non-capturing group
    671                     group = None
    672                 elif char == "#":
    673                     # comment
    674                     while True:
    675                         if source.next is None:
    676                             raise source.error("missing ), unterminated comment",
    677                                                source.tell() - start)
    678                         if sourceget() == ")":
    679                             break
    680                     continue
    681                 elif char in "=!<":
    682                     # lookahead assertions
    683                     dir = 1
    684                     if char == "<":
    685                         char = sourceget()
    686                         if char is None:
    687                             raise source.error("unexpected end of pattern")
    688                         if char not in "=!":
    689                             raise source.error("unknown extension ?<" + char,
    690                                                len(char) + 2)
    691                         dir = -1 # lookbehind
    692                         lookbehindgroups = state.lookbehindgroups
    693                         if lookbehindgroups is None:
    694                             state.lookbehindgroups = state.groups
    695                     p = _parse_sub(source, state, verbose)
    696                     if dir < 0:
    697                         if lookbehindgroups is None:
    698                             state.lookbehindgroups = None
    699                     if not sourcematch(")"):
    700                         raise source.error("missing ), unterminated subpattern",
    701                                            source.tell() - start)
    702                     if char == "=":
    703                         subpatternappend((ASSERT, (dir, p)))
    704                     else:
    705                         subpatternappend((ASSERT_NOT, (dir, p)))
    706                     continue
    707                 elif char == "(":
    708                     # conditional backreference group
    709                     condname = source.getuntil(")")
    710                     group = None
    711                     if condname.isidentifier():
    712                         condgroup = state.groupdict.get(condname)
    713                         if condgroup is None:
    714                             msg = "unknown group name %r" % condname
    715                             raise source.error(msg, len(condname) + 1)
    716                     else:
    717                         try:
    718                             condgroup = int(condname)
    719                             if condgroup < 0:
    720                                 raise ValueError
    721                         except ValueError:
    722                             msg = "bad character in group name %r" % condname
    723                             raise source.error(msg, len(condname) + 1) from None
    724                         if not condgroup:
    725                             raise source.error("bad group number",
    726                                                len(condname) + 1)
    727                         if condgroup >= MAXGROUPS:
    728                             msg = "invalid group reference %d" % condgroup
    729                             raise source.error(msg, len(condname) + 1)
    730                     state.checklookbehindgroup(condgroup, source)
    731                 elif char in FLAGS or char == "-":
    732                     # flags
    733                     pos = source.pos
    734                     flags = _parse_flags(source, state, char)
    735                     if flags is None:  # global flags
    736                         if pos != 3:  # "(?x"
    737                             import warnings
    738                             warnings.warn(
    739                                 'Flags not at the start of the expression %s%s' % (
    740                                     source.string[:20],  # truncate long regexes
    741                                     ' (truncated)' if len(source.string) > 20 else '',
    742                                 ),
    743                                 DeprecationWarning, stacklevel=7
    744                             )
    745                         continue
    746                     add_flags, del_flags = flags
    747                     group = None
    748                 else:
    749                     raise source.error("unknown extension ?" + char,
    750                                        len(char) + 1)
    751 
    752             # parse group contents
    753             if group is not None:
    754                 try:
    755                     group = state.opengroup(name)
    756                 except error as err:
    757                     raise source.error(err.msg, len(name) + 1) from None
    758             if condgroup:
    759                 p = _parse_sub_cond(source, state, condgroup, verbose)
    760             else:
    761                 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
    762                                not (del_flags & SRE_FLAG_VERBOSE))
    763                 p = _parse_sub(source, state, sub_verbose)
    764             if not source.match(")"):
    765                 raise source.error("missing ), unterminated subpattern",
    766                                    source.tell() - start)
    767             if group is not None:
    768                 state.closegroup(group, p)
    769             subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
    770 
    771         elif this == "^":
    772             subpatternappend((AT, AT_BEGINNING))
    773 
    774         elif this == "$":
    775             subpattern.append((AT, AT_END))
    776 
    777         else:
    778             raise AssertionError("unsupported special character %r" % (char,))
    779 
    780     return subpattern
    781 
    782 def _parse_flags(source, state, char):
    783     sourceget = source.get
    784     add_flags = 0
    785     del_flags = 0
    786     if char != "-":
    787         while True:
    788             add_flags |= FLAGS[char]
    789             char = sourceget()
    790             if char is None:
    791                 raise source.error("missing -, : or )")
    792             if char in ")-:":
    793                 break
    794             if char not in FLAGS:
    795                 msg = "unknown flag" if char.isalpha() else "missing -, : or )"
    796                 raise source.error(msg, len(char))
    797     if char == ")":
    798         if ((add_flags & SRE_FLAG_VERBOSE) and
    799             not (state.flags & SRE_FLAG_VERBOSE)):
    800             raise Verbose
    801         state.flags |= add_flags
    802         return None
    803     if add_flags & GLOBAL_FLAGS:
    804         raise source.error("bad inline flags: cannot turn on global flag", 1)
    805     if char == "-":
    806         char = sourceget()
    807         if char is None:
    808             raise source.error("missing flag")
    809         if char not in FLAGS:
    810             msg = "unknown flag" if char.isalpha() else "missing flag"
    811             raise source.error(msg, len(char))
    812         while True:
    813             del_flags |= FLAGS[char]
    814             char = sourceget()
    815             if char is None:
    816                 raise source.error("missing :")
    817             if char == ":":
    818                 break
    819             if char not in FLAGS:
    820                 msg = "unknown flag" if char.isalpha() else "missing :"
    821                 raise source.error(msg, len(char))
    822     assert char == ":"
    823     if del_flags & GLOBAL_FLAGS:
    824         raise source.error("bad inline flags: cannot turn off global flag", 1)
    825     if add_flags & del_flags:
    826         raise source.error("bad inline flags: flag turned on and off", 1)
    827     return add_flags, del_flags
    828 
    829 def fix_flags(src, flags):
    830     # Check and fix flags according to the type of pattern (str or bytes)
    831     if isinstance(src, str):
    832         if flags & SRE_FLAG_LOCALE:
    833             raise ValueError("cannot use LOCALE flag with a str pattern")
    834         if not flags & SRE_FLAG_ASCII:
    835             flags |= SRE_FLAG_UNICODE
    836         elif flags & SRE_FLAG_UNICODE:
    837             raise ValueError("ASCII and UNICODE flags are incompatible")
    838     else:
    839         if flags & SRE_FLAG_UNICODE:
    840             raise ValueError("cannot use UNICODE flag with a bytes pattern")
    841         if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
    842             raise ValueError("ASCII and LOCALE flags are incompatible")
    843     return flags
    844 
    845 def parse(str, flags=0, pattern=None):
    846     # parse 're' pattern into list of (opcode, argument) tuples
    847 
    848     source = Tokenizer(str)
    849 
    850     if pattern is None:
    851         pattern = Pattern()
    852     pattern.flags = flags
    853     pattern.str = str
    854 
    855     try:
    856         p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
    857     except Verbose:
    858         # the VERBOSE flag was switched on inside the pattern.  to be
    859         # on the safe side, we'll parse the whole thing again...
    860         pattern = Pattern()
    861         pattern.flags = flags | SRE_FLAG_VERBOSE
    862         pattern.str = str
    863         source.seek(0)
    864         p = _parse_sub(source, pattern, True, False)
    865 
    866     p.pattern.flags = fix_flags(str, p.pattern.flags)
    867 
    868     if source.next is not None:
    869         assert source.next == ")"
    870         raise source.error("unbalanced parenthesis")
    871 
    872     if flags & SRE_FLAG_DEBUG:
    873         p.dump()
    874 
    875     return p
    876 
    877 def parse_template(source, pattern):
    878     # parse 're' replacement string into list of literals and
    879     # group references
    880     s = Tokenizer(source)
    881     sget = s.get
    882     groups = []
    883     literals = []
    884     literal = []
    885     lappend = literal.append
    886     def addgroup(index, pos):
    887         if index > pattern.groups:
    888             raise s.error("invalid group reference %d" % index, pos)
    889         if literal:
    890             literals.append(''.join(literal))
    891             del literal[:]
    892         groups.append((len(literals), index))
    893         literals.append(None)
    894     groupindex = pattern.groupindex
    895     while True:
    896         this = sget()
    897         if this is None:
    898             break # end of replacement string
    899         if this[0] == "\\":
    900             # group
    901             c = this[1]
    902             if c == "g":
    903                 name = ""
    904                 if not s.match("<"):
    905                     raise s.error("missing <")
    906                 name = s.getuntil(">")
    907                 if name.isidentifier():
    908                     try:
    909                         index = groupindex[name]
    910                     except KeyError:
    911                         raise IndexError("unknown group name %r" % name)
    912                 else:
    913                     try:
    914                         index = int(name)
    915                         if index < 0:
    916                             raise ValueError
    917                     except ValueError:
    918                         raise s.error("bad character in group name %r" % name,
    919                                       len(name) + 1) from None
    920                     if index >= MAXGROUPS:
    921                         raise s.error("invalid group reference %d" % index,
    922                                       len(name) + 1)
    923                 addgroup(index, len(name) + 1)
    924             elif c == "0":
    925                 if s.next in OCTDIGITS:
    926                     this += sget()
    927                     if s.next in OCTDIGITS:
    928                         this += sget()
    929                 lappend(chr(int(this[1:], 8) & 0xff))
    930             elif c in DIGITS:
    931                 isoctal = False
    932                 if s.next in DIGITS:
    933                     this += sget()
    934                     if (c in OCTDIGITS and this[2] in OCTDIGITS and
    935                         s.next in OCTDIGITS):
    936                         this += sget()
    937                         isoctal = True
    938                         c = int(this[1:], 8)
    939                         if c > 0o377:
    940                             raise s.error('octal escape value %s outside of '
    941                                           'range 0-0o377' % this, len(this))
    942                         lappend(chr(c))
    943                 if not isoctal:
    944                     addgroup(int(this[1:]), len(this) - 1)
    945             else:
    946                 try:
    947                     this = chr(ESCAPES[this][1])
    948                 except KeyError:
    949                     if c in ASCIILETTERS:
    950                         import warnings
    951                         warnings.warn('bad escape %s' % this,
    952                                       DeprecationWarning, stacklevel=4)
    953                 lappend(this)
    954         else:
    955             lappend(this)
    956     if literal:
    957         literals.append(''.join(literal))
    958     if not isinstance(source, str):
    959         # The tokenizer implicitly decodes bytes objects as latin-1, we must
    960         # therefore re-encode the final representation.
    961         literals = [None if s is None else s.encode('latin-1') for s in literals]
    962     return groups, literals
    963 
    964 def expand_template(template, match):
    965     g = match.group
    966     empty = match.string[:0]
    967     groups, literals = template
    968     literals = literals[:]
    969     try:
    970         for index, group in groups:
    971             literals[index] = g(group) or empty
    972     except IndexError:
    973         raise error("invalid group reference %d" % index)
    974     return empty.join(literals)
    975