Home | History | Annotate | Download | only in Lib
      1 #
      2 # Secret Labs' Regular Expression Engine
      3 #
      4 # convert template to internal format
      5 #
      6 # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
      7 #
      8 # See the sre.py file for information on usage and redistribution.
      9 #
     10 
     11 """Internal support module for sre"""
     12 
     13 import _sre
     14 import sre_parse
     15 from sre_constants import *
     16 
# Refuse to load if the _sre module reports a different MAGIC value than
# the MAGIC constant in scope here.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode groups used for membership tests while compiling a pattern.
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
# Operators that _simple() accepts as the sole item of a "simple" subpattern.
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
     24 
# Sets of lowercase characters which have the same uppercase.
# Each tuple lists code points; the trailing comment on each entry shows
# the characters themselves, in the same order as the code points.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ﬅﬆ
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
# Every member of a tuple above becomes a key whose value is the tuple of
# the *other* members of that same tuple.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
     64 
     65 def _combine_flags(flags, add_flags, del_flags,
     66                    TYPE_FLAGS=sre_parse.TYPE_FLAGS):
     67     if add_flags & TYPE_FLAGS:
     68         flags &= ~TYPE_FLAGS
     69     return (flags | add_flags) & ~del_flags
     70 
def _compile(code, pattern, flags):
    """Append the compiled form of *pattern* to the list *code*.

    *pattern* is iterated as a sequence of ``(op, av)`` pairs and *flags*
    is tested against the SRE_FLAG_* bits.  Nested subpatterns are compiled
    by recursive calls.  Nothing is returned; *code* is extended in place.

    Raises ``error`` for a repeat under SRE_FLAG_TEMPLATE, for a
    look-behind whose minimum and maximum widths differ, and for an
    operator that is not handled below.
    """
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # Case helpers stay None unless IGNORECASE is set without LOCALE.
    # 'fixes' is only set in the Unicode (non-ASCII) case, so a falsy
    # 'fixes' is used below to recognise ASCII-only case folding.
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _ignorecase_fixes
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                # Character has no case: the plain opcode is enough.
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes:  # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    # The lowercase form has extra equivalents: emit a
                    # small set holding it and every entry of fixes[lo].
                    emit(IN_UNI_IGNORE)
                    # Reserve a slot for the skip distance, patched below.
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
            elif not hascased:
                emit(IN)
            elif not fixes:  # ascii
                emit(IN_IGNORE)
            else:
                emit(IN_UNI_IGNORE)
            skip = _len(code); emit(0)
            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            # av is (min, max, body): av[0] and av[1] are emitted as the
            # bounds and av[2] is compiled as the repeated subpattern.
            if _simple(av[2]):
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                emit(REPEAT)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                # The UNTIL opcode follows the body, after the skip target.
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            # A truthy group number gets a pair of MARK opcodes around
            # its body, numbered (group-1)*2 and (group-1)*2+1.
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            # av is (direction, subpattern); a negative direction selects
            # the look-behind branch.
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            # Swap in the multiline / locale / unicode variant of the
            # position code when the mapping has one; otherwise keep av.
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            # Positions of the JUMP operands, patched once the end of the
            # whole branch construct is known.
            tail = []
            tailappend = tail.append
            # NOTE: 'av' is rebound by this loop; the iterable av[1] is
            # evaluated once before the first rebinding.
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            # NOTE: 'tail' is rebound to each stored position in turn.
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes:  # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # av is (group, yes-pattern, no-pattern); the no-pattern may
            # be falsy, in which case no JUMP is emitted.
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))
    248 
    249 def _compile_charset(charset, flags, code):
    250     # compile charset subprogram
    251     emit = code.append
    252     for op, av in charset:
    253         emit(op)
    254         if op is NEGATE:
    255             pass
    256         elif op is LITERAL:
    257             emit(av)
    258         elif op is RANGE or op is RANGE_UNI_IGNORE:
    259             emit(av[0])
    260             emit(av[1])
    261         elif op is CHARSET:
    262             code.extend(av)
    263         elif op is BIGCHARSET:
    264             code.extend(av)
    265         elif op is CATEGORY:
    266             if flags & SRE_FLAG_LOCALE:
    267                 emit(CH_LOCALE[av])
    268             elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
    269                 emit(CH_UNICODE[av])
    270             else:
    271                 emit(av)
    272         else:
    273             raise error("internal: unsupported set operator %r" % (op,))
    274     emit(FAILURE)
    275 
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    """Return ``(charset, hascased)`` with *charset* in a compact form.

    *charset* is a sequence of ``(op, av)`` items.  LITERAL and RANGE
    items are folded into a character map, which is then re-expressed as
    at most two literals/ranges, a CHARSET bitmap, or a BIGCHARSET.
    NEGATE items are kept at the front and every other item is appended
    unchanged at the end.

    *fixup*, when given, is applied to each code before it is recorded
    (the caller in this file passes a lowercasing function); *iscased* is
    then used to detect cased characters and *fixes* supplies additional
    codes to include for a given code.  *hascased* is True when a cased
    character was seen while *fixup* was active.
    """
    # internal: optimize character set
    out = []
    tail = []
    charmap = bytearray(256)
    hascased = False
    for op, av in charset:
        # The loop body is retried once after the map has been enlarged.
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                # A code did not fit in the current character map.
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    # Grow the map to 65536 entries and redo this item.
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                if fixup:
                    hascased = True
                    # There are only two ranges of cased non-BMP characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                    # and for both ranges RANGE_UNI_IGNORE works.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                # Keep the item as-is (or as RANGE_UNI_IGNORE) after the map.
                tail.append((op, av))
            break

    # compress character map
    # Collect runs of consecutive set entries as half-open (start, end)
    # pairs; give up (runs = None) as soon as a third run is found.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}  # chunk contents -> block number already assigned to it
    mapping = bytearray(256)  # chunk position -> block number
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # Prepend the block count and the chunk-number table to the bitmaps.
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased
    407 
    408 _CODEBITS = _sre.CODESIZE * 8
    409 MAXCODE = (1 << _CODEBITS) - 1
    410 _BITS_TRANS = b'0' + b'1' * 255
    411 def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    412     s = bits.translate(_BITS_TRANS)[::-1]
    413     return [_int(s[i - _CODEBITS: i], 2)
    414             for i in range(len(s), 0, -_CODEBITS)]
    415 
    416 def _bytes_to_codes(b):
    417     # Convert block indices to word array
    418     a = memoryview(b).cast('I')
    419     assert a.itemsize == _sre.CODESIZE
    420     assert len(a) * a.itemsize == len(b)
    421     return a.tolist()
    422 
    423 def _simple(p):
    424     # check if this subpattern is a "simple" operator
    425     if len(p) != 1:
    426         return False
    427     op, av = p[0]
    428     if op is SUBPATTERN:
    429         return av[0] is None and _simple(av[-1])
    430     return op in _UNIT_CODES
    431 
    432 def _generate_overlap_table(prefix):
    433     """
    434     Generate an overlap table for the following prefix.
    435     An overlap table is a table of the same size as the prefix which
    436     informs about the potential self-overlap for each index in the prefix:
    437     - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
    438     - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
    439       prefix[0:k]
    440     """
    441     table = [0] * len(prefix)
    442     for i in range(1, len(prefix)):
    443         idx = table[i - 1]
    444         while prefix[i] != prefix[idx]:
    445             if idx == 0:
    446                 table[i] = 0
    447                 break
    448             idx = table[idx - 1]
    449         else:
    450             table[i] = idx + 1
    451     return table
    452 
    453 def _get_iscased(flags):
    454     if not flags & SRE_FLAG_IGNORECASE:
    455         return None
    456     elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
    457         return _sre.unicode_iscased
    458     else:
    459         return _sre.ascii_iscased
    460 
    461 def _get_literal_prefix(pattern, flags):
    462     # look for literal prefix
    463     prefix = []
    464     prefixappend = prefix.append
    465     prefix_skip = None
    466     iscased = _get_iscased(flags)
    467     for op, av in pattern.data:
    468         if op is LITERAL:
    469             if iscased and iscased(av):
    470                 break
    471             prefixappend(av)
    472         elif op is SUBPATTERN:
    473             group, add_flags, del_flags, p = av
    474             flags1 = _combine_flags(flags, add_flags, del_flags)
    475             if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
    476                 break
    477             prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
    478             if prefix_skip is None:
    479                 if group is not None:
    480                     prefix_skip = len(prefix)
    481                 elif prefix_skip1 is not None:
    482                     prefix_skip = len(prefix) + prefix_skip1
    483             prefix.extend(prefix1)
    484             if not got_all:
    485                 break
    486         else:
    487             break
    488     else:
    489         return prefix, prefix_skip, True
    490     return prefix, prefix_skip, False
    491 
    492 def _get_charset_prefix(pattern, flags):
    493     while True:
    494         if not pattern.data:
    495             return None
    496         op, av = pattern.data[0]
    497         if op is not SUBPATTERN:
    498             break
    499         group, add_flags, del_flags, pattern = av
    500         flags = _combine_flags(flags, add_flags, del_flags)
    501         if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
    502             return None
    503 
    504     iscased = _get_iscased(flags)
    505     if op is LITERAL:
    506         if iscased and iscased(av):
    507             return None
    508         return [(op, av)]
    509     elif op is BRANCH:
    510         charset = []
    511         charsetappend = charset.append
    512         for p in av[1]:
    513             if not p:
    514                 return None
    515             op, av = p[0]
    516             if op is LITERAL and not (iscased and iscased(av)):
    517                 charsetappend((op, av))
    518             else:
    519                 return None
    520         return charset
    521     elif op is IN:
    522         charset = av
    523         if iscased:
    524             for op, av in charset:
    525                 if op is LITERAL:
    526                     if iscased(av):
    527                         return None
    528                 elif op is RANGE:
    529                     if av[1] > 0xffff:
    530                         return None
    531                     if any(map(iscased, range(av[0], av[1]+1))):
    532                         return None
    533         return charset
    534     return None
    535 
    536 def _compile_info(code, pattern, flags):
    537     # internal: compile an info block.  in the current version,
    538     # this contains min/max pattern width, and an optional literal
    539     # prefix or a character map
    540     lo, hi = pattern.getwidth()
    541     if hi > MAXCODE:
    542         hi = MAXCODE
    543     if lo == 0:
    544         code.extend([INFO, 4, 0, lo, hi])
    545         return
    546     # look for a literal prefix
    547     prefix = []
    548     prefix_skip = 0
    549     charset = [] # not used
    550     if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
    551         # look for literal prefix
    552         prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
    553         # if no prefix, look for charset prefix
    554         if not prefix:
    555             charset = _get_charset_prefix(pattern, flags)
    556 ##     if prefix:
    557 ##         print("*** PREFIX", prefix, prefix_skip)
    558 ##     if charset:
    559 ##         print("*** CHARSET", charset)
    560     # add an info block
    561     emit = code.append
    562     emit(INFO)
    563     skip = len(code); emit(0)
    564     # literal flag
    565     mask = 0
    566     if prefix:
    567         mask = SRE_INFO_PREFIX
    568         if prefix_skip is None and got_all:
    569             mask = mask | SRE_INFO_LITERAL
    570     elif charset:
    571         mask = mask | SRE_INFO_CHARSET
    572     emit(mask)
    573     # pattern length
    574     if lo < MAXCODE:
    575         emit(lo)
    576     else:
    577         emit(MAXCODE)
    578         prefix = prefix[:MAXCODE]
    579     emit(min(hi, MAXCODE))
    580     # add literal prefix
    581     if prefix:
    582         emit(len(prefix)) # length
    583         if prefix_skip is None:
    584             prefix_skip =  len(prefix)
    585         emit(prefix_skip) # skip
    586         code.extend(prefix)
    587         # generate overlap table
    588         code.extend(_generate_overlap_table(prefix))
    589     elif charset:
    590         charset, hascased = _optimize_charset(charset)
    591         assert not hascased
    592         _compile_charset(charset, flags, code)
    593     code[skip] = len(code) - skip
    594 
    595 def isstring(obj):
    596     return isinstance(obj, (str, bytes))
    597 
    598 def _code(p, flags):
    599 
    600     flags = p.pattern.flags | flags
    601     code = []
    602 
    603     # compile info block
    604     _compile_info(code, p, flags)
    605 
    606     # compile the pattern
    607     _compile(code, p.data, flags)
    608 
    609     code.append(SUCCESS)
    610 
    611     return code
    612 
    613 def _hex_code(code):
    614     return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
    615 
def dis(code):
    """Print a human-readable listing of the compiled *code* list.

    Each opcode is printed on its own line, prefixed by its offset in
    *code*.  Offsets that an already printed instruction pointed to are
    marked with ':' and all others with '.'.  Nested blocks are indented
    by nesting level.  Raises ValueError for an opcode that is not handled
    below.
    """
    import sys

    # Offsets recorded as targets by instructions printed so far.
    labels = set()
    # Current nesting depth; shared with and modified by dis_().
    level = 0
    # Width needed to print the largest offset.
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end] one nesting level deeper.
        def print_(*args, to=None):
            # Print one instruction line for the offset held in 'start';
            # 'to', when given, is recorded as a label and shown as a target.
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line without an offset column.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            # 'start' is rebound so that print_() reports the offset of
            # the instruction currently being decoded.
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                # The skip word is relative to its own position; the set
                # items between it and the target are listed as a nested block.
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # A 256-bit bitmap stored as 256//_CODEBITS code words.
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # Layout read here: block count, a 256-byte table packed
                # into code words, then 'arg' bitmaps of 256 bits each.
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # Alternatives are chained by skip words; a zero skip word
                # ends the chain and is printed as FAILURE.
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                # Only the header is consumed; the following code is
                # decoded by the enclosing loop.
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                # 'start' now tracks where the optional payload begins.
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2('  prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2('  prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    # The overlap table has the same length as the prefix.
                    print_2('  overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
    757 
    758 
    759 def compile(p, flags=0):
    760     # internal: convert pattern list to internal format
    761 
    762     if isstring(p):
    763         pattern = p
    764         p = sre_parse.parse(p, flags)
    765     else:
    766         pattern = None
    767 
    768     code = _code(p, flags)
    769 
    770     if flags & SRE_FLAG_DEBUG:
    771         print()
    772         dis(code)
    773 
    774     # map in either direction
    775     groupindex = p.pattern.groupdict
    776     indexgroup = [None] * p.pattern.groups
    777     for k, i in groupindex.items():
    778         indexgroup[i] = k
    779 
    780     return _sre.compile(
    781         pattern, flags | p.pattern.flags, code,
    782         p.pattern.groups-1,
    783         groupindex, tuple(indexgroup)
    784         )
    785