Home | History | Annotate | Download | only in Lib
      1 #
      2 # Secret Labs' Regular Expression Engine
      3 #
      4 # convert template to internal format
      5 #
      6 # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
      7 #
      8 # See the sre.py file for information on usage and redistribution.
      9 #
     10 
     11 """Internal support module for sre"""
     12 
     13 import _sre
     14 import sre_parse
     15 from sre_constants import *
     16 
     17 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
     18 
     19 _LITERAL_CODES = {LITERAL, NOT_LITERAL}
     20 _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
     21 _SUCCESS_CODES = {SUCCESS, FAILURE}
     22 _ASSERT_CODES = {ASSERT, ASSERT_NOT}
     23 
     24 # Sets of lowercase characters which have the same uppercase.
     25 _equivalences = (
     26     # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
     27     (0x69, 0x131), # i
     28     # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
     29     (0x73, 0x17f), # s
     30     # MICRO SIGN, GREEK SMALL LETTER MU
     31     (0xb5, 0x3bc), # 
     32     # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
     33     (0x345, 0x3b9, 0x1fbe), # \u0345
     34     # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
     35     (0x390, 0x1fd3), # 
     36     # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
     37     (0x3b0, 0x1fe3), # 
     38     # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
     39     (0x3b2, 0x3d0), # 
     40     # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
     41     (0x3b5, 0x3f5), # 
     42     # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
     43     (0x3b8, 0x3d1), # 
     44     # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
     45     (0x3ba, 0x3f0), # 
     46     # GREEK SMALL LETTER PI, GREEK PI SYMBOL
     47     (0x3c0, 0x3d6), # 
     48     # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
     49     (0x3c1, 0x3f1), # 
     50     # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
     51     (0x3c2, 0x3c3), # 
     52     # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
     53     (0x3c6, 0x3d5), # 
     54     # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
     55     (0x1e61, 0x1e9b), # 
     56     # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
     57     (0xfb05, 0xfb06), # 
     58 )
     59 
     60 # Maps the lowercase code to lowercase codes which have the same uppercase.
     61 _ignorecase_fixes = {i: tuple(j for j in t if i != j)
     62                      for t in _equivalences for i in t}
     63 
     64 def _compile(code, pattern, flags):
     65     # internal: compile a (sub)pattern
     66     emit = code.append
     67     _len = len
     68     LITERAL_CODES = _LITERAL_CODES
     69     REPEATING_CODES = _REPEATING_CODES
     70     SUCCESS_CODES = _SUCCESS_CODES
     71     ASSERT_CODES = _ASSERT_CODES
     72     if (flags & SRE_FLAG_IGNORECASE and
     73             not (flags & SRE_FLAG_LOCALE) and
     74             flags & SRE_FLAG_UNICODE and
     75             not (flags & SRE_FLAG_ASCII)):
     76         fixes = _ignorecase_fixes
     77     else:
     78         fixes = None
     79     for op, av in pattern:
     80         if op in LITERAL_CODES:
     81             if flags & SRE_FLAG_IGNORECASE:
     82                 lo = _sre.getlower(av, flags)
     83                 if fixes and lo in fixes:
     84                     emit(IN_IGNORE)
     85                     skip = _len(code); emit(0)
     86                     if op is NOT_LITERAL:
     87                         emit(NEGATE)
     88                     for k in (lo,) + fixes[lo]:
     89                         emit(LITERAL)
     90                         emit(k)
     91                     emit(FAILURE)
     92                     code[skip] = _len(code) - skip
     93                 else:
     94                     emit(OP_IGNORE[op])
     95                     emit(lo)
     96             else:
     97                 emit(op)
     98                 emit(av)
     99         elif op is IN:
    100             if flags & SRE_FLAG_IGNORECASE:
    101                 emit(OP_IGNORE[op])
    102                 def fixup(literal, flags=flags):
    103                     return _sre.getlower(literal, flags)
    104             else:
    105                 emit(op)
    106                 fixup = None
    107             skip = _len(code); emit(0)
    108             _compile_charset(av, flags, code, fixup, fixes)
    109             code[skip] = _len(code) - skip
    110         elif op is ANY:
    111             if flags & SRE_FLAG_DOTALL:
    112                 emit(ANY_ALL)
    113             else:
    114                 emit(ANY)
    115         elif op in REPEATING_CODES:
    116             if flags & SRE_FLAG_TEMPLATE:
    117                 raise error("internal: unsupported template operator %r" % (op,))
    118             elif _simple(av) and op is not REPEAT:
    119                 if op is MAX_REPEAT:
    120                     emit(REPEAT_ONE)
    121                 else:
    122                     emit(MIN_REPEAT_ONE)
    123                 skip = _len(code); emit(0)
    124                 emit(av[0])
    125                 emit(av[1])
    126                 _compile(code, av[2], flags)
    127                 emit(SUCCESS)
    128                 code[skip] = _len(code) - skip
    129             else:
    130                 emit(REPEAT)
    131                 skip = _len(code); emit(0)
    132                 emit(av[0])
    133                 emit(av[1])
    134                 _compile(code, av[2], flags)
    135                 code[skip] = _len(code) - skip
    136                 if op is MAX_REPEAT:
    137                     emit(MAX_UNTIL)
    138                 else:
    139                     emit(MIN_UNTIL)
    140         elif op is SUBPATTERN:
    141             group, add_flags, del_flags, p = av
    142             if group:
    143                 emit(MARK)
    144                 emit((group-1)*2)
    145             # _compile_info(code, p, (flags | add_flags) & ~del_flags)
    146             _compile(code, p, (flags | add_flags) & ~del_flags)
    147             if group:
    148                 emit(MARK)
    149                 emit((group-1)*2+1)
    150         elif op in SUCCESS_CODES:
    151             emit(op)
    152         elif op in ASSERT_CODES:
    153             emit(op)
    154             skip = _len(code); emit(0)
    155             if av[0] >= 0:
    156                 emit(0) # look ahead
    157             else:
    158                 lo, hi = av[1].getwidth()
    159                 if lo != hi:
    160                     raise error("look-behind requires fixed-width pattern")
    161                 emit(lo) # look behind
    162             _compile(code, av[1], flags)
    163             emit(SUCCESS)
    164             code[skip] = _len(code) - skip
    165         elif op is CALL:
    166             emit(op)
    167             skip = _len(code); emit(0)
    168             _compile(code, av, flags)
    169             emit(SUCCESS)
    170             code[skip] = _len(code) - skip
    171         elif op is AT:
    172             emit(op)
    173             if flags & SRE_FLAG_MULTILINE:
    174                 av = AT_MULTILINE.get(av, av)
    175             if flags & SRE_FLAG_LOCALE:
    176                 av = AT_LOCALE.get(av, av)
    177             elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
    178                 av = AT_UNICODE.get(av, av)
    179             emit(av)
    180         elif op is BRANCH:
    181             emit(op)
    182             tail = []
    183             tailappend = tail.append
    184             for av in av[1]:
    185                 skip = _len(code); emit(0)
    186                 # _compile_info(code, av, flags)
    187                 _compile(code, av, flags)
    188                 emit(JUMP)
    189                 tailappend(_len(code)); emit(0)
    190                 code[skip] = _len(code) - skip
    191             emit(FAILURE) # end of branch
    192             for tail in tail:
    193                 code[tail] = _len(code) - tail
    194         elif op is CATEGORY:
    195             emit(op)
    196             if flags & SRE_FLAG_LOCALE:
    197                 av = CH_LOCALE[av]
    198             elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
    199                 av = CH_UNICODE[av]
    200             emit(av)
    201         elif op is GROUPREF:
    202             if flags & SRE_FLAG_IGNORECASE:
    203                 emit(OP_IGNORE[op])
    204             else:
    205                 emit(op)
    206             emit(av-1)
    207         elif op is GROUPREF_EXISTS:
    208             emit(op)
    209             emit(av[0]-1)
    210             skipyes = _len(code); emit(0)
    211             _compile(code, av[1], flags)
    212             if av[2]:
    213                 emit(JUMP)
    214                 skipno = _len(code); emit(0)
    215                 code[skipyes] = _len(code) - skipyes + 1
    216                 _compile(code, av[2], flags)
    217                 code[skipno] = _len(code) - skipno
    218             else:
    219                 code[skipyes] = _len(code) - skipyes + 1
    220         else:
    221             raise error("internal: unsupported operand type %r" % (op,))
    222 
    223 def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    224     # compile charset subprogram
    225     emit = code.append
    226     for op, av in _optimize_charset(charset, fixup, fixes):
    227         emit(op)
    228         if op is NEGATE:
    229             pass
    230         elif op is LITERAL:
    231             emit(av)
    232         elif op is RANGE or op is RANGE_IGNORE:
    233             emit(av[0])
    234             emit(av[1])
    235         elif op is CHARSET:
    236             code.extend(av)
    237         elif op is BIGCHARSET:
    238             code.extend(av)
    239         elif op is CATEGORY:
    240             if flags & SRE_FLAG_LOCALE:
    241                 emit(CH_LOCALE[av])
    242             elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
    243                 emit(CH_UNICODE[av])
    244             else:
    245                 emit(av)
    246         else:
    247             raise error("internal: unsupported set operator %r" % (op,))
    248     emit(FAILURE)
    249 
    250 def _optimize_charset(charset, fixup, fixes):
    251     # internal: optimize character set
    252     out = []
    253     tail = []
    254     charmap = bytearray(256)
    255     for op, av in charset:
    256         while True:
    257             try:
    258                 if op is LITERAL:
    259                     if fixup:
    260                         lo = fixup(av)
    261                         charmap[lo] = 1
    262                         if fixes and lo in fixes:
    263                             for k in fixes[lo]:
    264                                 charmap[k] = 1
    265                     else:
    266                         charmap[av] = 1
    267                 elif op is RANGE:
    268                     r = range(av[0], av[1]+1)
    269                     if fixup:
    270                         r = map(fixup, r)
    271                     if fixup and fixes:
    272                         for i in r:
    273                             charmap[i] = 1
    274                             if i in fixes:
    275                                 for k in fixes[i]:
    276                                     charmap[k] = 1
    277                     else:
    278                         for i in r:
    279                             charmap[i] = 1
    280                 elif op is NEGATE:
    281                     out.append((op, av))
    282                 else:
    283                     tail.append((op, av))
    284             except IndexError:
    285                 if len(charmap) == 256:
    286                     # character set contains non-UCS1 character codes
    287                     charmap += b'\0' * 0xff00
    288                     continue
    289                 # Character set contains non-BMP character codes.
    290                 # There are only two ranges of cased non-BMP characters:
    291                 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
    292                 # and for both ranges RANGE_IGNORE works.
    293                 if fixup and op is RANGE:
    294                     op = RANGE_IGNORE
    295                 tail.append((op, av))
    296             break
    297 
    298     # compress character map
    299     runs = []
    300     q = 0
    301     while True:
    302         p = charmap.find(1, q)
    303         if p < 0:
    304             break
    305         if len(runs) >= 2:
    306             runs = None
    307             break
    308         q = charmap.find(0, p)
    309         if q < 0:
    310             runs.append((p, len(charmap)))
    311             break
    312         runs.append((p, q))
    313     if runs is not None:
    314         # use literal/range
    315         for p, q in runs:
    316             if q - p == 1:
    317                 out.append((LITERAL, p))
    318             else:
    319                 out.append((RANGE, (p, q - 1)))
    320         out += tail
    321         # if the case was changed or new representation is more compact
    322         if fixup or len(out) < len(charset):
    323             return out
    324         # else original character set is good enough
    325         return charset
    326 
    327     # use bitmap
    328     if len(charmap) == 256:
    329         data = _mk_bitmap(charmap)
    330         out.append((CHARSET, data))
    331         out += tail
    332         return out
    333 
    334     # To represent a big charset, first a bitmap of all characters in the
    335     # set is constructed. Then, this bitmap is sliced into chunks of 256
    336     # characters, duplicate chunks are eliminated, and each chunk is
    337     # given a number. In the compiled expression, the charset is
    338     # represented by a 32-bit word sequence, consisting of one word for
    339     # the number of different chunks, a sequence of 256 bytes (64 words)
    340     # of chunk numbers indexed by their original chunk position, and a
    341     # sequence of 256-bit chunks (8 words each).
    342 
    343     # Compression is normally good: in a typical charset, large ranges of
    344     # Unicode will be either completely excluded (e.g. if only cyrillic
    345     # letters are to be matched), or completely included (e.g. if large
    346     # subranges of Kanji match). These ranges will be represented by
    347     # chunks of all one-bits or all zero-bits.
    348 
    349     # Matching can be also done efficiently: the more significant byte of
    350     # the Unicode character is an index into the chunk number, and the
    351     # less significant byte is a bit index in the chunk (just like the
    352     # CHARSET matching).
    353 
    354     charmap = bytes(charmap) # should be hashable
    355     comps = {}
    356     mapping = bytearray(256)
    357     block = 0
    358     data = bytearray()
    359     for i in range(0, 65536, 256):
    360         chunk = charmap[i: i + 256]
    361         if chunk in comps:
    362             mapping[i // 256] = comps[chunk]
    363         else:
    364             mapping[i // 256] = comps[chunk] = block
    365             block += 1
    366             data += chunk
    367     data = _mk_bitmap(data)
    368     data[0:0] = [block] + _bytes_to_codes(mapping)
    369     out.append((BIGCHARSET, data))
    370     out += tail
    371     return out
    372 
    373 _CODEBITS = _sre.CODESIZE * 8
    374 MAXCODE = (1 << _CODEBITS) - 1
    375 _BITS_TRANS = b'0' + b'1' * 255
    376 def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    377     s = bits.translate(_BITS_TRANS)[::-1]
    378     return [_int(s[i - _CODEBITS: i], 2)
    379             for i in range(len(s), 0, -_CODEBITS)]
    380 
    381 def _bytes_to_codes(b):
    382     # Convert block indices to word array
    383     a = memoryview(b).cast('I')
    384     assert a.itemsize == _sre.CODESIZE
    385     assert len(a) * a.itemsize == len(b)
    386     return a.tolist()
    387 
    388 def _simple(av):
    389     # check if av is a "simple" operator
    390     lo, hi = av[2].getwidth()
    391     return lo == hi == 1 and av[2][0][0] != SUBPATTERN
    392 
    393 def _generate_overlap_table(prefix):
    394     """
    395     Generate an overlap table for the following prefix.
    396     An overlap table is a table of the same size as the prefix which
    397     informs about the potential self-overlap for each index in the prefix:
    398     - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
    399     - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
    400       prefix[0:k]
    401     """
    402     table = [0] * len(prefix)
    403     for i in range(1, len(prefix)):
    404         idx = table[i - 1]
    405         while prefix[i] != prefix[idx]:
    406             if idx == 0:
    407                 table[i] = 0
    408                 break
    409             idx = table[idx - 1]
    410         else:
    411             table[i] = idx + 1
    412     return table
    413 
    414 def _get_literal_prefix(pattern):
    415     # look for literal prefix
    416     prefix = []
    417     prefixappend = prefix.append
    418     prefix_skip = None
    419     for op, av in pattern.data:
    420         if op is LITERAL:
    421             prefixappend(av)
    422         elif op is SUBPATTERN:
    423             group, add_flags, del_flags, p = av
    424             if add_flags & SRE_FLAG_IGNORECASE:
    425                 break
    426             prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
    427             if prefix_skip is None:
    428                 if group is not None:
    429                     prefix_skip = len(prefix)
    430                 elif prefix_skip1 is not None:
    431                     prefix_skip = len(prefix) + prefix_skip1
    432             prefix.extend(prefix1)
    433             if not got_all:
    434                 break
    435         else:
    436             break
    437     else:
    438         return prefix, prefix_skip, True
    439     return prefix, prefix_skip, False
    440 
    441 def _get_charset_prefix(pattern):
    442     charset = [] # not used
    443     charsetappend = charset.append
    444     if pattern.data:
    445         op, av = pattern.data[0]
    446         if op is SUBPATTERN:
    447             group, add_flags, del_flags, p = av
    448             if p and not (add_flags & SRE_FLAG_IGNORECASE):
    449                 op, av = p[0]
    450                 if op is LITERAL:
    451                     charsetappend((op, av))
    452                 elif op is BRANCH:
    453                     c = []
    454                     cappend = c.append
    455                     for p in av[1]:
    456                         if not p:
    457                             break
    458                         op, av = p[0]
    459                         if op is LITERAL:
    460                             cappend((op, av))
    461                         else:
    462                             break
    463                     else:
    464                         charset = c
    465         elif op is BRANCH:
    466             c = []
    467             cappend = c.append
    468             for p in av[1]:
    469                 if not p:
    470                     break
    471                 op, av = p[0]
    472                 if op is LITERAL:
    473                     cappend((op, av))
    474                 else:
    475                     break
    476             else:
    477                 charset = c
    478         elif op is IN:
    479             charset = av
    480     return charset
    481 
    482 def _compile_info(code, pattern, flags):
    483     # internal: compile an info block.  in the current version,
    484     # this contains min/max pattern width, and an optional literal
    485     # prefix or a character map
    486     lo, hi = pattern.getwidth()
    487     if hi > MAXCODE:
    488         hi = MAXCODE
    489     if lo == 0:
    490         code.extend([INFO, 4, 0, lo, hi])
    491         return
    492     # look for a literal prefix
    493     prefix = []
    494     prefix_skip = 0
    495     charset = [] # not used
    496     if not (flags & SRE_FLAG_IGNORECASE):
    497         # look for literal prefix
    498         prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
    499         # if no prefix, look for charset prefix
    500         if not prefix:
    501             charset = _get_charset_prefix(pattern)
    502 ##     if prefix:
    503 ##         print("*** PREFIX", prefix, prefix_skip)
    504 ##     if charset:
    505 ##         print("*** CHARSET", charset)
    506     # add an info block
    507     emit = code.append
    508     emit(INFO)
    509     skip = len(code); emit(0)
    510     # literal flag
    511     mask = 0
    512     if prefix:
    513         mask = SRE_INFO_PREFIX
    514         if prefix_skip is None and got_all:
    515             mask = mask | SRE_INFO_LITERAL
    516     elif charset:
    517         mask = mask | SRE_INFO_CHARSET
    518     emit(mask)
    519     # pattern length
    520     if lo < MAXCODE:
    521         emit(lo)
    522     else:
    523         emit(MAXCODE)
    524         prefix = prefix[:MAXCODE]
    525     emit(min(hi, MAXCODE))
    526     # add literal prefix
    527     if prefix:
    528         emit(len(prefix)) # length
    529         if prefix_skip is None:
    530             prefix_skip =  len(prefix)
    531         emit(prefix_skip) # skip
    532         code.extend(prefix)
    533         # generate overlap table
    534         code.extend(_generate_overlap_table(prefix))
    535     elif charset:
    536         _compile_charset(charset, flags, code)
    537     code[skip] = len(code) - skip
    538 
    539 def isstring(obj):
    540     return isinstance(obj, (str, bytes))
    541 
    542 def _code(p, flags):
    543 
    544     flags = p.pattern.flags | flags
    545     code = []
    546 
    547     # compile info block
    548     _compile_info(code, p, flags)
    549 
    550     # compile the pattern
    551     _compile(code, p.data, flags)
    552 
    553     code.append(SUCCESS)
    554 
    555     return code
    556 
    557 def compile(p, flags=0):
    558     # internal: convert pattern list to internal format
    559 
    560     if isstring(p):
    561         pattern = p
    562         p = sre_parse.parse(p, flags)
    563     else:
    564         pattern = None
    565 
    566     code = _code(p, flags)
    567 
    568     # print(code)
    569 
    570     # map in either direction
    571     groupindex = p.pattern.groupdict
    572     indexgroup = [None] * p.pattern.groups
    573     for k, i in groupindex.items():
    574         indexgroup[i] = k
    575 
    576     return _sre.compile(
    577         pattern, flags | p.pattern.flags, code,
    578         p.pattern.groups-1,
    579         groupindex, indexgroup
    580         )
    581