Home | History | Annotate | Download | only in python2.7
      1 #
      2 # Secret Labs' Regular Expression Engine
      3 #
      4 # convert template to internal format
      5 #
      6 # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
      7 #
      8 # See the sre.py file for information on usage and redistribution.
      9 #
     10 
     11 """Internal support module for sre"""
     12 
     13 import _sre, sys
     14 import sre_parse
     15 from sre_constants import *
     16 from _sre import MAXREPEAT
     17 
     18 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
     19 
     20 if _sre.CODESIZE == 2:
     21     MAXCODE = 65535
     22 else:
     23     MAXCODE = 0xFFFFFFFFL
     24 
     25 def _identityfunction(x):
     26     return x
     27 
     28 _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
     29 _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
     30 _SUCCESS_CODES = set([SUCCESS, FAILURE])
     31 _ASSERT_CODES = set([ASSERT, ASSERT_NOT])
     32 
     33 def _compile(code, pattern, flags):
     34     # internal: compile a (sub)pattern
     35     emit = code.append
     36     _len = len
     37     LITERAL_CODES = _LITERAL_CODES
     38     REPEATING_CODES = _REPEATING_CODES
     39     SUCCESS_CODES = _SUCCESS_CODES
     40     ASSERT_CODES = _ASSERT_CODES
     41     for op, av in pattern:
     42         if op in LITERAL_CODES:
     43             if flags & SRE_FLAG_IGNORECASE:
     44                 emit(OPCODES[OP_IGNORE[op]])
     45                 emit(_sre.getlower(av, flags))
     46             else:
     47                 emit(OPCODES[op])
     48                 emit(av)
     49         elif op is IN:
     50             if flags & SRE_FLAG_IGNORECASE:
     51                 emit(OPCODES[OP_IGNORE[op]])
     52                 def fixup(literal, flags=flags):
     53                     return _sre.getlower(literal, flags)
     54             else:
     55                 emit(OPCODES[op])
     56                 fixup = _identityfunction
     57             skip = _len(code); emit(0)
     58             _compile_charset(av, flags, code, fixup)
     59             code[skip] = _len(code) - skip
     60         elif op is ANY:
     61             if flags & SRE_FLAG_DOTALL:
     62                 emit(OPCODES[ANY_ALL])
     63             else:
     64                 emit(OPCODES[ANY])
     65         elif op in REPEATING_CODES:
     66             if flags & SRE_FLAG_TEMPLATE:
     67                 raise error, "internal: unsupported template operator"
     68                 emit(OPCODES[REPEAT])
     69                 skip = _len(code); emit(0)
     70                 emit(av[0])
     71                 emit(av[1])
     72                 _compile(code, av[2], flags)
     73                 emit(OPCODES[SUCCESS])
     74                 code[skip] = _len(code) - skip
     75             elif _simple(av) and op is not REPEAT:
     76                 if op is MAX_REPEAT:
     77                     emit(OPCODES[REPEAT_ONE])
     78                 else:
     79                     emit(OPCODES[MIN_REPEAT_ONE])
     80                 skip = _len(code); emit(0)
     81                 emit(av[0])
     82                 emit(av[1])
     83                 _compile(code, av[2], flags)
     84                 emit(OPCODES[SUCCESS])
     85                 code[skip] = _len(code) - skip
     86             else:
     87                 emit(OPCODES[REPEAT])
     88                 skip = _len(code); emit(0)
     89                 emit(av[0])
     90                 emit(av[1])
     91                 _compile(code, av[2], flags)
     92                 code[skip] = _len(code) - skip
     93                 if op is MAX_REPEAT:
     94                     emit(OPCODES[MAX_UNTIL])
     95                 else:
     96                     emit(OPCODES[MIN_UNTIL])
     97         elif op is SUBPATTERN:
     98             if av[0]:
     99                 emit(OPCODES[MARK])
    100                 emit((av[0]-1)*2)
    101             # _compile_info(code, av[1], flags)
    102             _compile(code, av[1], flags)
    103             if av[0]:
    104                 emit(OPCODES[MARK])
    105                 emit((av[0]-1)*2+1)
    106         elif op in SUCCESS_CODES:
    107             emit(OPCODES[op])
    108         elif op in ASSERT_CODES:
    109             emit(OPCODES[op])
    110             skip = _len(code); emit(0)
    111             if av[0] >= 0:
    112                 emit(0) # look ahead
    113             else:
    114                 lo, hi = av[1].getwidth()
    115                 if lo != hi:
    116                     raise error, "look-behind requires fixed-width pattern"
    117                 emit(lo) # look behind
    118             _compile(code, av[1], flags)
    119             emit(OPCODES[SUCCESS])
    120             code[skip] = _len(code) - skip
    121         elif op is CALL:
    122             emit(OPCODES[op])
    123             skip = _len(code); emit(0)
    124             _compile(code, av, flags)
    125             emit(OPCODES[SUCCESS])
    126             code[skip] = _len(code) - skip
    127         elif op is AT:
    128             emit(OPCODES[op])
    129             if flags & SRE_FLAG_MULTILINE:
    130                 av = AT_MULTILINE.get(av, av)
    131             if flags & SRE_FLAG_LOCALE:
    132                 av = AT_LOCALE.get(av, av)
    133             elif flags & SRE_FLAG_UNICODE:
    134                 av = AT_UNICODE.get(av, av)
    135             emit(ATCODES[av])
    136         elif op is BRANCH:
    137             emit(OPCODES[op])
    138             tail = []
    139             tailappend = tail.append
    140             for av in av[1]:
    141                 skip = _len(code); emit(0)
    142                 # _compile_info(code, av, flags)
    143                 _compile(code, av, flags)
    144                 emit(OPCODES[JUMP])
    145                 tailappend(_len(code)); emit(0)
    146                 code[skip] = _len(code) - skip
    147             emit(0) # end of branch
    148             for tail in tail:
    149                 code[tail] = _len(code) - tail
    150         elif op is CATEGORY:
    151             emit(OPCODES[op])
    152             if flags & SRE_FLAG_LOCALE:
    153                 av = CH_LOCALE[av]
    154             elif flags & SRE_FLAG_UNICODE:
    155                 av = CH_UNICODE[av]
    156             emit(CHCODES[av])
    157         elif op is GROUPREF:
    158             if flags & SRE_FLAG_IGNORECASE:
    159                 emit(OPCODES[OP_IGNORE[op]])
    160             else:
    161                 emit(OPCODES[op])
    162             emit(av-1)
    163         elif op is GROUPREF_EXISTS:
    164             emit(OPCODES[op])
    165             emit(av[0]-1)
    166             skipyes = _len(code); emit(0)
    167             _compile(code, av[1], flags)
    168             if av[2]:
    169                 emit(OPCODES[JUMP])
    170                 skipno = _len(code); emit(0)
    171                 code[skipyes] = _len(code) - skipyes + 1
    172                 _compile(code, av[2], flags)
    173                 code[skipno] = _len(code) - skipno
    174             else:
    175                 code[skipyes] = _len(code) - skipyes + 1
    176         else:
    177             raise ValueError, ("unsupported operand type", op)
    178 
    179 def _compile_charset(charset, flags, code, fixup=None):
    180     # compile charset subprogram
    181     emit = code.append
    182     if fixup is None:
    183         fixup = _identityfunction
    184     for op, av in _optimize_charset(charset, fixup):
    185         emit(OPCODES[op])
    186         if op is NEGATE:
    187             pass
    188         elif op is LITERAL:
    189             emit(fixup(av))
    190         elif op is RANGE:
    191             emit(fixup(av[0]))
    192             emit(fixup(av[1]))
    193         elif op is CHARSET:
    194             code.extend(av)
    195         elif op is BIGCHARSET:
    196             code.extend(av)
    197         elif op is CATEGORY:
    198             if flags & SRE_FLAG_LOCALE:
    199                 emit(CHCODES[CH_LOCALE[av]])
    200             elif flags & SRE_FLAG_UNICODE:
    201                 emit(CHCODES[CH_UNICODE[av]])
    202             else:
    203                 emit(CHCODES[av])
    204         else:
    205             raise error, "internal: unsupported set operator"
    206     emit(OPCODES[FAILURE])
    207 
    208 def _optimize_charset(charset, fixup):
    209     # internal: optimize character set
    210     out = []
    211     outappend = out.append
    212     charmap = [0]*256
    213     try:
    214         for op, av in charset:
    215             if op is NEGATE:
    216                 outappend((op, av))
    217             elif op is LITERAL:
    218                 charmap[fixup(av)] = 1
    219             elif op is RANGE:
    220                 for i in range(fixup(av[0]), fixup(av[1])+1):
    221                     charmap[i] = 1
    222             elif op is CATEGORY:
    223                 # XXX: could append to charmap tail
    224                 return charset # cannot compress
    225     except IndexError:
    226         # character set contains unicode characters
    227         return _optimize_unicode(charset, fixup)
    228     # compress character map
    229     i = p = n = 0
    230     runs = []
    231     runsappend = runs.append
    232     for c in charmap:
    233         if c:
    234             if n == 0:
    235                 p = i
    236             n = n + 1
    237         elif n:
    238             runsappend((p, n))
    239             n = 0
    240         i = i + 1
    241     if n:
    242         runsappend((p, n))
    243     if len(runs) <= 2:
    244         # use literal/range
    245         for p, n in runs:
    246             if n == 1:
    247                 outappend((LITERAL, p))
    248             else:
    249                 outappend((RANGE, (p, p+n-1)))
    250         if len(out) < len(charset):
    251             return out
    252     else:
    253         # use bitmap
    254         data = _mk_bitmap(charmap)
    255         outappend((CHARSET, data))
    256         return out
    257     return charset
    258 
    259 def _mk_bitmap(bits):
    260     data = []
    261     dataappend = data.append
    262     if _sre.CODESIZE == 2:
    263         start = (1, 0)
    264     else:
    265         start = (1L, 0L)
    266     m, v = start
    267     for c in bits:
    268         if c:
    269             v = v + m
    270         m = m + m
    271         if m > MAXCODE:
    272             dataappend(v)
    273             m, v = start
    274     return data
    275 
    276 # To represent a big charset, first a bitmap of all characters in the
    277 # set is constructed. Then, this bitmap is sliced into chunks of 256
    278 # characters, duplicate chunks are eliminated, and each chunk is
    279 # given a number. In the compiled expression, the charset is
    280 # represented by a 16-bit word sequence, consisting of one word for
    281 # the number of different chunks, a sequence of 256 bytes (128 words)
    282 # of chunk numbers indexed by their original chunk position, and a
    283 # sequence of chunks (16 words each).
    284 
    285 # Compression is normally good: in a typical charset, large ranges of
    286 # Unicode will be either completely excluded (e.g. if only cyrillic
    287 # letters are to be matched), or completely included (e.g. if large
    288 # subranges of Kanji match). These ranges will be represented by
    289 # chunks of all one-bits or all zero-bits.
    290 
    291 # Matching can be also done efficiently: the more significant byte of
    292 # the Unicode character is an index into the chunk number, and the
    293 # less significant byte is a bit index in the chunk (just like the
    294 # CHARSET matching).
    295 
    296 # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
    297 # of the basic multilingual plane; an efficient representation
    298 # for all of UTF-16 has not yet been developed. This means,
    299 # in particular, that negated charsets cannot be represented as
    300 # bigcharsets.
    301 
    302 def _optimize_unicode(charset, fixup):
    303     try:
    304         import array
    305     except ImportError:
    306         return charset
    307     charmap = [0]*65536
    308     negate = 0
    309     try:
    310         for op, av in charset:
    311             if op is NEGATE:
    312                 negate = 1
    313             elif op is LITERAL:
    314                 charmap[fixup(av)] = 1
    315             elif op is RANGE:
    316                 for i in xrange(fixup(av[0]), fixup(av[1])+1):
    317                     charmap[i] = 1
    318             elif op is CATEGORY:
    319                 # XXX: could expand category
    320                 return charset # cannot compress
    321     except IndexError:
    322         # non-BMP characters
    323         return charset
    324     if negate:
    325         if sys.maxunicode != 65535:
    326             # XXX: negation does not work with big charsets
    327             return charset
    328         for i in xrange(65536):
    329             charmap[i] = not charmap[i]
    330     comps = {}
    331     mapping = [0]*256
    332     block = 0
    333     data = []
    334     for i in xrange(256):
    335         chunk = tuple(charmap[i*256:(i+1)*256])
    336         new = comps.setdefault(chunk, block)
    337         mapping[i] = new
    338         if new == block:
    339             block = block + 1
    340             data = data + _mk_bitmap(chunk)
    341     header = [block]
    342     if _sre.CODESIZE == 2:
    343         code = 'H'
    344     else:
    345         code = 'I'
    346     # Convert block indices to byte array of 256 bytes
    347     mapping = array.array('b', mapping).tostring()
    348     # Convert byte array to word array
    349     mapping = array.array(code, mapping)
    350     assert mapping.itemsize == _sre.CODESIZE
    351     header = header + mapping.tolist()
    352     data[0:0] = header
    353     return [(BIGCHARSET, data)]
    354 
    355 def _simple(av):
    356     # check if av is a "simple" operator
    357     lo, hi = av[2].getwidth()
    358     if lo == 0 and hi == MAXREPEAT:
    359         raise error, "nothing to repeat"
    360     return lo == hi == 1 and av[2][0][0] != SUBPATTERN
    361 
    362 def _compile_info(code, pattern, flags):
    363     # internal: compile an info block.  in the current version,
    364     # this contains min/max pattern width, and an optional literal
    365     # prefix or a character map
    366     lo, hi = pattern.getwidth()
    367     if lo == 0:
    368         return # not worth it
    369     # look for a literal prefix
    370     prefix = []
    371     prefixappend = prefix.append
    372     prefix_skip = 0
    373     charset = [] # not used
    374     charsetappend = charset.append
    375     if not (flags & SRE_FLAG_IGNORECASE):
    376         # look for literal prefix
    377         for op, av in pattern.data:
    378             if op is LITERAL:
    379                 if len(prefix) == prefix_skip:
    380                     prefix_skip = prefix_skip + 1
    381                 prefixappend(av)
    382             elif op is SUBPATTERN and len(av[1]) == 1:
    383                 op, av = av[1][0]
    384                 if op is LITERAL:
    385                     prefixappend(av)
    386                 else:
    387                     break
    388             else:
    389                 break
    390         # if no prefix, look for charset prefix
    391         if not prefix and pattern.data:
    392             op, av = pattern.data[0]
    393             if op is SUBPATTERN and av[1]:
    394                 op, av = av[1][0]
    395                 if op is LITERAL:
    396                     charsetappend((op, av))
    397                 elif op is BRANCH:
    398                     c = []
    399                     cappend = c.append
    400                     for p in av[1]:
    401                         if not p:
    402                             break
    403                         op, av = p[0]
    404                         if op is LITERAL:
    405                             cappend((op, av))
    406                         else:
    407                             break
    408                     else:
    409                         charset = c
    410             elif op is BRANCH:
    411                 c = []
    412                 cappend = c.append
    413                 for p in av[1]:
    414                     if not p:
    415                         break
    416                     op, av = p[0]
    417                     if op is LITERAL:
    418                         cappend((op, av))
    419                     else:
    420                         break
    421                 else:
    422                     charset = c
    423             elif op is IN:
    424                 charset = av
    425 ##     if prefix:
    426 ##         print "*** PREFIX", prefix, prefix_skip
    427 ##     if charset:
    428 ##         print "*** CHARSET", charset
    429     # add an info block
    430     emit = code.append
    431     emit(OPCODES[INFO])
    432     skip = len(code); emit(0)
    433     # literal flag
    434     mask = 0
    435     if prefix:
    436         mask = SRE_INFO_PREFIX
    437         if len(prefix) == prefix_skip == len(pattern.data):
    438             mask = mask + SRE_INFO_LITERAL
    439     elif charset:
    440         mask = mask + SRE_INFO_CHARSET
    441     emit(mask)
    442     # pattern length
    443     if lo < MAXCODE:
    444         emit(lo)
    445     else:
    446         emit(MAXCODE)
    447         prefix = prefix[:MAXCODE]
    448     if hi < MAXCODE:
    449         emit(hi)
    450     else:
    451         emit(0)
    452     # add literal prefix
    453     if prefix:
    454         emit(len(prefix)) # length
    455         emit(prefix_skip) # skip
    456         code.extend(prefix)
    457         # generate overlap table
    458         table = [-1] + ([0]*len(prefix))
    459         for i in xrange(len(prefix)):
    460             table[i+1] = table[i]+1
    461             while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
    462                 table[i+1] = table[table[i+1]-1]+1
    463         code.extend(table[1:]) # don't store first entry
    464     elif charset:
    465         _compile_charset(charset, flags, code)
    466     code[skip] = len(code) - skip
    467 
    468 try:
    469     unicode
    470 except NameError:
    471     STRING_TYPES = (type(""),)
    472 else:
    473     STRING_TYPES = (type(""), type(unicode("")))
    474 
    475 def isstring(obj):
    476     for tp in STRING_TYPES:
    477         if isinstance(obj, tp):
    478             return 1
    479     return 0
    480 
    481 def _code(p, flags):
    482 
    483     flags = p.pattern.flags | flags
    484     code = []
    485 
    486     # compile info block
    487     _compile_info(code, p, flags)
    488 
    489     # compile the pattern
    490     _compile(code, p.data, flags)
    491 
    492     code.append(OPCODES[SUCCESS])
    493 
    494     return code
    495 
    496 def compile(p, flags=0):
    497     # internal: convert pattern list to internal format
    498 
    499     if isstring(p):
    500         pattern = p
    501         p = sre_parse.parse(p, flags)
    502     else:
    503         pattern = None
    504 
    505     code = _code(p, flags)
    506 
    507     # print code
    508 
    509     # XXX: <fl> get rid of this limitation!
    510     if p.pattern.groups > 100:
    511         raise AssertionError(
    512             "sorry, but this version only supports 100 named groups"
    513             )
    514 
    515     # map in either direction
    516     groupindex = p.pattern.groupdict
    517     indexgroup = [None] * p.pattern.groups
    518     for k, i in groupindex.items():
    519         indexgroup[i] = k
    520 
    521     return _sre.compile(
    522         pattern, flags | p.pattern.flags, code,
    523         p.pattern.groups-1,
    524         groupindex, indexgroup
    525         )
    526