Home | History | Annotate | Download | only in Lib
      1 #

      2 # Secret Labs' Regular Expression Engine

      3 #

      4 # convert template to internal format

      5 #

      6 # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.

      7 #

      8 # See the sre.py file for information on usage and redistribution.

      9 #

     10 
     11 """Internal support module for sre"""
     12 
     13 import _sre, sys
     14 import sre_parse
     15 from sre_constants import *
     16 
     17 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
     18 
     19 if _sre.CODESIZE == 2:
     20     MAXCODE = 65535
     21 else:
     22     MAXCODE = 0xFFFFFFFFL
     23 
     24 def _identityfunction(x):
     25     return x
     26 
     27 _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
     28 _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
     29 _SUCCESS_CODES = set([SUCCESS, FAILURE])
     30 _ASSERT_CODES = set([ASSERT, ASSERT_NOT])
     31 
     32 def _compile(code, pattern, flags):
     33     # internal: compile a (sub)pattern

     34     emit = code.append
     35     _len = len
     36     LITERAL_CODES = _LITERAL_CODES
     37     REPEATING_CODES = _REPEATING_CODES
     38     SUCCESS_CODES = _SUCCESS_CODES
     39     ASSERT_CODES = _ASSERT_CODES
     40     for op, av in pattern:
     41         if op in LITERAL_CODES:
     42             if flags & SRE_FLAG_IGNORECASE:
     43                 emit(OPCODES[OP_IGNORE[op]])
     44                 emit(_sre.getlower(av, flags))
     45             else:
     46                 emit(OPCODES[op])
     47                 emit(av)
     48         elif op is IN:
     49             if flags & SRE_FLAG_IGNORECASE:
     50                 emit(OPCODES[OP_IGNORE[op]])
     51                 def fixup(literal, flags=flags):
     52                     return _sre.getlower(literal, flags)
     53             else:
     54                 emit(OPCODES[op])
     55                 fixup = _identityfunction
     56             skip = _len(code); emit(0)
     57             _compile_charset(av, flags, code, fixup)
     58             code[skip] = _len(code) - skip
     59         elif op is ANY:
     60             if flags & SRE_FLAG_DOTALL:
     61                 emit(OPCODES[ANY_ALL])
     62             else:
     63                 emit(OPCODES[ANY])
     64         elif op in REPEATING_CODES:
     65             if flags & SRE_FLAG_TEMPLATE:
     66                 raise error, "internal: unsupported template operator"
     67                 emit(OPCODES[REPEAT])
     68                 skip = _len(code); emit(0)
     69                 emit(av[0])
     70                 emit(av[1])
     71                 _compile(code, av[2], flags)
     72                 emit(OPCODES[SUCCESS])
     73                 code[skip] = _len(code) - skip
     74             elif _simple(av) and op is not REPEAT:
     75                 if op is MAX_REPEAT:
     76                     emit(OPCODES[REPEAT_ONE])
     77                 else:
     78                     emit(OPCODES[MIN_REPEAT_ONE])
     79                 skip = _len(code); emit(0)
     80                 emit(av[0])
     81                 emit(av[1])
     82                 _compile(code, av[2], flags)
     83                 emit(OPCODES[SUCCESS])
     84                 code[skip] = _len(code) - skip
     85             else:
     86                 emit(OPCODES[REPEAT])
     87                 skip = _len(code); emit(0)
     88                 emit(av[0])
     89                 emit(av[1])
     90                 _compile(code, av[2], flags)
     91                 code[skip] = _len(code) - skip
     92                 if op is MAX_REPEAT:
     93                     emit(OPCODES[MAX_UNTIL])
     94                 else:
     95                     emit(OPCODES[MIN_UNTIL])
     96         elif op is SUBPATTERN:
     97             if av[0]:
     98                 emit(OPCODES[MARK])
     99                 emit((av[0]-1)*2)
    100             # _compile_info(code, av[1], flags)

    101             _compile(code, av[1], flags)
    102             if av[0]:
    103                 emit(OPCODES[MARK])
    104                 emit((av[0]-1)*2+1)
    105         elif op in SUCCESS_CODES:
    106             emit(OPCODES[op])
    107         elif op in ASSERT_CODES:
    108             emit(OPCODES[op])
    109             skip = _len(code); emit(0)
    110             if av[0] >= 0:
    111                 emit(0) # look ahead

    112             else:
    113                 lo, hi = av[1].getwidth()
    114                 if lo != hi:
    115                     raise error, "look-behind requires fixed-width pattern"
    116                 emit(lo) # look behind

    117             _compile(code, av[1], flags)
    118             emit(OPCODES[SUCCESS])
    119             code[skip] = _len(code) - skip
    120         elif op is CALL:
    121             emit(OPCODES[op])
    122             skip = _len(code); emit(0)
    123             _compile(code, av, flags)
    124             emit(OPCODES[SUCCESS])
    125             code[skip] = _len(code) - skip
    126         elif op is AT:
    127             emit(OPCODES[op])
    128             if flags & SRE_FLAG_MULTILINE:
    129                 av = AT_MULTILINE.get(av, av)
    130             if flags & SRE_FLAG_LOCALE:
    131                 av = AT_LOCALE.get(av, av)
    132             elif flags & SRE_FLAG_UNICODE:
    133                 av = AT_UNICODE.get(av, av)
    134             emit(ATCODES[av])
    135         elif op is BRANCH:
    136             emit(OPCODES[op])
    137             tail = []
    138             tailappend = tail.append
    139             for av in av[1]:
    140                 skip = _len(code); emit(0)
    141                 # _compile_info(code, av, flags)

    142                 _compile(code, av, flags)
    143                 emit(OPCODES[JUMP])
    144                 tailappend(_len(code)); emit(0)
    145                 code[skip] = _len(code) - skip
    146             emit(0) # end of branch

    147             for tail in tail:
    148                 code[tail] = _len(code) - tail
    149         elif op is CATEGORY:
    150             emit(OPCODES[op])
    151             if flags & SRE_FLAG_LOCALE:
    152                 av = CH_LOCALE[av]
    153             elif flags & SRE_FLAG_UNICODE:
    154                 av = CH_UNICODE[av]
    155             emit(CHCODES[av])
    156         elif op is GROUPREF:
    157             if flags & SRE_FLAG_IGNORECASE:
    158                 emit(OPCODES[OP_IGNORE[op]])
    159             else:
    160                 emit(OPCODES[op])
    161             emit(av-1)
    162         elif op is GROUPREF_EXISTS:
    163             emit(OPCODES[op])
    164             emit(av[0]-1)
    165             skipyes = _len(code); emit(0)
    166             _compile(code, av[1], flags)
    167             if av[2]:
    168                 emit(OPCODES[JUMP])
    169                 skipno = _len(code); emit(0)
    170                 code[skipyes] = _len(code) - skipyes + 1
    171                 _compile(code, av[2], flags)
    172                 code[skipno] = _len(code) - skipno
    173             else:
    174                 code[skipyes] = _len(code) - skipyes + 1
    175         else:
    176             raise ValueError, ("unsupported operand type", op)
    177 
    178 def _compile_charset(charset, flags, code, fixup=None):
    179     # compile charset subprogram

    180     emit = code.append
    181     if fixup is None:
    182         fixup = _identityfunction
    183     for op, av in _optimize_charset(charset, fixup):
    184         emit(OPCODES[op])
    185         if op is NEGATE:
    186             pass
    187         elif op is LITERAL:
    188             emit(fixup(av))
    189         elif op is RANGE:
    190             emit(fixup(av[0]))
    191             emit(fixup(av[1]))
    192         elif op is CHARSET:
    193             code.extend(av)
    194         elif op is BIGCHARSET:
    195             code.extend(av)
    196         elif op is CATEGORY:
    197             if flags & SRE_FLAG_LOCALE:
    198                 emit(CHCODES[CH_LOCALE[av]])
    199             elif flags & SRE_FLAG_UNICODE:
    200                 emit(CHCODES[CH_UNICODE[av]])
    201             else:
    202                 emit(CHCODES[av])
    203         else:
    204             raise error, "internal: unsupported set operator"
    205     emit(OPCODES[FAILURE])
    206 
    207 def _optimize_charset(charset, fixup):
    208     # internal: optimize character set

    209     out = []
    210     outappend = out.append
    211     charmap = [0]*256
    212     try:
    213         for op, av in charset:
    214             if op is NEGATE:
    215                 outappend((op, av))
    216             elif op is LITERAL:
    217                 charmap[fixup(av)] = 1
    218             elif op is RANGE:
    219                 for i in range(fixup(av[0]), fixup(av[1])+1):
    220                     charmap[i] = 1
    221             elif op is CATEGORY:
    222                 # XXX: could append to charmap tail

    223                 return charset # cannot compress

    224     except IndexError:
    225         # character set contains unicode characters

    226         return _optimize_unicode(charset, fixup)
    227     # compress character map

    228     i = p = n = 0
    229     runs = []
    230     runsappend = runs.append
    231     for c in charmap:
    232         if c:
    233             if n == 0:
    234                 p = i
    235             n = n + 1
    236         elif n:
    237             runsappend((p, n))
    238             n = 0
    239         i = i + 1
    240     if n:
    241         runsappend((p, n))
    242     if len(runs) <= 2:
    243         # use literal/range

    244         for p, n in runs:
    245             if n == 1:
    246                 outappend((LITERAL, p))
    247             else:
    248                 outappend((RANGE, (p, p+n-1)))
    249         if len(out) < len(charset):
    250             return out
    251     else:
    252         # use bitmap

    253         data = _mk_bitmap(charmap)
    254         outappend((CHARSET, data))
    255         return out
    256     return charset
    257 
    258 def _mk_bitmap(bits):
    259     data = []
    260     dataappend = data.append
    261     if _sre.CODESIZE == 2:
    262         start = (1, 0)
    263     else:
    264         start = (1L, 0L)
    265     m, v = start
    266     for c in bits:
    267         if c:
    268             v = v + m
    269         m = m + m
    270         if m > MAXCODE:
    271             dataappend(v)
    272             m, v = start
    273     return data
    274 
    275 # To represent a big charset, first a bitmap of all characters in the

    276 # set is constructed. Then, this bitmap is sliced into chunks of 256

    277 # characters, duplicate chunks are eliminated, and each chunk is

    278 # given a number. In the compiled expression, the charset is

    279 # represented by a 16-bit word sequence, consisting of one word for

    280 # the number of different chunks, a sequence of 256 bytes (128 words)

    281 # of chunk numbers indexed by their original chunk position, and a

    282 # sequence of chunks (16 words each).

    283 
    284 # Compression is normally good: in a typical charset, large ranges of

    285 # Unicode will be either completely excluded (e.g. if only cyrillic

    286 # letters are to be matched), or completely included (e.g. if large

    287 # subranges of Kanji match). These ranges will be represented by

    288 # chunks of all one-bits or all zero-bits.

    289 
    290 # Matching can be also done efficiently: the more significant byte of

    291 # the Unicode character is an index into the chunk number, and the

    292 # less significant byte is a bit index in the chunk (just like the

    293 # CHARSET matching).

    294 
    295 # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets

    296 # of the basic multilingual plane; an efficient representation

    297 # for all of UTF-16 has not yet been developed. This means,

    298 # in particular, that negated charsets cannot be represented as

    299 # bigcharsets.

    300 
    301 def _optimize_unicode(charset, fixup):
    302     try:
    303         import array
    304     except ImportError:
    305         return charset
    306     charmap = [0]*65536
    307     negate = 0
    308     try:
    309         for op, av in charset:
    310             if op is NEGATE:
    311                 negate = 1
    312             elif op is LITERAL:
    313                 charmap[fixup(av)] = 1
    314             elif op is RANGE:
    315                 for i in xrange(fixup(av[0]), fixup(av[1])+1):
    316                     charmap[i] = 1
    317             elif op is CATEGORY:
    318                 # XXX: could expand category

    319                 return charset # cannot compress

    320     except IndexError:
    321         # non-BMP characters

    322         return charset
    323     if negate:
    324         if sys.maxunicode != 65535:
    325             # XXX: negation does not work with big charsets

    326             return charset
    327         for i in xrange(65536):
    328             charmap[i] = not charmap[i]
    329     comps = {}
    330     mapping = [0]*256
    331     block = 0
    332     data = []
    333     for i in xrange(256):
    334         chunk = tuple(charmap[i*256:(i+1)*256])
    335         new = comps.setdefault(chunk, block)
    336         mapping[i] = new
    337         if new == block:
    338             block = block + 1
    339             data = data + _mk_bitmap(chunk)
    340     header = [block]
    341     if _sre.CODESIZE == 2:
    342         code = 'H'
    343     else:
    344         code = 'I'
    345     # Convert block indices to byte array of 256 bytes

    346     mapping = array.array('b', mapping).tostring()
    347     # Convert byte array to word array

    348     mapping = array.array(code, mapping)
    349     assert mapping.itemsize == _sre.CODESIZE
    350     header = header + mapping.tolist()
    351     data[0:0] = header
    352     return [(BIGCHARSET, data)]
    353 
    354 def _simple(av):
    355     # check if av is a "simple" operator

    356     lo, hi = av[2].getwidth()
    357     if lo == 0 and hi == MAXREPEAT:
    358         raise error, "nothing to repeat"
    359     return lo == hi == 1 and av[2][0][0] != SUBPATTERN
    360 
    361 def _compile_info(code, pattern, flags):
    362     # internal: compile an info block.  in the current version,

    363     # this contains min/max pattern width, and an optional literal

    364     # prefix or a character map

    365     lo, hi = pattern.getwidth()
    366     if lo == 0:
    367         return # not worth it

    368     # look for a literal prefix

    369     prefix = []
    370     prefixappend = prefix.append
    371     prefix_skip = 0
    372     charset = [] # not used

    373     charsetappend = charset.append
    374     if not (flags & SRE_FLAG_IGNORECASE):
    375         # look for literal prefix

    376         for op, av in pattern.data:
    377             if op is LITERAL:
    378                 if len(prefix) == prefix_skip:
    379                     prefix_skip = prefix_skip + 1
    380                 prefixappend(av)
    381             elif op is SUBPATTERN and len(av[1]) == 1:
    382                 op, av = av[1][0]
    383                 if op is LITERAL:
    384                     prefixappend(av)
    385                 else:
    386                     break
    387             else:
    388                 break
    389         # if no prefix, look for charset prefix

    390         if not prefix and pattern.data:
    391             op, av = pattern.data[0]
    392             if op is SUBPATTERN and av[1]:
    393                 op, av = av[1][0]
    394                 if op is LITERAL:
    395                     charsetappend((op, av))
    396                 elif op is BRANCH:
    397                     c = []
    398                     cappend = c.append
    399                     for p in av[1]:
    400                         if not p:
    401                             break
    402                         op, av = p[0]
    403                         if op is LITERAL:
    404                             cappend((op, av))
    405                         else:
    406                             break
    407                     else:
    408                         charset = c
    409             elif op is BRANCH:
    410                 c = []
    411                 cappend = c.append
    412                 for p in av[1]:
    413                     if not p:
    414                         break
    415                     op, av = p[0]
    416                     if op is LITERAL:
    417                         cappend((op, av))
    418                     else:
    419                         break
    420                 else:
    421                     charset = c
    422             elif op is IN:
    423                 charset = av
    424 ##     if prefix:

    425 ##         print "*** PREFIX", prefix, prefix_skip

    426 ##     if charset:

    427 ##         print "*** CHARSET", charset

    428     # add an info block

    429     emit = code.append
    430     emit(OPCODES[INFO])
    431     skip = len(code); emit(0)
    432     # literal flag

    433     mask = 0
    434     if prefix:
    435         mask = SRE_INFO_PREFIX
    436         if len(prefix) == prefix_skip == len(pattern.data):
    437             mask = mask + SRE_INFO_LITERAL
    438     elif charset:
    439         mask = mask + SRE_INFO_CHARSET
    440     emit(mask)
    441     # pattern length

    442     if lo < MAXCODE:
    443         emit(lo)
    444     else:
    445         emit(MAXCODE)
    446         prefix = prefix[:MAXCODE]
    447     if hi < MAXCODE:
    448         emit(hi)
    449     else:
    450         emit(0)
    451     # add literal prefix

    452     if prefix:
    453         emit(len(prefix)) # length

    454         emit(prefix_skip) # skip

    455         code.extend(prefix)
    456         # generate overlap table

    457         table = [-1] + ([0]*len(prefix))
    458         for i in xrange(len(prefix)):
    459             table[i+1] = table[i]+1
    460             while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
    461                 table[i+1] = table[table[i+1]-1]+1
    462         code.extend(table[1:]) # don't store first entry

    463     elif charset:
    464         _compile_charset(charset, flags, code)
    465     code[skip] = len(code) - skip
    466 
    467 try:
    468     unicode
    469 except NameError:
    470     STRING_TYPES = (type(""),)
    471 else:
    472     STRING_TYPES = (type(""), type(unicode("")))
    473 
    474 def isstring(obj):
    475     for tp in STRING_TYPES:
    476         if isinstance(obj, tp):
    477             return 1
    478     return 0
    479 
    480 def _code(p, flags):
    481 
    482     flags = p.pattern.flags | flags
    483     code = []
    484 
    485     # compile info block

    486     _compile_info(code, p, flags)
    487 
    488     # compile the pattern

    489     _compile(code, p.data, flags)
    490 
    491     code.append(OPCODES[SUCCESS])
    492 
    493     return code
    494 
    495 def compile(p, flags=0):
    496     # internal: convert pattern list to internal format

    497 
    498     if isstring(p):
    499         pattern = p
    500         p = sre_parse.parse(p, flags)
    501     else:
    502         pattern = None
    503 
    504     code = _code(p, flags)
    505 
    506     # print code

    507 
    508     # XXX: <fl> get rid of this limitation!

    509     if p.pattern.groups > 100:
    510         raise AssertionError(
    511             "sorry, but this version only supports 100 named groups"
    512             )
    513 
    514     # map in either direction

    515     groupindex = p.pattern.groupdict
    516     indexgroup = [None] * p.pattern.groups
    517     for k, i in groupindex.items():
    518         indexgroup[i] = k
    519 
    520     return _sre.compile(
    521         pattern, flags | p.pattern.flags, code,
    522         p.pattern.groups-1,
    523         groupindex, indexgroup
    524         )
    525