#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys
import sre_parse
from sre_constants import *
from _sre import MAXREPEAT

# Refuse to run against a mismatched C engine: the opcode numbering used
# below must agree exactly with the one the _sre extension was built with.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# MAXCODE is the largest value that fits in a single code word of the
# engine (2-byte or 4-byte words, depending on how _sre was compiled).
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

def _identityfunction(x):
    # Default "fixup" callable: used in place of case folding when
    # SRE_FLAG_IGNORECASE is not in effect.
    return x

# Opcode groups that share a compilation strategy in _compile() below.
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Walks the parsed (op, av) pairs in `pattern` and appends code words
    # to the `code` list (recursing for nested subpatterns).  Variable
    # length constructs reserve one word up front
    # (`skip = _len(code); emit(0)`) and patch it afterwards with the
    # relative offset past the emitted body — the emit order and these
    # offset fixups are what the C engine's dispatcher relies on.
    emit = code.append
    _len = len
    # bind module-level names to locals: this loop is hot and recursive
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            # single (possibly negated) character; fold case at compile
            # time when ignorecase is on
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            # character class: emit header, reserve the skip word, then
            # compile the (optimized) set body
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    # flags captured as a default so the closure is cheap
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])  # "." also matches newline
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            # av is (min, max, subpattern)
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE: everything below this raise is historical dead
                # code, kept as-is from the original source
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                # body is exactly one character wide: use the cheaper
                # REPEAT_ONE / MIN_REPEAT_ONE form (no backtracking stack)
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat: body followed by MAX_UNTIL/MIN_UNTIL
                # (greedy vs. lazy loop closure)
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av is (group number or None, subpattern); group boundaries
            # are recorded via paired MARK opcodes (2*g and 2*g+1)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            # (?=...), (?!...), (?<=...), (?<!...); av is (direction, pattern)
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            # anchors (^, $, \b, ...); remap per MULTILINE/LOCALE/UNICODE
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            # alternation: each alternative gets a skip word to the next
            # one and ends with a JUMP past the whole branch; the JUMP
            # targets are collected in `tail` and patched at the end
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            # \d, \w, \s etc.; remap per LOCALE/UNICODE before encoding
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            # back-reference; group numbers are 0-based in the bytecode
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # (?(group)yes|no); av is (group, yes-pattern, no-pattern-or-None)
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)

def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    #
    # Emits the body of an IN/IN_IGNORE block: one or more set operators
    # (as produced by _optimize_charset), terminated by FAILURE.  `fixup`
    # is applied to literal/range endpoints (case folding, or identity).
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass  # NEGATE takes no argument
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)  # av is a pre-built bitmap word list
        elif op is BIGCHARSET:
            code.extend(av)  # av is header + mapping + chunk data
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    #
    # Tries to reduce an 8-bit character set to at most two
    # literal/range operators, or to a single 256-bit CHARSET bitmap.
    # Returns the original list unchanged when it cannot do better.
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # an ordinal >= 256 overflowed the 256-slot charmap:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map: collect (start, length) runs of set bits
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    # optimization didn't shrink anything: keep the original operators
    return charset

def _mk_bitmap(bits):
    # Pack a list of 0/1 flags into code words, low bit first, one word
    # per CODESIZE*8 bits (the word is flushed whenever the running mask
    # `m` exceeds MAXCODE).
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)  # Python 2 longs for the 4-byte-word build
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.

def _optimize_unicode(charset, fixup):
    # Build a BIGCHARSET operator for a set containing BMP characters:
    # a 65536-bit bitmap sliced into 256-char chunks, with duplicate
    # chunks shared via the `comps` table.  Returns the original charset
    # unchanged whenever it cannot be represented this way.
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # an ordinal overflowed the BMP-sized bitmap: non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        # narrow build: negation can be folded into the bitmap directly
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}          # chunk contents -> assigned chunk number
    mapping = [0]*256   # original chunk position -> chunk number
    block = 0           # next free chunk number
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first time we see this chunk: store its packed bitmap
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator, i.e. whether the repeat body
    # (av[2]) always matches exactly one character and is not a group
    # (MARK bookkeeping rules out the cheap REPEAT_ONE form)
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0     # length of the prefix the engine may skip over
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # a one-item group: its literal extends the prefix, but
                # is not skippable (prefix_skip stops growing here)
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # all alternatives must start with a literal for the
                    # charset prefix to apply (note the for/else)
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            # the whole pattern is the skippable prefix: pure literal
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length (clamped to one code word)
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)  # 0 here means "maximum width unknown/too large"
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (Knuth-Morris-Pratt style failure
        # function over the prefix, used for fast prefix scanning)
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

# STRING_TYPES holds the native string types: str only, or str plus
# unicode when this Python build has a unicode type (Python 2).
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    # Return 1 if obj is a (byte or unicode) string, else 0.
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):
    # Assemble the full code word list for a parsed pattern `p`:
    # info block, pattern body, then a final SUCCESS.

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format
    #
    # `p` may be a pattern string (parsed here) or an already-parsed
    # sre_parse pattern object.  Returns the _sre.SRE_Pattern object.

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None  # original string not available to the C object

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )