#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre
import sre_parse
from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}

# Sets of lowercase characters which have the same uppercase.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # i
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # s
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # β
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # ε
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # π
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ς
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ﬅ
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
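
# For illustration, two entries of the mapping built above (the values follow
# directly from _equivalences; they are not hard-coded anywhere):
#
#     _ignorecase_fixes[0x73] == (0x17f,)            # 's' also matches LONG S
#     _ignorecase_fixes[0x3b9] == (0x345, 0x1fbe)    # iota maps to two codes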

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    if (flags & SRE_FLAG_IGNORECASE and
            not (flags & SRE_FLAG_LOCALE) and
            flags & SRE_FLAG_UNICODE and
            not (flags & SRE_FLAG_ASCII)):
        fixes = _ignorecase_fixes
    else:
        fixes = None
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                lo = _sre.getlower(av, flags)
                if fixes and lo in fixes:
                    emit(IN_IGNORE)
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
                else:
                    emit(OP_IGNORE[op])
                    emit(lo)
            else:
                emit(op)
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(op)
                fixup = None
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup, fixes)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                emit(REPEAT)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
            _compile(code, p, (flags | add_flags) & ~del_flags)
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
            else:
                emit(op)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))
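
# A recurring idiom in _compile and the helpers below: the length of a
# variable-sized block is only known once the block has been emitted, so a
# placeholder word is reserved up front and backpatched afterwards:
#
#     skip = _len(code); emit(0)       # reserve one word for the skip count
#     ...                              # emit the body of the block
#     code[skip] = _len(code) - skip   # patch in the distance past the block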

def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    # compile charset subprogram
    emit = code.append
    for op, av in _optimize_charset(charset, fixup, fixes):
        emit(op)
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(av)
        elif op is RANGE or op is RANGE_IGNORE:
            emit(av[0])
            emit(av[1])
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CH_LOCALE[av])
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                emit(CH_UNICODE[av])
            else:
                emit(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    emit(FAILURE)
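
# For illustration (a sketch of the layout, not output produced by running
# this file): a small class such as [0a-z] ends up in the code vector as
# op/argument pairs terminated by FAILURE, roughly
#
#     LITERAL, 0x30, RANGE, 0x61, 0x7a, FAILURE
#
# _optimize_charset below chooses between this literal/range form, a 256-bit
# CHARSET bitmap, and a BIGCHARSET table for sets with non-Latin-1 members.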

def _optimize_charset(charset, fixup, fixes):
    # internal: optimize character set
    out = []
    tail = []
    charmap = bytearray(256)
    for op, av in charset:
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        r = map(fixup, r)
                    if fixup and fixes:
                        for i in r:
                            charmap[i] = 1
                            if i in fixes:
                                for k in fixes[i]:
                                    charmap[k] = 1
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                # There are only two ranges of cased non-BMP characters:
                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                # and for both ranges RANGE_IGNORE works.
                if fixup and op is RANGE:
                    op = RANGE_IGNORE
                tail.append((op, av))
            break

    # compress character map
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if fixup or len(out) < len(charset):
            return out
        # else original character set is good enough
        return charset

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed.  Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number.  In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only Cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match).  These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can also be done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out

_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    s = bits.translate(_BITS_TRANS)[::-1]
    return [_int(s[i - _CODEBITS: i], 2)
            for i in range(len(s), 0, -_CODEBITS)]

def _bytes_to_codes(b):
    # Convert block indices to word array
    a = memoryview(b).cast('I')
    assert a.itemsize == _sre.CODESIZE
    assert len(a) * a.itemsize == len(b)
    return a.tolist()

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
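
# The helper below builds a KMP-style failure table for the literal prefix.
# For illustration (the values follow from the algorithm; they are not stored
# anywhere in this module):
#
#     _generate_overlap_table([ord(c) for c in "aabaa"]) == [0, 1, 0, 1, 2]
#
# i.e. the trailing "aa" overlaps the leading "aa" of the prefix.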

def _generate_overlap_table(prefix):
    """
    Generate an overlap table for the given prefix.
    An overlap table is a table of the same size as the prefix which
    informs about the potential self-overlap for each index in the prefix:
    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
      prefix[0:k]
    """
    table = [0] * len(prefix)
    for i in range(1, len(prefix)):
        idx = table[i - 1]
        while prefix[i] != prefix[idx]:
            if idx == 0:
                table[i] = 0
                break
            idx = table[idx - 1]
        else:
            table[i] = idx + 1
    return table

def _get_literal_prefix(pattern):
    # look for literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = None
    for op, av in pattern.data:
        if op is LITERAL:
            prefixappend(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if add_flags & SRE_FLAG_IGNORECASE:
                break
            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
            if prefix_skip is None:
                if group is not None:
                    prefix_skip = len(prefix)
                elif prefix_skip1 is not None:
                    prefix_skip = len(prefix) + prefix_skip1
            prefix.extend(prefix1)
            if not got_all:
                break
        else:
            break
    else:
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False

def _get_charset_prefix(pattern):
    charset = [] # not used
    charsetappend = charset.append
    if pattern.data:
        op, av = pattern.data[0]
        if op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if p and not (add_flags & SRE_FLAG_IGNORECASE):
                op, av = p[0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
        elif op is BRANCH:
            c = []
            cappend = c.append
            for p in av[1]:
                if not p:
                    break
                op, av = p[0]
                if op is LITERAL:
                    cappend((op, av))
                else:
                    break
            else:
                charset = c
        elif op is IN:
            charset = av
    return charset
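
# For illustration, an example of the literal-prefix scan above (not executed
# here): for the parsed form of r"hello\d", _get_literal_prefix returns the
# five character codes of "hello", a prefix_skip of None, and got_all == False,
# because the pattern does not consist of literals only.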

def _compile_info(code, pattern, flags):
    # internal: compile an info block.  In the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        code.extend([INFO, 4, 0, lo, hi])
        return
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern)
##     if prefix:
##         print("*** PREFIX", prefix, prefix_skip)
##     if charset:
##         print("*** CHARSET", charset)
    # add an info block
    emit = code.append
    emit(INFO)
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    emit(min(hi, MAXCODE))
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

def isstring(obj):
    return isinstance(obj, (str, bytes))

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(SUCCESS)

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print(code)

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
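
# Usage sketch (this mirrors what the re module does internally; the exact
# call below is illustrative, not an additional API defined by this module):
#
#     import sre_compile, sre_constants
#     p = sre_compile.compile(r"a[bc]+d", sre_constants.SRE_FLAG_IGNORECASE)
#     p.match("ABCCD")    # -> a match object, or None if there is no match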