#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

# XXX: show string offset and offending character for all errors

from sre_constants import *

# characters that _parse() does not treat as plain literals
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that start a repeat (quantifier) in _parse()
REPEAT_CHARS = "*+?{"

DIGITS = frozenset("0123456789")

OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF")
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

# characters skipped by _parse() in verbose mode
WHITESPACE = frozenset(" \t\n\r\v\f")

# opcodes handled specially by SubPattern.getwidth() and the repeat parser
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
# opcodes that SubPattern.getwidth() counts as exactly one character wide
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})

# escapes that stand for a single literal character
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# escapes that stand for a position assertion (AT) or a character
# category (IN).  Note that r"\b" is in both tables: _class_escape()
# consults ESCAPES first, _escape() consults CATEGORIES first.
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# inline flag letters accepted by _parse_flags()
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

# flags that _parse_flags() refuses to turn on or off inside a
# "(?flags:...)" / "(?flags-flags:...)" group
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
                SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)


class Verbose(Exception):
    """Signal that an inline "(?x)" switched verbose mode on.

    Raised by _parse_flags() when a global inline flag group adds
    SRE_FLAG_VERBOSE and the pattern state does not have it yet;
    parse() catches it and parses the whole pattern again in verbose
    mode.
    """
    pass


class Pattern:
    """Master pattern object; keeps track of global attributes.

    Attributes:
        flags: the SRE_FLAG_* bits collected for the pattern.
        groupdict: maps group names to group ids.
        groupwidths: one entry per group (index 0 is group 0); None
            until closegroup() stores the group's (min, max) width.
        lookbehindgroups: None outside a lookbehind; inside one, the
            number of groups that existed when the lookbehind started.
    """

    def __init__(self):
        self.flags = 0
        self.groupdict = {}
        self.groupwidths = [None]  # group 0
        self.lookbehindgroups = None

    @property
    def groups(self):
        """Number of groups opened so far, counting group 0."""
        return len(self.groupwidths)

    def opengroup(self, name=None):
        """Allocate a new group id, optionally registering *name* for it.

        Raises error if the group count exceeds MAXGROUPS or if *name*
        is already registered.
        """
        gid = self.groups
        self.groupwidths.append(None)
        if self.groups > MAXGROUPS:
            raise error("too many groups")
        if name is not None:
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
                raise error("redefinition of group name %r as group %d; "
                            "was group %d" % (name, gid, ogid))
            self.groupdict[name] = gid
        return gid

    def closegroup(self, gid, p):
        """Mark group *gid* as closed by recording the width of *p*."""
        self.groupwidths[gid] = p.getwidth()

    def checkgroup(self, gid):
        """Return True if group *gid* exists and has been closed."""
        return gid < self.groups and self.groupwidths[gid] is not None

    def checklookbehindgroup(self, gid, source):
        """Validate a reference to group *gid* made inside a lookbehind.

        Does nothing outside a lookbehind.  Inside one, raises
        source.error() if the group is still open or was defined in the
        same lookbehind subpattern.
        """
        if self.lookbehindgroups is not None:
            if not self.checkgroup(gid):
                raise source.error('cannot refer to an open group')
            if gid >= self.lookbehindgroups:
                raise source.error('cannot refer to group defined in the same '
                                   'lookbehind subpattern')


class SubPattern:
    """A subpattern, in intermediate form.

    Wraps a list of (opcode, argument) tuples in *data* and remembers
    the owning Pattern in *pattern*.  *width* caches the result of
    getwidth().
    """

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None

    def dump(self, level=0):
        """Print the subpattern tree to stdout, indented by *level*."""
        nl = True
        seqtypes = (tuple, list)
        for op, av in self.data:
            print(level*" " + str(op), end='')
            if op is IN:
                # member sublanguage
                print()
                for op, a in av:
                    print((level+1)*" " + str(op), a)
            elif op is BRANCH:
                print()
                for i, a in enumerate(av[1]):
                    if i:
                        print(level*" " + "OR")
                    a.dump(level+1)
            elif op is GROUPREF_EXISTS:
                condgroup, item_yes, item_no = av
                print('', condgroup)
                item_yes.dump(level+1)
                if item_no:
                    print(level*" " + "ELSE")
                    item_no.dump(level+1)
            elif isinstance(av, seqtypes):
                # nl tracks whether the output cursor is at the start
                # of a line
                nl = False
                for a in av:
                    if isinstance(a, SubPattern):
                        if not nl:
                            print()
                        a.dump(level+1)
                        nl = True
                    else:
                        if not nl:
                            print(' ', end='')
                        print(a, end='')
                        nl = False
                if not nl:
                    print()
            else:
                print('', av)

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # a slice yields a new SubPattern sharing the same Pattern;
        # a plain index yields the raw (opcode, argument) tuple
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return the (min, max) width of this subpattern.

        The result is cached in self.width.  The minimum is capped at
        MAXREPEAT - 1 and the maximum at MAXREPEAT.
        """
        # determine the width (min, max) for this subpattern
        if self.width is not None:
            return self.width
        lo = hi = 0
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum and widest maximum over all branches
                i = MAXREPEAT - 1
                j = 0
                for av in av[1]:
                    l, h = av.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[-1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in _REPEATCODES:
                i, j = av[2].getwidth()
                lo = lo + i * av[0]
                hi = hi + j * av[1]
            elif op in _UNITCODES:
                lo = lo + 1
                hi = hi + 1
            elif op is GROUPREF:
                i, j = self.pattern.groupwidths[av]
                lo = lo + i
                hi = hi + j
            elif op is GROUPREF_EXISTS:
                i, j = av[1].getwidth()
                if av[2] is not None:
                    l, h = av[2].getwidth()
                    i = min(i, l)
                    j = max(j, h)
                else:
                    # no "else" branch: the construct may match nothing
                    i = 0
                lo = lo + i
                hi = hi + j
            elif op is SUCCESS:
                break
        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
        return self.width


class Tokenizer:
    """Split a pattern (str or bytes-like) into tokens.

    A token is a single character, or a backslash together with the
    character that follows it.  Bytes input is decoded as latin1 for
    tokenizing; the original object is kept in *string* for error
    reporting.  *next* always holds the upcoming token (None at the
    end), and *index* is the position just past it.
    """

    def __init__(self, string):
        self.istext = isinstance(string, str)
        self.string = string
        if not self.istext:
            string = str(string, 'latin1')
        self.decoded_string = string
        self.index = 0
        self.next = None
        self.__next()

    def __next(self):
        """Load the token at self.index into self.next and advance."""
        index = self.index
        try:
            char = self.decoded_string[index]
        except IndexError:
            self.next = None
            return
        if char == "\\":
            # keep the backslash and the escaped character together
            index += 1
            try:
                char += self.decoded_string[index]
            except IndexError:
                raise error("bad escape (end of pattern)",
                            self.string, len(self.string) - 1) from None
        self.index = index + 1
        self.next = char

    def match(self, char):
        """Consume the next token if it equals *char*; report success."""
        if char == self.next:
            self.__next()
            return True
        return False

    def get(self):
        """Consume and return the next token (None at end of input)."""
        this = self.next
        self.__next()
        return this

    def getwhile(self, n, charset):
        """Consume up to *n* tokens that are in *charset*; return them joined."""
        result = ''
        for _ in range(n):
            c = self.next
            if c not in charset:
                break
            result += c
            self.__next()
        return result

    def getuntil(self, terminator):
        """Consume tokens up to and including *terminator*.

        Returns the text before the terminator.  Raises self.error() if
        that text is empty or if the input ends first.
        """
        result = ''
        while True:
            c = self.next
            self.__next()
            if c is None:
                if not result:
                    raise self.error("missing group name")
                raise self.error("missing %s, unterminated name" % terminator,
                                 len(result))
            if c == terminator:
                if not result:
                    raise self.error("missing group name", 1)
                break
            result += c
        return result

    @property
    def pos(self):
        """Position of the start of the next token (same value as tell())."""
        return self.index - len(self.next or '')

    def tell(self):
        """Return the position of the start of the next token."""
        return self.index - len(self.next or '')

    def seek(self, index):
        """Restart tokenizing at position *index*."""
        self.index = index
        self.__next()

    def error(self, msg, offset=0):
        """Build (not raise) an error located *offset* before tell()."""
        return error(msg, self.string, self.tell() - offset)


def _class_escape(source, escape):
    """Translate an escape token found inside a character class.

    *escape* is the backslash token already read from *source*; further
    digits are consumed from *source* as needed.  Returns an
    (opcode, argument) tuple, or raises source.error() for an
    incomplete, out-of-range or unknown escape.
    """
    # handle escape code inside character class
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    # position assertions (AT) are not accepted inside a class
    if code and code[0] is IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            c = int(escape[2:], 16)
            chr(c) # raise ValueError for invalid code
            return LITERAL, c
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            c = int(escape[1:], 8)
            if c > 0o377:
                raise source.error('octal escape value %s outside of '
                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, c
        elif c in DIGITS:
            # "\8" / "\9": falls through to the generic "bad escape" error
            raise ValueError
        if len(escape) == 2:
            if c in ASCIILETTERS:
                raise source.error('bad escape %s' % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))


def _escape(source, escape, state):
    """Translate an escape token found outside a character class.

    *escape* is the backslash token already read from *source*; *state*
    is the Pattern used to validate group references.  Returns an
    (opcode, argument) tuple, or raises source.error() for an invalid
    escape or group reference.
    """
    # handle escape code in expression
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise source.error("incomplete escape %s" % escape, len(escape))
            return LITERAL, int(escape[2:], 16)
        elif c == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise source.error("incomplete escape %s" % escape, len(escape))
            c = int(escape[2:], 16)
            chr(c) # raise ValueError for invalid code
            return LITERAL, c
        elif c == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8)
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape += source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape += source.get()
                    c = int(escape[1:], 8)
                    if c > 0o377:
                        raise source.error('octal escape value %s outside of '
                                           'range 0-0o377' % escape,
                                           len(escape))
                    return LITERAL, c
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise source.error("cannot refer to an open group",
                                       len(escape))
                state.checklookbehindgroup(group, source)
                return GROUPREF, group
            raise source.error("invalid group reference %d" % group, len(escape) - 1)
        if len(escape) == 2:
            if c in ASCIILETTERS:
                raise source.error("bad escape %s" % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))


def _parse_sub(source, state, verbose, nested=True):
    """Parse an alternation "a|b|c" and return it as a SubPattern.

    A single alternative is returned as is.  Otherwise items that all
    alternatives start with are moved in front of the branch, and a
    branch made only of single literals is stored as an IN set.
    *nested* is accepted for the callers' benefit but not used here.
    """
    # parse an alternation: a|b|c

    items = []
    itemsappend = items.append
    sourcematch = source.match
    while True:
        itemsappend(_parse(source, state, verbose))
        if not sourcematch("|"):
            break

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while True:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpatternappend(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
    for item in items:
        if len(item) != 1 or item[0][0] is not LITERAL:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        subpatternappend((IN, [item[0] for item in items]))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern


def _parse_sub_cond(source, state, condgroup, verbose):
    """Parse the body of a conditional group "(?(group)yes|no)".

    Returns a SubPattern holding one GROUPREF_EXISTS item; the "no"
    branch is None when absent.  Raises source.error() if a third
    branch follows.
    """
    item_yes = _parse(source, state, verbose)
    if source.match("|"):
        item_no = _parse(source, state, verbose)
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    else:
        item_no = None
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern


def _parse(source, state, verbose):
    """Parse one alternative (a sequence of items) into a SubPattern.

    Reads tokens from *source* until the end of input or an unconsumed
    "|" or ")".  *state* is the Pattern that tracks groups and flags;
    *verbose* enables skipping of whitespace and "#" comments.  Raises
    source.error() for malformed input, OverflowError for repeat counts
    of MAXREPEAT or more, and lets Verbose propagate from
    _parse_flags().
    """
    # parse a simple pattern
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    _ord = ord

    while True:

        this = source.next
        if this is None:
            break # end of pattern
        if this in "|)":
            break # end of subpattern
        sourceget()

        if verbose:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while True:
                    this = sourceget()
                    if this is None or this == "\n":
                        break
                continue

        if this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        elif this not in SPECIAL_CHARS:
            subpatternappend((LITERAL, _ord(this)))

        elif this == "[":
            here = source.tell() - 1
            # character set
            set = []
            setappend = set.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                setappend((NEGATE, None))
            # check remaining characters
            # (a "]" only closes the set once something has been added
            # after this starting state)
            start = set[:]
            while True:
                this = sourceget()
                if this is None:
                    raise source.error("unterminated character set",
                                       source.tell() - here)
                if this == "]" and set != start:
                    break
                elif this[0] == "\\":
                    code1 = _class_escape(source, this)
                else:
                    code1 = LITERAL, _ord(this)
                if sourcematch("-"):
                    # potential range
                    that = sourceget()
                    if that is None:
                        raise source.error("unterminated character set",
                                           source.tell() - here)
                    if that == "]":
                        # trailing "-" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        setappend(code1)
                        setappend((LITERAL, _ord("-")))
                        break
                    if that[0] == "\\":
                        code2 = _class_escape(source, that)
                    else:
                        code2 = LITERAL, _ord(that)
                    if code1[0] != LITERAL or code2[0] != LITERAL:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    lo = code1[1]
                    hi = code2[1]
                    if hi < lo:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    setappend((RANGE, (lo, hi)))
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    setappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(set)==1 and set[0][0] is LITERAL:
                subpatternappend(set[0]) # optimization
            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
                subpatternappend((NOT_LITERAL, set[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, set))

        elif this in REPEAT_CHARS:
            # repeat previous item
            here = source.tell()
            if this == "?":
                min, max = 0, 1
            elif this == "*":
                min, max = 0, MAXREPEAT

            elif this == "+":
                min, max = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    subpatternappend((LITERAL, _ord(this)))
                    continue
                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo += sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi += sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a valid "{m,n}": treat "{" as a literal and
                    # re-read what followed it
                    subpatternappend((LITERAL, _ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min = int(lo)
                    if min >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max = int(hi)
                    if max >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max < min:
                        raise source.error("min repeat greater than max repeat",
                                           source.tell() - here)
            else:
                # report the offending token; "char" is only bound in
                # the "(" branch below
                raise AssertionError("unsupported quantifier %r" % (this,))
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] is AT):
                raise source.error("nothing to repeat",
                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
                raise source.error("multiple repeat",
                                   source.tell() - here + len(this))
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min, max, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            start = source.tell() - 1
            # group is True for a capturing group until opengroup()
            # replaces it with the group id; None means non-capturing
            group = True
            name = None
            condgroup = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
                # options
                char = sourceget()
                if char is None:
                    raise source.error("unexpected end of pattern")
                if char == "P":
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        if not state.checkgroup(gid):
                            raise source.error("cannot refer to an open group",
                                               len(name) + 1)
                        state.checklookbehindgroup(gid, source)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        raise source.error("unknown extension ?P" + char,
                                           len(char) + 2)
                elif char == ":":
                    # non-capturing group
                    group = None
                elif char == "#":
                    # comment
                    while True:
                        if source.next is None:
                            raise source.error("missing ), unterminated comment",
                                               source.tell() - start)
                        if sourceget() == ")":
                            break
                    continue
                elif char in "=!<":
                    # lookahead assertions
                    dir = 1
                    if char == "<":
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        if char not in "=!":
                            raise source.error("unknown extension ?<" + char,
                                               len(char) + 2)
                        dir = -1 # lookbehind
                        lookbehindgroups = state.lookbehindgroups
                        if lookbehindgroups is None:
                            state.lookbehindgroups = state.groups
                    p = _parse_sub(source, state, verbose)
                    if dir < 0:
                        # only the outermost lookbehind resets the marker
                        if lookbehindgroups is None:
                            state.lookbehindgroups = None
                    if not sourcematch(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - start)
                    if char == "=":
                        subpatternappend((ASSERT, (dir, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (dir, p)))
                    continue
                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")")
                    group = None
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
                            raise source.error(msg, len(condname) + 1)
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                raise ValueError
                        except ValueError:
                            msg = "bad character in group name %r" % condname
                            raise source.error(msg, len(condname) + 1) from None
                        if not condgroup:
                            raise source.error("bad group number",
                                               len(condname) + 1)
                        if condgroup >= MAXGROUPS:
                            msg = "invalid group reference %d" % condgroup
                            raise source.error(msg, len(condname) + 1)
                    state.checklookbehindgroup(condgroup, source)
                elif char in FLAGS or char == "-":
                    # flags
                    pos = source.pos
                    flags = _parse_flags(source, state, char)
                    if flags is None:  # global flags
                        if pos != 3:  # "(?x"
                            import warnings
                            warnings.warn(
                                'Flags not at the start of the expression %s%s' % (
                                    source.string[:20],  # truncate long regexes
                                    ' (truncated)' if len(source.string) > 20 else '',
                                ),
                                DeprecationWarning, stacklevel=7
                            )
                        continue
                    add_flags, del_flags = flags
                    group = None
                else:
                    raise source.error("unknown extension ?" + char,
                                       len(char) + 1)

            # parse group contents
            if group is not None:
                try:
                    group = state.opengroup(name)
                except error as err:
                    # an unnamed group has name None, so len(name) is
                    # not available; point at the opening parenthesis
                    if name is None:
                        offset = source.tell() - start
                    else:
                        offset = len(name) + 1
                    raise source.error(err.msg, offset) from None
            if condgroup:
                p = _parse_sub_cond(source, state, condgroup, verbose)
            else:
                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                               not (del_flags & SRE_FLAG_VERBOSE))
                p = _parse_sub(source, state, sub_verbose)
            if not source.match(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - start)
            if group is not None:
                state.closegroup(group, p)
            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        else:
            # report the offending token; "char" is only bound in the
            # "(" branch above
            raise AssertionError("unsupported special character %r" % (this,))

    return subpattern


def _parse_flags(source, state, char):
    """Parse the flag letters of "(?flags)", "(?flags:" or "(?flags-flags:".

    *char* is the first flag letter (or "-") already read from
    *source*.  For the global form "(?flags)" the flags are merged into
    state.flags and None is returned; Verbose is raised instead if that
    would newly switch on SRE_FLAG_VERBOSE.  For the scoped forms a
    tuple (add_flags, del_flags) is returned.  Raises source.error()
    for unknown, missing, global or contradictory flags.
    """
    sourceget = source.get
    add_flags = 0
    del_flags = 0
    if char != "-":
        while True:
            add_flags |= FLAGS[char]
            char = sourceget()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
                raise source.error(msg, len(char))
    if char == ")":
        if ((add_flags & SRE_FLAG_VERBOSE) and
            not (state.flags & SRE_FLAG_VERBOSE)):
            raise Verbose
        state.flags |= add_flags
        return None
    if add_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)
    if char == "-":
        char = sourceget()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            msg = "unknown flag" if char.isalpha() else "missing flag"
            raise source.error(msg, len(char))
        while True:
            del_flags |= FLAGS[char]
            char = sourceget()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing :"
                raise source.error(msg, len(char))
    assert char == ":"
    if del_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if add_flags & del_flags:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return add_flags, del_flags


def fix_flags(src, flags):
    """Check *flags* against the pattern type and return the fixed flags.

    For a str pattern SRE_FLAG_UNICODE is added unless SRE_FLAG_ASCII
    is set.  Raises ValueError for combinations that are not allowed:
    LOCALE with str, UNICODE with bytes, ASCII with UNICODE (str) or
    ASCII with LOCALE (bytes).
    """
    # Check and fix flags according to the type of pattern (str or bytes)
    if isinstance(src, str):
        if flags & SRE_FLAG_LOCALE:
            raise ValueError("cannot use LOCALE flag with a str pattern")
        if not flags & SRE_FLAG_ASCII:
            flags |= SRE_FLAG_UNICODE
        elif flags & SRE_FLAG_UNICODE:
            raise ValueError("ASCII and UNICODE flags are incompatible")
    else:
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("cannot use UNICODE flag with a bytes pattern")
        if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
            raise ValueError("ASCII and LOCALE flags are incompatible")
    return flags


def parse(str, flags=0, pattern=None):
    """Parse an 're' pattern into a SubPattern of (opcode, argument) tuples.

    *str* is the pattern (str or bytes-like), *flags* the initial
    SRE_FLAG_* bits and *pattern* an optional Pattern to use as parser
    state (a fresh one is created when None).  The final flags are
    stored on the returned subpattern's .pattern.  Raises error for an
    invalid pattern and ValueError for an invalid flag combination.
    With SRE_FLAG_DEBUG the parsed tree is printed.
    """
    # parse 're' pattern into list of (opcode, argument) tuples

    source = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    try:
        p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
    except Verbose:
        # the VERBOSE flag was switched on inside the pattern. to be
        # on the safe side, we'll parse the whole thing again...
        pattern = Pattern()
        pattern.flags = flags | SRE_FLAG_VERBOSE
        pattern.str = str
        source.seek(0)
        p = _parse_sub(source, pattern, True, False)

    p.pattern.flags = fix_flags(str, p.pattern.flags)

    if source.next is not None:
        # _parse() only stops early on "|" or ")", and _parse_sub()
        # consumes every "|"
        assert source.next == ")"
        raise source.error("unbalanced parenthesis")

    if flags & SRE_FLAG_DEBUG:
        p.dump()

    return p


def parse_template(source, pattern):
    """Parse an 're' replacement string into group references and literals.

    *source* is the replacement template (str or bytes-like) and
    *pattern* an object providing .groups and .groupindex.  Returns
    (groups, literals): *literals* is a list of text chunks with None
    where a group goes, and *groups* is a list of
    (index into literals, group number) pairs.  Raises error for a
    malformed template and IndexError for an unknown group name.
    """
    # parse 're' replacement string into list of literals and
    # group references
    s = Tokenizer(source)
    sget = s.get
    groups = []
    literals = []
    literal = []
    lappend = literal.append
    def addgroup(index, pos):
        # flush pending literal text, then reserve a slot for the group
        if index > pattern.groups:
            raise s.error("invalid group reference %d" % index, pos)
        if literal:
            literals.append(''.join(literal))
            del literal[:]
        groups.append((len(literals), index))
        literals.append(None)
    groupindex = pattern.groupindex
    while True:
        this = sget()
        if this is None:
            break # end of replacement string
        if this[0] == "\\":
            # group
            c = this[1]
            if c == "g":
                name = ""
                if not s.match("<"):
                    raise s.error("missing <")
                name = s.getuntil(">")
                if name.isidentifier():
                    try:
                        index = groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name %r" % name)
                else:
                    try:
                        index = int(name)
                        if index < 0:
                            raise ValueError
                    except ValueError:
                        raise s.error("bad character in group name %r" % name,
                                      len(name) + 1) from None
                    if index >= MAXGROUPS:
                        raise s.error("invalid group reference %d" % index,
                                      len(name) + 1)
                addgroup(index, len(name) + 1)
            elif c == "0":
                # octal escape starting with 0 (up to three digits)
                if s.next in OCTDIGITS:
                    this += sget()
                    if s.next in OCTDIGITS:
                        this += sget()
                lappend(chr(int(this[1:], 8) & 0xff))
            elif c in DIGITS:
                # three octal digits form an octal escape; anything
                # else is a one- or two-digit group reference
                isoctal = False
                if s.next in DIGITS:
                    this += sget()
                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        s.next in OCTDIGITS):
                        this += sget()
                        isoctal = True
                        c = int(this[1:], 8)
                        if c > 0o377:
                            raise s.error('octal escape value %s outside of '
                                          'range 0-0o377' % this, len(this))
                        lappend(chr(c))
                if not isoctal:
                    addgroup(int(this[1:]), len(this) - 1)
            else:
                try:
                    this = chr(ESCAPES[this][1])
                except KeyError:
                    # unknown escapes are kept verbatim; ASCII letters
                    # additionally trigger a deprecation warning
                    if c in ASCIILETTERS:
                        import warnings
                        warnings.warn('bad escape %s' % this,
                                      DeprecationWarning, stacklevel=4)
                lappend(this)
        else:
            lappend(this)
    if literal:
        literals.append(''.join(literal))
    if not isinstance(source, str):
        # The tokenizer implicitly decodes bytes objects as latin-1, we must
        # therefore re-encode the final representation.
        literals = [None if s is None else s.encode('latin-1') for s in literals]
    return groups, literals


def expand_template(template, match):
    """Fill a parsed template with the groups of *match*.

    *template* is a (groups, literals) pair as returned by
    parse_template(); *match* must provide .group() and .string.
    Groups that are empty or None are replaced by an empty string of
    the same type as match.string.  Raises error if match.group()
    raises IndexError.
    """
    g = match.group
    empty = match.string[:0]
    groups, literals = template
    literals = literals[:]
    try:
        for index, group in groups:
            literals[index] = g(group) or empty
    except IndexError:
        # report the group that failed, not its slot in literals
        raise error("invalid group reference %d" % group)
    return empty.join(literals)