1 /* 2 * Secret Labs' Regular Expression Engine 3 * 4 * regular expression matching engine 5 * 6 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7 * 8 * See the _sre.c file for information on usage and redistribution. 9 */ 10 11 /* String matching engine */ 12 13 /* This file is included three times, with different character settings */ 14 15 LOCAL(int) 16 SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) 17 { 18 /* check if pointer is at given position */ 19 20 Py_ssize_t thisp, thatp; 21 22 switch (at) { 23 24 case SRE_AT_BEGINNING: 25 case SRE_AT_BEGINNING_STRING: 26 return ((void*) ptr == state->beginning); 27 28 case SRE_AT_BEGINNING_LINE: 29 return ((void*) ptr == state->beginning || 30 SRE_IS_LINEBREAK((int) ptr[-1])); 31 32 case SRE_AT_END: 33 return (((SRE_CHAR *)state->end - ptr == 1 && 34 SRE_IS_LINEBREAK((int) ptr[0])) || 35 ((void*) ptr == state->end)); 36 37 case SRE_AT_END_LINE: 38 return ((void*) ptr == state->end || 39 SRE_IS_LINEBREAK((int) ptr[0])); 40 41 case SRE_AT_END_STRING: 42 return ((void*) ptr == state->end); 43 44 case SRE_AT_BOUNDARY: 45 if (state->beginning == state->end) 46 return 0; 47 thatp = ((void*) ptr > state->beginning) ? 48 SRE_IS_WORD((int) ptr[-1]) : 0; 49 thisp = ((void*) ptr < state->end) ? 50 SRE_IS_WORD((int) ptr[0]) : 0; 51 return thisp != thatp; 52 53 case SRE_AT_NON_BOUNDARY: 54 if (state->beginning == state->end) 55 return 0; 56 thatp = ((void*) ptr > state->beginning) ? 57 SRE_IS_WORD((int) ptr[-1]) : 0; 58 thisp = ((void*) ptr < state->end) ? 59 SRE_IS_WORD((int) ptr[0]) : 0; 60 return thisp == thatp; 61 62 case SRE_AT_LOC_BOUNDARY: 63 if (state->beginning == state->end) 64 return 0; 65 thatp = ((void*) ptr > state->beginning) ? 66 SRE_LOC_IS_WORD((int) ptr[-1]) : 0; 67 thisp = ((void*) ptr < state->end) ? 68 SRE_LOC_IS_WORD((int) ptr[0]) : 0; 69 return thisp != thatp; 70 71 case SRE_AT_LOC_NON_BOUNDARY: 72 if (state->beginning == state->end) 73 return 0; 74 thatp = ((void*) ptr > state->beginning) ? 75 SRE_LOC_IS_WORD((int) ptr[-1]) : 0; 76 thisp = ((void*) ptr < state->end) ? 77 SRE_LOC_IS_WORD((int) ptr[0]) : 0; 78 return thisp == thatp; 79 80 case SRE_AT_UNI_BOUNDARY: 81 if (state->beginning == state->end) 82 return 0; 83 thatp = ((void*) ptr > state->beginning) ? 84 SRE_UNI_IS_WORD((int) ptr[-1]) : 0; 85 thisp = ((void*) ptr < state->end) ? 86 SRE_UNI_IS_WORD((int) ptr[0]) : 0; 87 return thisp != thatp; 88 89 case SRE_AT_UNI_NON_BOUNDARY: 90 if (state->beginning == state->end) 91 return 0; 92 thatp = ((void*) ptr > state->beginning) ? 93 SRE_UNI_IS_WORD((int) ptr[-1]) : 0; 94 thisp = ((void*) ptr < state->end) ? 95 SRE_UNI_IS_WORD((int) ptr[0]) : 0; 96 return thisp == thatp; 97 98 } 99 100 return 0; 101 } 102 103 LOCAL(int) 104 SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) 105 { 106 /* check if character is a member of the given set */ 107 108 int ok = 1; 109 110 for (;;) { 111 switch (*set++) { 112 113 case SRE_OP_FAILURE: 114 return !ok; 115 116 case SRE_OP_LITERAL: 117 /* <LITERAL> <code> */ 118 if (ch == set[0]) 119 return ok; 120 set++; 121 break; 122 123 case SRE_OP_CATEGORY: 124 /* <CATEGORY> <code> */ 125 if (sre_category(set[0], (int) ch)) 126 return ok; 127 set++; 128 break; 129 130 case SRE_OP_CHARSET: 131 /* <CHARSET> <bitmap> */ 132 if (ch < 256 && 133 (set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1))))) 134 return ok; 135 set += 256/SRE_CODE_BITS; 136 break; 137 138 case SRE_OP_RANGE: 139 /* <RANGE> <lower> <upper> */ 140 if (set[0] <= ch && ch <= set[1]) 141 return ok; 142 set += 2; 143 break; 144 145 case SRE_OP_RANGE_UNI_IGNORE: 146 /* <RANGE_UNI_IGNORE> <lower> <upper> */ 147 { 148 SRE_CODE uch; 149 /* ch is already lower cased */ 150 if (set[0] <= ch && ch <= set[1]) 151 return ok; 152 uch = sre_upper_unicode(ch); 153 if (set[0] <= uch && uch <= set[1]) 154 return ok; 155 set += 2; 156 break; 157 } 158 159 case SRE_OP_NEGATE: 160 ok = !ok; 161 break; 162 163 case SRE_OP_BIGCHARSET: 164 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */ 165 { 166 Py_ssize_t count, block; 167 count = *(set++); 168 169 if (ch < 0x10000u) 170 block = ((unsigned char*)set)[ch >> 8]; 171 else 172 block = -1; 173 set += 256/sizeof(SRE_CODE); 174 if (block >=0 && 175 (set[(block * 256 + (ch & 255))/SRE_CODE_BITS] & 176 (1u << (ch & (SRE_CODE_BITS-1))))) 177 return ok; 178 set += count * (256/SRE_CODE_BITS); 179 break; 180 } 181 182 default: 183 /* internal error -- there's not much we can do about it 184 here, so let's just pretend it didn't match... */ 185 return 0; 186 } 187 } 188 } 189 190 LOCAL(int) 191 SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) 192 { 193 SRE_CODE lo, up; 194 lo = sre_lower_locale(ch); 195 if (SRE(charset)(state, set, lo)) 196 return 1; 197 198 up = sre_upper_locale(ch); 199 return up != lo && SRE(charset)(state, set, up); 200 } 201 202 LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel); 203 204 LOCAL(Py_ssize_t) 205 SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) 206 { 207 SRE_CODE chr; 208 SRE_CHAR c; 209 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr; 210 SRE_CHAR* end = (SRE_CHAR *)state->end; 211 Py_ssize_t i; 212 213 /* adjust end */ 214 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT) 215 end = ptr + maxcount; 216 217 switch (pattern[0]) { 218 219 case SRE_OP_IN: 220 /* repeated set */ 221 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); 222 while (ptr < end && SRE(charset)(state, pattern + 2, *ptr)) 223 ptr++; 224 break; 225 226 case SRE_OP_ANY: 227 /* repeated dot wildcard. */ 228 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); 229 while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) 230 ptr++; 231 break; 232 233 case SRE_OP_ANY_ALL: 234 /* repeated dot wildcard. skip to the end of the target 235 string, and backtrack from there */ 236 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr)); 237 ptr = end; 238 break; 239 240 case SRE_OP_LITERAL: 241 /* repeated literal */ 242 chr = pattern[1]; 243 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr)); 244 c = (SRE_CHAR) chr; 245 #if SIZEOF_SRE_CHAR < 4 246 if ((SRE_CODE) c != chr) 247 ; /* literal can't match: doesn't fit in char width */ 248 else 249 #endif 250 while (ptr < end && *ptr == c) 251 ptr++; 252 break; 253 254 case SRE_OP_LITERAL_IGNORE: 255 /* repeated literal */ 256 chr = pattern[1]; 257 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); 258 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr) 259 ptr++; 260 break; 261 262 case SRE_OP_LITERAL_UNI_IGNORE: 263 /* repeated literal */ 264 chr = pattern[1]; 265 TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); 266 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr) 267 ptr++; 268 break; 269 270 case SRE_OP_LITERAL_LOC_IGNORE: 271 /* repeated literal */ 272 chr = pattern[1]; 273 TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); 274 while (ptr < end && char_loc_ignore(chr, *ptr)) 275 ptr++; 276 break; 277 278 case SRE_OP_NOT_LITERAL: 279 /* repeated non-literal */ 280 chr = pattern[1]; 281 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr)); 282 c = (SRE_CHAR) chr; 283 #if SIZEOF_SRE_CHAR < 4 284 if ((SRE_CODE) c != chr) 285 ptr = end; /* literal can't match: doesn't fit in char width */ 286 else 287 #endif 288 while (ptr < end && *ptr != c) 289 ptr++; 290 break; 291 292 case SRE_OP_NOT_LITERAL_IGNORE: 293 /* repeated non-literal */ 294 chr = pattern[1]; 295 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); 296 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr) 297 ptr++; 298 break; 299 300 case SRE_OP_NOT_LITERAL_UNI_IGNORE: 301 /* repeated non-literal */ 302 chr = pattern[1]; 303 TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); 304 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr) 305 ptr++; 306 break; 307 308 case SRE_OP_NOT_LITERAL_LOC_IGNORE: 309 /* repeated non-literal */ 310 chr = pattern[1]; 311 TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); 312 while (ptr < end && !char_loc_ignore(chr, *ptr)) 313 ptr++; 314 break; 315 316 default: 317 /* repeated single character pattern */ 318 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); 319 while ((SRE_CHAR*) state->ptr < end) { 320 i = SRE(match)(state, pattern, 0); 321 if (i < 0) 322 return i; 323 if (!i) 324 break; 325 } 326 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr, 327 (SRE_CHAR*) state->ptr - ptr)); 328 return (SRE_CHAR*) state->ptr - ptr; 329 } 330 331 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr, 332 ptr - (SRE_CHAR*) state->ptr)); 333 return ptr - (SRE_CHAR*) state->ptr; 334 } 335 336 #if 0 /* not used in this release */ 337 LOCAL(int) 338 SRE(info)(SRE_STATE* state, SRE_CODE* pattern) 339 { 340 /* check if an SRE_OP_INFO block matches at the current position. 341 returns the number of SRE_CODE objects to skip if successful, 0 342 if no match */ 343 344 SRE_CHAR* end = (SRE_CHAR*) state->end; 345 SRE_CHAR* ptr = (SRE_CHAR*) state->ptr; 346 Py_ssize_t i; 347 348 /* check minimal length */ 349 if (pattern[3] && end - ptr < pattern[3]) 350 return 0; 351 352 /* check known prefix */ 353 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { 354 /* <length> <skip> <prefix data> <overlap data> */ 355 for (i = 0; i < pattern[5]; i++) 356 if ((SRE_CODE) ptr[i] != pattern[7 + i]) 357 return 0; 358 return pattern[0] + 2 * pattern[6]; 359 } 360 return pattern[0]; 361 } 362 #endif 363 364 /* The macros below should be used to protect recursive SRE(match)() 365 * calls that *failed* and do *not* return immediately (IOW, those 366 * that will backtrack). Explaining: 367 * 368 * - Recursive SRE(match)() returned true: that's usually a success 369 * (besides atypical cases like ASSERT_NOT), therefore there's no 370 * reason to restore lastmark; 371 * 372 * - Recursive SRE(match)() returned false but the current SRE(match)() 373 * is returning to the caller: If the current SRE(match)() is the 374 * top function of the recursion, returning false will be a matching 375 * failure, and it doesn't matter where lastmark is pointing to. 376 * If it's *not* the top function, it will be a recursive SRE(match)() 377 * failure by itself, and the calling SRE(match)() will have to deal 378 * with the failure by the same rules explained here (it will restore 379 * lastmark by itself if necessary); 380 * 381 * - Recursive SRE(match)() returned false, and will continue the 382 * outside 'for' loop: must be protected when breaking, since the next 383 * OP could potentially depend on lastmark; 384 * 385 * - Recursive SRE(match)() returned false, and will be called again 386 * inside a local for/while loop: must be protected between each 387 * loop iteration, since the recursive SRE(match)() could do anything, 388 * and could potentially depend on lastmark. 389 * 390 * For more information, check the discussion at SF patch #712900. 391 */ 392 #define LASTMARK_SAVE() \ 393 do { \ 394 ctx->lastmark = state->lastmark; \ 395 ctx->lastindex = state->lastindex; \ 396 } while (0) 397 #define LASTMARK_RESTORE() \ 398 do { \ 399 state->lastmark = ctx->lastmark; \ 400 state->lastindex = ctx->lastindex; \ 401 } while (0) 402 403 #define RETURN_ERROR(i) do { return i; } while(0) 404 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0) 405 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0) 406 407 #define RETURN_ON_ERROR(i) \ 408 do { if (i < 0) RETURN_ERROR(i); } while (0) 409 #define RETURN_ON_SUCCESS(i) \ 410 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0) 411 #define RETURN_ON_FAILURE(i) \ 412 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0) 413 414 #define DATA_STACK_ALLOC(state, type, ptr) \ 415 do { \ 416 alloc_pos = state->data_stack_base; \ 417 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \ 418 "(%" PY_FORMAT_SIZE_T "d)\n", \ 419 Py_STRINGIFY(type), alloc_pos, sizeof(type))); \ 420 if (sizeof(type) > state->data_stack_size - alloc_pos) { \ 421 int j = data_stack_grow(state, sizeof(type)); \ 422 if (j < 0) return j; \ 423 if (ctx_pos != -1) \ 424 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \ 425 } \ 426 ptr = (type*)(state->data_stack+alloc_pos); \ 427 state->data_stack_base += sizeof(type); \ 428 } while (0) 429 430 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \ 431 do { \ 432 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", Py_STRINGIFY(type), pos)); \ 433 ptr = (type*)(state->data_stack+pos); \ 434 } while (0) 435 436 #define DATA_STACK_PUSH(state, data, size) \ 437 do { \ 438 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \ 439 "(%" PY_FORMAT_SIZE_T "d)\n", \ 440 data, state->data_stack_base, size)); \ 441 if (size > state->data_stack_size - state->data_stack_base) { \ 442 int j = data_stack_grow(state, size); \ 443 if (j < 0) return j; \ 444 if (ctx_pos != -1) \ 445 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \ 446 } \ 447 memcpy(state->data_stack+state->data_stack_base, data, size); \ 448 state->data_stack_base += size; \ 449 } while (0) 450 451 #define DATA_STACK_POP(state, data, size, discard) \ 452 do { \ 453 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \ 454 "(%" PY_FORMAT_SIZE_T "d)\n", \ 455 data, state->data_stack_base-size, size)); \ 456 memcpy(data, state->data_stack+state->data_stack_base-size, size); \ 457 if (discard) \ 458 state->data_stack_base -= size; \ 459 } while (0) 460 461 #define DATA_STACK_POP_DISCARD(state, size) \ 462 do { \ 463 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \ 464 "(%" PY_FORMAT_SIZE_T "d)\n", \ 465 state->data_stack_base-size, size)); \ 466 state->data_stack_base -= size; \ 467 } while(0) 468 469 #define DATA_PUSH(x) \ 470 DATA_STACK_PUSH(state, (x), sizeof(*(x))) 471 #define DATA_POP(x) \ 472 DATA_STACK_POP(state, (x), sizeof(*(x)), 1) 473 #define DATA_POP_DISCARD(x) \ 474 DATA_STACK_POP_DISCARD(state, sizeof(*(x))) 475 #define DATA_ALLOC(t,p) \ 476 DATA_STACK_ALLOC(state, t, p) 477 #define DATA_LOOKUP_AT(t,p,pos) \ 478 DATA_STACK_LOOKUP_AT(state,t,p,pos) 479 480 #define MARK_PUSH(lastmark) \ 481 do if (lastmark > 0) { \ 482 i = lastmark; /* ctx->lastmark may change if reallocated */ \ 483 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ 484 } while (0) 485 #define MARK_POP(lastmark) \ 486 do if (lastmark > 0) { \ 487 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ 488 } while (0) 489 #define MARK_POP_KEEP(lastmark) \ 490 do if (lastmark > 0) { \ 491 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ 492 } while (0) 493 #define MARK_POP_DISCARD(lastmark) \ 494 do if (lastmark > 0) { \ 495 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ 496 } while (0) 497 498 #define JUMP_NONE 0 499 #define JUMP_MAX_UNTIL_1 1 500 #define JUMP_MAX_UNTIL_2 2 501 #define JUMP_MAX_UNTIL_3 3 502 #define JUMP_MIN_UNTIL_1 4 503 #define JUMP_MIN_UNTIL_2 5 504 #define JUMP_MIN_UNTIL_3 6 505 #define JUMP_REPEAT 7 506 #define JUMP_REPEAT_ONE_1 8 507 #define JUMP_REPEAT_ONE_2 9 508 #define JUMP_MIN_REPEAT_ONE 10 509 #define JUMP_BRANCH 11 510 #define JUMP_ASSERT 12 511 #define JUMP_ASSERT_NOT 13 512 513 #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \ 514 DATA_ALLOC(SRE(match_context), nextctx); \ 515 nextctx->last_ctx_pos = ctx_pos; \ 516 nextctx->jump = jumpvalue; \ 517 nextctx->pattern = nextpattern; \ 518 nextctx->toplevel = toplevel_; \ 519 ctx_pos = alloc_pos; \ 520 ctx = nextctx; \ 521 goto entrance; \ 522 jumplabel: \ 523 while (0) /* gcc doesn't like labels at end of scopes */ \ 524 525 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ 526 DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel) 527 528 #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \ 529 DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) 530 531 typedef struct { 532 Py_ssize_t last_ctx_pos; 533 Py_ssize_t jump; 534 SRE_CHAR* ptr; 535 SRE_CODE* pattern; 536 Py_ssize_t count; 537 Py_ssize_t lastmark; 538 Py_ssize_t lastindex; 539 union { 540 SRE_CODE chr; 541 SRE_REPEAT* rep; 542 } u; 543 int toplevel; 544 } SRE(match_context); 545 546 /* check if string matches the given pattern. returns <0 for 547 error, 0 for failure, and 1 for success */ 548 LOCAL(Py_ssize_t) 549 SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) 550 { 551 SRE_CHAR* end = (SRE_CHAR *)state->end; 552 Py_ssize_t alloc_pos, ctx_pos = -1; 553 Py_ssize_t i, ret = 0; 554 Py_ssize_t jump; 555 unsigned int sigcount=0; 556 557 SRE(match_context)* ctx; 558 SRE(match_context)* nextctx; 559 560 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr)); 561 562 DATA_ALLOC(SRE(match_context), ctx); 563 ctx->last_ctx_pos = -1; 564 ctx->jump = JUMP_NONE; 565 ctx->pattern = pattern; 566 ctx->toplevel = toplevel; 567 ctx_pos = alloc_pos; 568 569 entrance: 570 571 ctx->ptr = (SRE_CHAR *)state->ptr; 572 573 if (ctx->pattern[0] == SRE_OP_INFO) { 574 /* optimization info block */ 575 /* <INFO> <1=skip> <2=flags> <3=min> ... */ 576 if (ctx->pattern[3] && (uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) { 577 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, " 578 "need %" PY_FORMAT_SIZE_T "d)\n", 579 end - ctx->ptr, (Py_ssize_t) ctx->pattern[3])); 580 RETURN_FAILURE; 581 } 582 ctx->pattern += ctx->pattern[1] + 1; 583 } 584 585 for (;;) { 586 ++sigcount; 587 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals()) 588 RETURN_ERROR(SRE_ERROR_INTERRUPTED); 589 590 switch (*ctx->pattern++) { 591 592 case SRE_OP_MARK: 593 /* set mark */ 594 /* <MARK> <gid> */ 595 TRACE(("|%p|%p|MARK %d\n", ctx->pattern, 596 ctx->ptr, ctx->pattern[0])); 597 i = ctx->pattern[0]; 598 if (i & 1) 599 state->lastindex = i/2 + 1; 600 if (i > state->lastmark) { 601 /* state->lastmark is the highest valid index in the 602 state->mark array. If it is increased by more than 1, 603 the intervening marks must be set to NULL to signal 604 that these marks have not been encountered. */ 605 Py_ssize_t j = state->lastmark + 1; 606 while (j < i) 607 state->mark[j++] = NULL; 608 state->lastmark = i; 609 } 610 state->mark[i] = ctx->ptr; 611 ctx->pattern++; 612 break; 613 614 case SRE_OP_LITERAL: 615 /* match literal string */ 616 /* <LITERAL> <code> */ 617 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern, 618 ctx->ptr, *ctx->pattern)); 619 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0]) 620 RETURN_FAILURE; 621 ctx->pattern++; 622 ctx->ptr++; 623 break; 624 625 case SRE_OP_NOT_LITERAL: 626 /* match anything that is not literal character */ 627 /* <NOT_LITERAL> <code> */ 628 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern, 629 ctx->ptr, *ctx->pattern)); 630 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0]) 631 RETURN_FAILURE; 632 ctx->pattern++; 633 ctx->ptr++; 634 break; 635 636 case SRE_OP_SUCCESS: 637 /* end of pattern */ 638 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); 639 if (ctx->toplevel && 640 ((state->match_all && ctx->ptr != state->end) || 641 (state->must_advance && ctx->ptr == state->start))) 642 { 643 RETURN_FAILURE; 644 } 645 state->ptr = ctx->ptr; 646 RETURN_SUCCESS; 647 648 case SRE_OP_AT: 649 /* match at given position */ 650 /* <AT> <code> */ 651 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); 652 if (!SRE(at)(state, ctx->ptr, *ctx->pattern)) 653 RETURN_FAILURE; 654 ctx->pattern++; 655 break; 656 657 case SRE_OP_CATEGORY: 658 /* match at given category */ 659 /* <CATEGORY> <code> */ 660 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern, 661 ctx->ptr, *ctx->pattern)); 662 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0])) 663 RETURN_FAILURE; 664 ctx->pattern++; 665 ctx->ptr++; 666 break; 667 668 case SRE_OP_ANY: 669 /* match anything (except a newline) */ 670 /* <ANY> */ 671 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr)); 672 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0])) 673 RETURN_FAILURE; 674 ctx->ptr++; 675 break; 676 677 case SRE_OP_ANY_ALL: 678 /* match anything */ 679 /* <ANY_ALL> */ 680 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); 681 if (ctx->ptr >= end) 682 RETURN_FAILURE; 683 ctx->ptr++; 684 break; 685 686 case SRE_OP_IN: 687 /* match set member (or non_member) */ 688 /* <IN> <skip> <set> */ 689 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); 690 if (ctx->ptr >= end || 691 !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr)) 692 RETURN_FAILURE; 693 ctx->pattern += ctx->pattern[0]; 694 ctx->ptr++; 695 break; 696 697 case SRE_OP_LITERAL_IGNORE: 698 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", 699 ctx->pattern, ctx->ptr, ctx->pattern[0])); 700 if (ctx->ptr >= end || 701 sre_lower_ascii(*ctx->ptr) != *ctx->pattern) 702 RETURN_FAILURE; 703 ctx->pattern++; 704 ctx->ptr++; 705 break; 706 707 case SRE_OP_LITERAL_UNI_IGNORE: 708 TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n", 709 ctx->pattern, ctx->ptr, ctx->pattern[0])); 710 if (ctx->ptr >= end || 711 sre_lower_unicode(*ctx->ptr) != *ctx->pattern) 712 RETURN_FAILURE; 713 ctx->pattern++; 714 ctx->ptr++; 715 break; 716 717 case SRE_OP_LITERAL_LOC_IGNORE: 718 TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n", 719 ctx->pattern, ctx->ptr, ctx->pattern[0])); 720 if (ctx->ptr >= end 721 || !char_loc_ignore(*ctx->pattern, *ctx->ptr)) 722 RETURN_FAILURE; 723 ctx->pattern++; 724 ctx->ptr++; 725 break; 726 727 case SRE_OP_NOT_LITERAL_IGNORE: 728 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", 729 ctx->pattern, ctx->ptr, *ctx->pattern)); 730 if (ctx->ptr >= end || 731 sre_lower_ascii(*ctx->ptr) == *ctx->pattern) 732 RETURN_FAILURE; 733 ctx->pattern++; 734 ctx->ptr++; 735 break; 736 737 case SRE_OP_NOT_LITERAL_UNI_IGNORE: 738 TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n", 739 ctx->pattern, ctx->ptr, *ctx->pattern)); 740 if (ctx->ptr >= end || 741 sre_lower_unicode(*ctx->ptr) == *ctx->pattern) 742 RETURN_FAILURE; 743 ctx->pattern++; 744 ctx->ptr++; 745 break; 746 747 case SRE_OP_NOT_LITERAL_LOC_IGNORE: 748 TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n", 749 ctx->pattern, ctx->ptr, *ctx->pattern)); 750 if (ctx->ptr >= end 751 || char_loc_ignore(*ctx->pattern, *ctx->ptr)) 752 RETURN_FAILURE; 753 ctx->pattern++; 754 ctx->ptr++; 755 break; 756 757 case SRE_OP_IN_IGNORE: 758 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); 759 if (ctx->ptr >= end 760 || !SRE(charset)(state, ctx->pattern+1, 761 (SRE_CODE)sre_lower_ascii(*ctx->ptr))) 762 RETURN_FAILURE; 763 ctx->pattern += ctx->pattern[0]; 764 ctx->ptr++; 765 break; 766 767 case SRE_OP_IN_UNI_IGNORE: 768 TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr)); 769 if (ctx->ptr >= end 770 || !SRE(charset)(state, ctx->pattern+1, 771 (SRE_CODE)sre_lower_unicode(*ctx->ptr))) 772 RETURN_FAILURE; 773 ctx->pattern += ctx->pattern[0]; 774 ctx->ptr++; 775 break; 776 777 case SRE_OP_IN_LOC_IGNORE: 778 TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr)); 779 if (ctx->ptr >= end 780 || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr)) 781 RETURN_FAILURE; 782 ctx->pattern += ctx->pattern[0]; 783 ctx->ptr++; 784 break; 785 786 case SRE_OP_JUMP: 787 case SRE_OP_INFO: 788 /* jump forward */ 789 /* <JUMP> <offset> */ 790 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern, 791 ctx->ptr, ctx->pattern[0])); 792 ctx->pattern += ctx->pattern[0]; 793 break; 794 795 case SRE_OP_BRANCH: 796 /* alternation */ 797 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */ 798 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); 799 LASTMARK_SAVE(); 800 ctx->u.rep = state->repeat; 801 if (ctx->u.rep) 802 MARK_PUSH(ctx->lastmark); 803 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { 804 if (ctx->pattern[1] == SRE_OP_LITERAL && 805 (ctx->ptr >= end || 806 (SRE_CODE) *ctx->ptr != ctx->pattern[2])) 807 continue; 808 if (ctx->pattern[1] == SRE_OP_IN && 809 (ctx->ptr >= end || 810 !SRE(charset)(state, ctx->pattern + 3, 811 (SRE_CODE) *ctx->ptr))) 812 continue; 813 state->ptr = ctx->ptr; 814 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); 815 if (ret) { 816 if (ctx->u.rep) 817 MARK_POP_DISCARD(ctx->lastmark); 818 RETURN_ON_ERROR(ret); 819 RETURN_SUCCESS; 820 } 821 if (ctx->u.rep) 822 MARK_POP_KEEP(ctx->lastmark); 823 LASTMARK_RESTORE(); 824 } 825 if (ctx->u.rep) 826 MARK_POP_DISCARD(ctx->lastmark); 827 RETURN_FAILURE; 828 829 case SRE_OP_REPEAT_ONE: 830 /* match repeated sequence (maximizing regexp) */ 831 832 /* this operator only works if the repeated item is 833 exactly one character wide, and we're not already 834 collecting backtracking points. for other cases, 835 use the MAX_REPEAT operator */ 836 837 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ 838 839 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, 840 ctx->pattern[1], ctx->pattern[2])); 841 842 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr) 843 RETURN_FAILURE; /* cannot match */ 844 845 state->ptr = ctx->ptr; 846 847 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]); 848 RETURN_ON_ERROR(ret); 849 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); 850 ctx->count = ret; 851 ctx->ptr += ctx->count; 852 853 /* when we arrive here, count contains the number of 854 matches, and ctx->ptr points to the tail of the target 855 string. check if the rest of the pattern matches, 856 and backtrack if not. */ 857 858 if (ctx->count < (Py_ssize_t) ctx->pattern[1]) 859 RETURN_FAILURE; 860 861 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && 862 ctx->ptr == state->end && 863 !(ctx->toplevel && state->must_advance && ctx->ptr == state->start)) 864 { 865 /* tail is empty. we're finished */ 866 state->ptr = ctx->ptr; 867 RETURN_SUCCESS; 868 } 869 870 LASTMARK_SAVE(); 871 872 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { 873 /* tail starts with a literal. skip positions where 874 the rest of the pattern cannot possibly match */ 875 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; 876 for (;;) { 877 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] && 878 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) { 879 ctx->ptr--; 880 ctx->count--; 881 } 882 if (ctx->count < (Py_ssize_t) ctx->pattern[1]) 883 break; 884 state->ptr = ctx->ptr; 885 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, 886 ctx->pattern+ctx->pattern[0]); 887 if (ret) { 888 RETURN_ON_ERROR(ret); 889 RETURN_SUCCESS; 890 } 891 892 LASTMARK_RESTORE(); 893 894 ctx->ptr--; 895 ctx->count--; 896 } 897 898 } else { 899 /* general case */ 900 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { 901 state->ptr = ctx->ptr; 902 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, 903 ctx->pattern+ctx->pattern[0]); 904 if (ret) { 905 RETURN_ON_ERROR(ret); 906 RETURN_SUCCESS; 907 } 908 ctx->ptr--; 909 ctx->count--; 910 LASTMARK_RESTORE(); 911 } 912 } 913 RETURN_FAILURE; 914 915 case SRE_OP_MIN_REPEAT_ONE: 916 /* match repeated sequence (minimizing regexp) */ 917 918 /* this operator only works if the repeated item is 919 exactly one character wide, and we're not already 920 collecting backtracking points. for other cases, 921 use the MIN_REPEAT operator */ 922 923 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ 924 925 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, 926 ctx->pattern[1], ctx->pattern[2])); 927 928 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr) 929 RETURN_FAILURE; /* cannot match */ 930 931 state->ptr = ctx->ptr; 932 933 if (ctx->pattern[1] == 0) 934 ctx->count = 0; 935 else { 936 /* count using pattern min as the maximum */ 937 ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]); 938 RETURN_ON_ERROR(ret); 939 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); 940 if (ret < (Py_ssize_t) ctx->pattern[1]) 941 /* didn't match minimum number of times */ 942 RETURN_FAILURE; 943 /* advance past minimum matches of repeat */ 944 ctx->count = ret; 945 ctx->ptr += ctx->count; 946 } 947 948 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && 949 !(ctx->toplevel && 950 ((state->match_all && ctx->ptr != state->end) || 951 (state->must_advance && ctx->ptr == state->start)))) 952 { 953 /* tail is empty. we're finished */ 954 state->ptr = ctx->ptr; 955 RETURN_SUCCESS; 956 957 } else { 958 /* general case */ 959 LASTMARK_SAVE(); 960 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT 961 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { 962 state->ptr = ctx->ptr; 963 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, 964 ctx->pattern+ctx->pattern[0]); 965 if (ret) { 966 RETURN_ON_ERROR(ret); 967 RETURN_SUCCESS; 968 } 969 state->ptr = ctx->ptr; 970 ret = SRE(count)(state, ctx->pattern+3, 1); 971 RETURN_ON_ERROR(ret); 972 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); 973 if (ret == 0) 974 break; 975 assert(ret == 1); 976 ctx->ptr++; 977 ctx->count++; 978 LASTMARK_RESTORE(); 979 } 980 } 981 RETURN_FAILURE; 982 983 case SRE_OP_REPEAT: 984 /* create repeat context. all the hard work is done 985 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ 986 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */ 987 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, 988 ctx->pattern[1], ctx->pattern[2])); 989 990 /* install new repeat context */ 991 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep)); 992 if (!ctx->u.rep) { 993 PyErr_NoMemory(); 994 RETURN_FAILURE; 995 } 996 ctx->u.rep->count = -1; 997 ctx->u.rep->pattern = ctx->pattern; 998 ctx->u.rep->prev = state->repeat; 999 ctx->u.rep->last_ptr = NULL; 1000 state->repeat = ctx->u.rep; 1001 1002 state->ptr = ctx->ptr; 1003 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); 1004 state->repeat = ctx->u.rep->prev; 1005 PyObject_FREE(ctx->u.rep); 1006 1007 if (ret) { 1008 RETURN_ON_ERROR(ret); 1009 RETURN_SUCCESS; 1010 } 1011 RETURN_FAILURE; 1012 1013 case SRE_OP_MAX_UNTIL: 1014 /* maximizing repeat */ 1015 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */ 1016 1017 /* FIXME: we probably need to deal with zero-width 1018 matches in here... */ 1019 1020 ctx->u.rep = state->repeat; 1021 if (!ctx->u.rep) 1022 RETURN_ERROR(SRE_ERROR_STATE); 1023 1024 state->ptr = ctx->ptr; 1025 1026 ctx->count = ctx->u.rep->count+1; 1027 1028 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern, 1029 ctx->ptr, ctx->count)); 1030 1031 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) { 1032 /* not enough matches */ 1033 ctx->u.rep->count = ctx->count; 1034 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, 1035 ctx->u.rep->pattern+3); 1036 if (ret) { 1037 RETURN_ON_ERROR(ret); 1038 RETURN_SUCCESS; 1039 } 1040 ctx->u.rep->count = ctx->count-1; 1041 state->ptr = ctx->ptr; 1042 RETURN_FAILURE; 1043 } 1044 1045 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] || 1046 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) && 1047 state->ptr != ctx->u.rep->last_ptr) { 1048 /* we may have enough matches, but if we can 1049 match another item, do so */ 1050 ctx->u.rep->count = ctx->count; 1051 LASTMARK_SAVE(); 1052 MARK_PUSH(ctx->lastmark); 1053 /* zero-width match protection */ 1054 DATA_PUSH(&ctx->u.rep->last_ptr); 1055 ctx->u.rep->last_ptr = state->ptr; 1056 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, 1057 ctx->u.rep->pattern+3); 1058 DATA_POP(&ctx->u.rep->last_ptr); 1059 if (ret) { 1060 MARK_POP_DISCARD(ctx->lastmark); 1061 RETURN_ON_ERROR(ret); 1062 RETURN_SUCCESS; 1063 } 1064 MARK_POP(ctx->lastmark); 1065 LASTMARK_RESTORE(); 1066 ctx->u.rep->count = ctx->count-1; 1067 state->ptr = ctx->ptr; 1068 } 1069 1070 /* cannot match more repeated items here. make sure the 1071 tail matches */ 1072 state->repeat = ctx->u.rep->prev; 1073 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); 1074 RETURN_ON_SUCCESS(ret); 1075 state->repeat = ctx->u.rep; 1076 state->ptr = ctx->ptr; 1077 RETURN_FAILURE; 1078 1079 case SRE_OP_MIN_UNTIL: 1080 /* minimizing repeat */ 1081 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */ 1082 1083 ctx->u.rep = state->repeat; 1084 if (!ctx->u.rep) 1085 RETURN_ERROR(SRE_ERROR_STATE); 1086 1087 state->ptr = ctx->ptr; 1088 1089 ctx->count = ctx->u.rep->count+1; 1090 1091 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern, 1092 ctx->ptr, ctx->count, ctx->u.rep->pattern)); 1093 1094 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) { 1095 /* not enough matches */ 1096 ctx->u.rep->count = ctx->count; 1097 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, 1098 ctx->u.rep->pattern+3); 1099 if (ret) { 1100 RETURN_ON_ERROR(ret); 1101 RETURN_SUCCESS; 1102 } 1103 ctx->u.rep->count = ctx->count-1; 1104 state->ptr = ctx->ptr; 1105 RETURN_FAILURE; 1106 } 1107 1108 LASTMARK_SAVE(); 1109 1110 /* see if the tail matches */ 1111 state->repeat = ctx->u.rep->prev; 1112 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); 1113 if (ret) { 1114 RETURN_ON_ERROR(ret); 1115 RETURN_SUCCESS; 1116 } 1117 1118 state->repeat = ctx->u.rep; 1119 state->ptr = ctx->ptr; 1120 1121 LASTMARK_RESTORE(); 1122 1123 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2] 1124 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) || 1125 state->ptr == ctx->u.rep->last_ptr) 1126 RETURN_FAILURE; 1127 1128 ctx->u.rep->count = ctx->count; 1129 /* zero-width match protection */ 1130 DATA_PUSH(&ctx->u.rep->last_ptr); 1131 ctx->u.rep->last_ptr = state->ptr; 1132 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, 1133 ctx->u.rep->pattern+3); 1134 DATA_POP(&ctx->u.rep->last_ptr); 1135 if (ret) { 1136 RETURN_ON_ERROR(ret); 1137 RETURN_SUCCESS; 1138 } 1139 ctx->u.rep->count = ctx->count-1; 1140 state->ptr = ctx->ptr; 1141 RETURN_FAILURE; 1142 1143 case SRE_OP_GROUPREF: 1144 /* match backreference */ 1145 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, 1146 ctx->ptr, ctx->pattern[0])); 1147 i = ctx->pattern[0]; 1148 { 1149 Py_ssize_t groupref = i+i; 1150 if (groupref >= state->lastmark) { 1151 RETURN_FAILURE; 1152 } else { 1153 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1154 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1155 if (!p || !e || e < p) 1156 RETURN_FAILURE; 1157 while (p < e) { 1158 if (ctx->ptr >= end || *ctx->ptr != *p) 1159 RETURN_FAILURE; 1160 p++; 1161 ctx->ptr++; 1162 } 1163 } 1164 } 1165 ctx->pattern++; 1166 break; 1167 1168 case SRE_OP_GROUPREF_IGNORE: 1169 /* match backreference */ 1170 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern, 1171 ctx->ptr, ctx->pattern[0])); 1172 i = ctx->pattern[0]; 1173 { 1174 Py_ssize_t groupref = i+i; 1175 if (groupref >= state->lastmark) { 1176 RETURN_FAILURE; 1177 } else { 1178 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1179 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1180 if (!p || !e || e < p) 1181 RETURN_FAILURE; 1182 while (p < e) { 1183 if (ctx->ptr >= end || 1184 sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p)) 1185 RETURN_FAILURE; 1186 p++; 1187 ctx->ptr++; 1188 } 1189 } 1190 } 1191 ctx->pattern++; 1192 break; 1193 1194 case SRE_OP_GROUPREF_UNI_IGNORE: 1195 /* match backreference */ 1196 TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern, 1197 ctx->ptr, ctx->pattern[0])); 1198 i = ctx->pattern[0]; 1199 { 1200 Py_ssize_t groupref = i+i; 1201 if (groupref >= state->lastmark) { 1202 RETURN_FAILURE; 1203 } else { 1204 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1205 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1206 if (!p || !e || e < p) 1207 RETURN_FAILURE; 1208 while (p < e) { 1209 if (ctx->ptr >= end || 1210 sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p)) 1211 RETURN_FAILURE; 1212 p++; 1213 ctx->ptr++; 1214 } 1215 } 1216 } 1217 ctx->pattern++; 1218 break; 1219 1220 case SRE_OP_GROUPREF_LOC_IGNORE: 1221 /* match backreference */ 1222 TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern, 1223 ctx->ptr, ctx->pattern[0])); 1224 i = ctx->pattern[0]; 1225 { 1226 Py_ssize_t groupref = i+i; 1227 if (groupref >= state->lastmark) { 1228 RETURN_FAILURE; 1229 } else { 1230 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1231 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1232 if (!p || !e || e < p) 1233 RETURN_FAILURE; 1234 while (p < e) { 1235 if (ctx->ptr >= end || 1236 sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p)) 1237 RETURN_FAILURE; 1238 p++; 1239 ctx->ptr++; 1240 } 1241 } 1242 } 1243 ctx->pattern++; 1244 break; 1245 1246 case SRE_OP_GROUPREF_EXISTS: 1247 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern, 1248 ctx->ptr, ctx->pattern[0])); 1249 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */ 1250 i = ctx->pattern[0]; 1251 { 1252 Py_ssize_t groupref = i+i; 1253 if (groupref >= state->lastmark) { 1254 ctx->pattern += ctx->pattern[1]; 1255 break; 1256 } else { 1257 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; 1258 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; 1259 if (!p || !e || e < p) { 1260 ctx->pattern += ctx->pattern[1]; 1261 break; 1262 } 1263 } 1264 } 1265 ctx->pattern += 2; 1266 break; 1267 1268 case SRE_OP_ASSERT: 1269 /* assert subpattern */ 1270 /* <ASSERT> <skip> <back> <pattern> */ 1271 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, 1272 ctx->ptr, ctx->pattern[1])); 1273 if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1]) 1274 RETURN_FAILURE; 1275 state->ptr = ctx->ptr - ctx->pattern[1]; 1276 DO_JUMP0(JUMP_ASSERT, jump_assert, ctx->pattern+2); 1277 RETURN_ON_FAILURE(ret); 1278 ctx->pattern += ctx->pattern[0]; 1279 break; 1280 1281 case SRE_OP_ASSERT_NOT: 1282 /* assert not subpattern */ 1283 /* <ASSERT_NOT> <skip> <back> <pattern> */ 1284 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, 1285 ctx->ptr, ctx->pattern[1])); 1286 if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { 1287 state->ptr = ctx->ptr - ctx->pattern[1]; 1288 DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); 1289 if (ret) { 1290 RETURN_ON_ERROR(ret); 1291 RETURN_FAILURE; 1292 } 1293 } 1294 ctx->pattern += ctx->pattern[0]; 1295 break; 1296 1297 case SRE_OP_FAILURE: 1298 /* immediate failure */ 1299 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr)); 1300 RETURN_FAILURE; 1301 1302 default: 1303 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr, 1304 ctx->pattern[-1])); 1305 RETURN_ERROR(SRE_ERROR_ILLEGAL); 1306 } 1307 } 1308 1309 exit: 1310 ctx_pos = ctx->last_ctx_pos; 1311 jump = ctx->jump; 1312 DATA_POP_DISCARD(ctx); 1313 if (ctx_pos == -1) 1314 return ret; 1315 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); 1316 1317 switch (jump) { 1318 case JUMP_MAX_UNTIL_2: 1319 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr)); 1320 goto jump_max_until_2; 1321 case JUMP_MAX_UNTIL_3: 1322 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr)); 1323 goto jump_max_until_3; 1324 case JUMP_MIN_UNTIL_2: 1325 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr)); 1326 goto jump_min_until_2; 1327 case JUMP_MIN_UNTIL_3: 1328 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr)); 1329 goto jump_min_until_3; 1330 case JUMP_BRANCH: 1331 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr)); 1332 goto jump_branch; 1333 case JUMP_MAX_UNTIL_1: 1334 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr)); 1335 goto jump_max_until_1; 1336 case JUMP_MIN_UNTIL_1: 1337 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); 1338 goto jump_min_until_1; 1339 case JUMP_REPEAT: 1340 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); 1341 goto jump_repeat; 1342 case JUMP_REPEAT_ONE_1: 1343 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr)); 1344 goto jump_repeat_one_1; 1345 case JUMP_REPEAT_ONE_2: 1346 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr)); 1347 goto jump_repeat_one_2; 1348 case JUMP_MIN_REPEAT_ONE: 1349 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); 1350 goto jump_min_repeat_one; 1351 case JUMP_ASSERT: 1352 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); 1353 goto jump_assert; 1354 case JUMP_ASSERT_NOT: 1355 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr)); 1356 goto jump_assert_not; 1357 case JUMP_NONE: 1358 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern, 1359 ctx->ptr, ret)); 1360 break; 1361 } 1362 1363 return ret; /* should never get here */ 1364 } 1365 1366 /* need to reset capturing groups between two SRE(match) callings in loops */ 1367 #define RESET_CAPTURE_GROUP() \ 1368 do { state->lastmark = state->lastindex = -1; } while (0) 1369 1370 LOCAL(Py_ssize_t) 1371 SRE(search)(SRE_STATE* state, SRE_CODE* pattern) 1372 { 1373 SRE_CHAR* ptr = (SRE_CHAR *)state->start; 1374 SRE_CHAR* end = (SRE_CHAR *)state->end; 1375 Py_ssize_t status = 0; 1376 Py_ssize_t prefix_len = 0; 1377 Py_ssize_t prefix_skip = 0; 1378 SRE_CODE* prefix = NULL; 1379 SRE_CODE* charset = NULL; 1380 SRE_CODE* overlap = NULL; 1381 int flags = 0; 1382 1383 if (ptr > end) 1384 return 0; 1385 1386 if (pattern[0] == SRE_OP_INFO) { 1387 /* optimization info block */ 1388 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ 1389 1390 flags = pattern[2]; 1391 1392 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) { 1393 TRACE(("reject (got %u chars, need %u)\n", 1394 (unsigned int)(end - ptr), pattern[3])); 1395 return 0; 1396 } 1397 if (pattern[3] > 1) { 1398 /* adjust end point (but make sure we leave at least one 1399 character in there, so literal search will work) */ 1400 end -= pattern[3] - 1; 1401 if (end <= ptr) 1402 end = ptr; 1403 } 1404 1405 if (flags & SRE_INFO_PREFIX) { 1406 /* pattern starts with a known prefix */ 1407 /* <length> <skip> <prefix data> <overlap data> */ 1408 prefix_len = pattern[5]; 1409 prefix_skip = pattern[6]; 1410 prefix = pattern + 7; 1411 overlap = prefix + prefix_len - 1; 1412 } else if (flags & SRE_INFO_CHARSET) 1413 /* pattern starts with a character from a known set */ 1414 /* <charset> */ 1415 charset = pattern + 5; 1416 1417 pattern += 1 + pattern[1]; 1418 } 1419 1420 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n", 1421 prefix, prefix_len, prefix_skip)); 1422 TRACE(("charset = %p\n", charset)); 1423 1424 if (prefix_len == 1) { 1425 /* pattern starts with a literal character */ 1426 SRE_CHAR c = (SRE_CHAR) prefix[0]; 1427 #if SIZEOF_SRE_CHAR < 4 1428 if ((SRE_CODE) c != prefix[0]) 1429 return 0; /* literal can't match: doesn't fit in char width */ 1430 #endif 1431 end = (SRE_CHAR *)state->end; 1432 state->must_advance = 0; 1433 while (ptr < end) { 1434 while (*ptr != c) { 1435 if (++ptr >= end) 1436 return 0; 1437 } 1438 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); 1439 state->start = ptr; 1440 state->ptr = ptr + prefix_skip; 1441 if (flags & SRE_INFO_LITERAL) 1442 return 1; /* we got all of it */ 1443 status = SRE(match)(state, pattern + 2*prefix_skip, 0); 1444 if (status != 0) 1445 return status; 1446 ++ptr; 1447 RESET_CAPTURE_GROUP(); 1448 } 1449 return 0; 1450 } 1451 1452 if (prefix_len > 1) { 1453 /* pattern starts with a known prefix. use the overlap 1454 table to skip forward as fast as we possibly can */ 1455 Py_ssize_t i = 0; 1456 1457 end = (SRE_CHAR *)state->end; 1458 if (prefix_len > end - ptr) 1459 return 0; 1460 #if SIZEOF_SRE_CHAR < 4 1461 for (i = 0; i < prefix_len; i++) 1462 if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i]) 1463 return 0; /* literal can't match: doesn't fit in char width */ 1464 #endif 1465 while (ptr < end) { 1466 SRE_CHAR c = (SRE_CHAR) prefix[0]; 1467 while (*ptr++ != c) { 1468 if (ptr >= end) 1469 return 0; 1470 } 1471 if (ptr >= end) 1472 return 0; 1473 1474 i = 1; 1475 state->must_advance = 0; 1476 do { 1477 if (*ptr == (SRE_CHAR) prefix[i]) { 1478 if (++i != prefix_len) { 1479 if (++ptr >= end) 1480 return 0; 1481 continue; 1482 } 1483 /* found a potential match */ 1484 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr)); 1485 state->start = ptr - (prefix_len - 1); 1486 state->ptr = ptr - (prefix_len - prefix_skip - 1); 1487 if (flags & SRE_INFO_LITERAL) 1488 return 1; /* we got all of it */ 1489 status = SRE(match)(state, pattern + 2*prefix_skip, 0); 1490 if (status != 0) 1491 return status; 1492 /* close but no cigar -- try again */ 1493 if (++ptr >= end) 1494 return 0; 1495 RESET_CAPTURE_GROUP(); 1496 } 1497 i = overlap[i]; 1498 } while (i != 0); 1499 } 1500 return 0; 1501 } 1502 1503 if (charset) { 1504 /* pattern starts with a character from a known set */ 1505 end = (SRE_CHAR *)state->end; 1506 state->must_advance = 0; 1507 for (;;) { 1508 while (ptr < end && !SRE(charset)(state, charset, *ptr)) 1509 ptr++; 1510 if (ptr >= end) 1511 return 0; 1512 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr)); 1513 state->start = ptr; 1514 state->ptr = ptr; 1515 status = SRE(match)(state, pattern, 0); 1516 if (status != 0) 1517 break; 1518 ptr++; 1519 RESET_CAPTURE_GROUP(); 1520 } 1521 } else { 1522 /* general case */ 1523 assert(ptr <= end); 1524 TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); 1525 state->start = state->ptr = ptr; 1526 status = SRE(match)(state, pattern, 1); 1527 state->must_advance = 0; 1528 while (status == 0 && ptr < end) { 1529 ptr++; 1530 RESET_CAPTURE_GROUP(); 1531 TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); 1532 state->start = state->ptr = ptr; 1533 status = SRE(match)(state, pattern, 0); 1534 } 1535 } 1536 1537 return status; 1538 } 1539 1540 #undef SRE_CHAR 1541 #undef SIZEOF_SRE_CHAR 1542 #undef SRE 1543 1544 /* vim:ts=4:sw=4:et 1545 */ 1546