1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Copyright (c) 1997-2014 University of Cambridge 10 11 ----------------------------------------------------------------------------- 12 Redistribution and use in source and binary forms, with or without 13 modification, are permitted provided that the following conditions are met: 14 15 * Redistributions of source code must retain the above copyright notice, 16 this list of conditions and the following disclaimer. 17 18 * Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in the 20 documentation and/or other materials provided with the distribution. 21 22 * Neither the name of the University of Cambridge nor the names of its 23 contributors may be used to endorse or promote products derived from 24 this software without specific prior written permission. 25 26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 POSSIBILITY OF SUCH DAMAGE. 
37 ----------------------------------------------------------------------------- 38 */ 39 40 /* This module contains pcre_exec(), the externally visible function that does 41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as 42 possible. There are also some static supporting functions. */ 43 44 #ifdef HAVE_CONFIG_H 45 #include "config.h" 46 #endif 47 48 #define NLBLOCK md /* Block containing newline information */ 49 #define PSSTART start_subject /* Field containing processed string start */ 50 #define PSEND end_subject /* Field containing processed string end */ 51 52 #include "pcre_internal.h" 53 54 /* Undefine some potentially clashing cpp symbols */ 55 56 #undef min 57 #undef max 58 59 /* The md->capture_last field uses the lower 16 bits for the last captured 60 substring (which can never be greater than 65535) and a bit in the top half 61 to mean "capture vector overflowed". This odd way of doing things was 62 implemented when it was realized that preserving and restoring the overflow bit 63 whenever the last capture number was saved/restored made for a neater 64 interface, and doing it this way saved on (a) another variable, which would 65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another 66 separate set of save/restore instructions. The following defines are used in 67 implementing this. */ 68 69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */ 70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */ 71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */ 72 73 /* Values for setting in md->match_function_type to indicate two special types 74 of call to match(). We do it this way to save on using another stack variable, 75 as stack usage is to be discouraged. */ 76 77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */ 78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */ 79 80 /* Non-error returns from the match() function. 
Error returns are externally 81 defined PCRE_ERROR_xxx codes, which are all negative. */ 82 83 #define MATCH_MATCH 1 84 #define MATCH_NOMATCH 0 85 86 /* Special internal returns from the match() function. Make them sufficiently 87 negative to avoid the external error codes. */ 88 89 #define MATCH_ACCEPT (-999) 90 #define MATCH_KETRPOS (-998) 91 #define MATCH_ONCE (-997) 92 /* The next 5 must be kept together and in sequence so that a test that checks 93 for any one of them can use a range. */ 94 #define MATCH_COMMIT (-996) 95 #define MATCH_PRUNE (-995) 96 #define MATCH_SKIP (-994) 97 #define MATCH_SKIP_ARG (-993) 98 #define MATCH_THEN (-992) 99 #define MATCH_BACKTRACK_MAX MATCH_THEN 100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT 101 102 /* Maximum number of ints of offset to save on the stack for recursive calls. 103 If the offset vector is bigger, malloc is used. This should be a multiple of 3, 104 because the offset vector is always a multiple of 3 long. */ 105 106 #define REC_STACK_SAVE_MAX 30 107 108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */ 109 110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; 111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; 112 113 #ifdef PCRE_DEBUG 114 /************************************************* 115 * Debugging function to print chars * 116 *************************************************/ 117 118 /* Print a sequence of chars in printable format, stopping at the end of the 119 subject if the requested. 
120 121 Arguments: 122 p points to characters 123 length number to print 124 is_subject TRUE if printing from within md->start_subject 125 md pointer to matching data block, if is_subject is TRUE 126 127 Returns: nothing 128 */ 129 130 static void 131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) 132 { 133 pcre_uint32 c; 134 BOOL utf = md->utf; 135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; 136 while (length-- > 0) 137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); 138 } 139 #endif 140 141 142 143 /************************************************* 144 * Match a back-reference * 145 *************************************************/ 146 147 /* Normally, if a back reference hasn't been set, the length that is passed is 148 negative, so the match always fails. However, in JavaScript compatibility mode, 149 the length passed is zero. Note that in caseless UTF-8 mode, the number of 150 subject bytes matched may be different to the number of reference bytes. 
151 152 Arguments: 153 offset index into the offset vector 154 eptr pointer into the subject 155 length length of reference to be matched (number of bytes) 156 md points to match data block 157 caseless TRUE if caseless 158 159 Returns: >= 0 the number of subject bytes matched 160 -1 no match 161 -2 partial match; always given if at end subject 162 */ 163 164 static int 165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, 166 BOOL caseless) 167 { 168 PCRE_PUCHAR eptr_start = eptr; 169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; 170 #if defined SUPPORT_UTF && defined SUPPORT_UCP 171 BOOL utf = md->utf; 172 #endif 173 174 #ifdef PCRE_DEBUG 175 if (eptr >= md->end_subject) 176 printf("matching subject <null>"); 177 else 178 { 179 printf("matching subject "); 180 pchars(eptr, length, TRUE, md); 181 } 182 printf(" against backref "); 183 pchars(p, length, FALSE, md); 184 printf("\n"); 185 #endif 186 187 /* Always fail if reference not set (and not JavaScript compatible - in that 188 case the length is passed as zero). */ 189 190 if (length < 0) return -1; 191 192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this 193 properly if Unicode properties are supported. Otherwise, we can check only 194 ASCII characters. */ 195 196 if (caseless) 197 { 198 #if defined SUPPORT_UTF && defined SUPPORT_UCP 199 if (utf) 200 { 201 /* Match characters up to the end of the reference. NOTE: the number of 202 data units matched may differ, because in UTF-8 there are some characters 203 whose upper and lower case versions code have different numbers of bytes. 204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a 206 sequence of two of the latter. It is important, therefore, to check the 207 length along the reference, not along the subject (earlier code did this 208 wrong). 
*/ 209 210 PCRE_PUCHAR endptr = p + length; 211 while (p < endptr) 212 { 213 pcre_uint32 c, d; 214 const ucd_record *ur; 215 if (eptr >= md->end_subject) return -2; /* Partial match */ 216 GETCHARINC(c, eptr); 217 GETCHARINC(d, p); 218 ur = GET_UCD(d); 219 if (c != d && c != d + ur->other_case) 220 { 221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; 222 for (;;) 223 { 224 if (c < *pp) return -1; 225 if (c == *pp++) break; 226 } 227 } 228 } 229 } 230 else 231 #endif 232 233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there 234 is no UCP support. */ 235 { 236 while (length-- > 0) 237 { 238 pcre_uint32 cc, cp; 239 if (eptr >= md->end_subject) return -2; /* Partial match */ 240 cc = UCHAR21TEST(eptr); 241 cp = UCHAR21TEST(p); 242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; 243 p++; 244 eptr++; 245 } 246 } 247 } 248 249 /* In the caseful case, we can just compare the bytes, whether or not we 250 are in UTF-8 mode. */ 251 252 else 253 { 254 while (length-- > 0) 255 { 256 if (eptr >= md->end_subject) return -2; /* Partial match */ 257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; 258 } 259 } 260 261 return (int)(eptr - eptr_start); 262 } 263 264 265 266 /*************************************************************************** 267 **************************************************************************** 268 RECURSION IN THE match() FUNCTION 269 270 The match() function is highly recursive, though not every recursive call 271 increases the recursive depth. Nevertheless, some regular expressions can cause 272 it to recurse to a great depth. I was writing for Unix, so I just let it call 273 itself recursively. This uses the stack for saving everything that has to be 274 saved for a recursive call. On Unix, the stack can be large, and this works 275 fine. 276 277 It turns out that on some non-Unix-like systems there are problems with 278 programs that use a lot of stack. 
(This despite the fact that every last chip
has oodles of memory these days, and techniques for extending the stack have
been known for decades.) So....

There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to
achieve this so that the actual code doesn't look very different to what it
always used to.

The original heap-recursive code used longjmp(). However, it seems that this
can be very slow on some operating systems. Following a suggestion from Stan
Switzer, the use of longjmp() has been abolished, at the cost of having to
provide a unique number for each call to RMATCH. There is no way of generating
a sequence of numbers at compile time in C. I have given them names, to make
them stand out more clearly.

Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
tests. Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/

/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync.
*/ 306 307 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, 308 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, 309 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, 310 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, 311 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, 312 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, 313 RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; 314 315 /* These versions of the macros use the stack, as normal. There are debugging 316 versions and production versions. Note that the "rw" argument of RMATCH isn't 317 actually used in this definition. */ 318 319 #ifndef NO_RECURSE 320 #define REGISTER register 321 322 #ifdef PCRE_DEBUG 323 #define RMATCH(ra,rb,rc,rd,re,rw) \ 324 { \ 325 printf("match() called in line %d\n", __LINE__); \ 326 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \ 327 printf("to line %d\n", __LINE__); \ 328 } 329 #define RRETURN(ra) \ 330 { \ 331 printf("match() returned %d from line %d\n", ra, __LINE__); \ 332 return ra; \ 333 } 334 #else 335 #define RMATCH(ra,rb,rc,rd,re,rw) \ 336 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1) 337 #define RRETURN(ra) return ra 338 #endif 339 340 #else 341 342 343 /* These versions of the macros manage a private stack on the heap. Note that 344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md 345 argument of match(), which never changes. 
*/ 346 347 #define REGISTER 348 349 #define RMATCH(ra,rb,rc,rd,re,rw)\ 350 {\ 351 heapframe *newframe = frame->Xnextframe;\ 352 if (newframe == NULL)\ 353 {\ 354 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\ 355 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ 356 newframe->Xnextframe = NULL;\ 357 frame->Xnextframe = newframe;\ 358 }\ 359 frame->Xwhere = rw;\ 360 newframe->Xeptr = ra;\ 361 newframe->Xecode = rb;\ 362 newframe->Xmstart = mstart;\ 363 newframe->Xoffset_top = rc;\ 364 newframe->Xeptrb = re;\ 365 newframe->Xrdepth = frame->Xrdepth + 1;\ 366 newframe->Xprevframe = frame;\ 367 frame = newframe;\ 368 DPRINTF(("restarting from line %d\n", __LINE__));\ 369 goto HEAP_RECURSE;\ 370 L_##rw:\ 371 DPRINTF(("jumped back to line %d\n", __LINE__));\ 372 } 373 374 #define RRETURN(ra)\ 375 {\ 376 heapframe *oldframe = frame;\ 377 frame = oldframe->Xprevframe;\ 378 if (frame != NULL)\ 379 {\ 380 rrc = ra;\ 381 goto HEAP_RETURN;\ 382 }\ 383 return ra;\ 384 } 385 386 387 /* Structure for remembering the local variables in a private frame */ 388 389 typedef struct heapframe { 390 struct heapframe *Xprevframe; 391 struct heapframe *Xnextframe; 392 393 /* Function arguments that may change */ 394 395 PCRE_PUCHAR Xeptr; 396 const pcre_uchar *Xecode; 397 PCRE_PUCHAR Xmstart; 398 int Xoffset_top; 399 eptrblock *Xeptrb; 400 unsigned int Xrdepth; 401 402 /* Function local variables */ 403 404 PCRE_PUCHAR Xcallpat; 405 #ifdef SUPPORT_UTF 406 PCRE_PUCHAR Xcharptr; 407 #endif 408 PCRE_PUCHAR Xdata; 409 PCRE_PUCHAR Xnext; 410 PCRE_PUCHAR Xpp; 411 PCRE_PUCHAR Xprev; 412 PCRE_PUCHAR Xsaved_eptr; 413 414 recursion_info Xnew_recursive; 415 416 BOOL Xcur_is_word; 417 BOOL Xcondition; 418 BOOL Xprev_is_word; 419 420 #ifdef SUPPORT_UCP 421 int Xprop_type; 422 unsigned int Xprop_value; 423 int Xprop_fail_result; 424 int Xoclength; 425 pcre_uchar Xocchars[6]; 426 #endif 427 428 int Xcodelink; 429 int Xctype; 430 unsigned int Xfc; 431 int Xfi; 432 int Xlength; 433 int 
Xmax; 434 int Xmin; 435 unsigned int Xnumber; 436 int Xoffset; 437 unsigned int Xop; 438 pcre_int32 Xsave_capture_last; 439 int Xsave_offset1, Xsave_offset2, Xsave_offset3; 440 int Xstacksave[REC_STACK_SAVE_MAX]; 441 442 eptrblock Xnewptrb; 443 444 /* Where to jump back to */ 445 446 int Xwhere; 447 448 } heapframe; 449 450 #endif 451 452 453 /*************************************************************************** 454 ***************************************************************************/ 455 456 457 458 /************************************************* 459 * Match from current position * 460 *************************************************/ 461 462 /* This function is called recursively in many circumstances. Whenever it 463 returns a negative (error) response, the outer incarnation must also return the 464 same response. */ 465 466 /* These macros pack up tests that are used for partial matching, and which 467 appear several times in the code. We set the "hit end" flag if the pointer is 468 at the end of the subject and also past the start of the subject (i.e. 469 something has been matched). For hard partial matching, we then return 470 immediately. The second one is used when we already know we are past the end of 471 the subject. */ 472 473 #define CHECK_PARTIAL()\ 474 if (md->partial != 0 && eptr >= md->end_subject && \ 475 eptr > md->start_used_ptr) \ 476 { \ 477 md->hitend = TRUE; \ 478 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \ 479 } 480 481 #define SCHECK_PARTIAL()\ 482 if (md->partial != 0 && eptr > md->start_used_ptr) \ 483 { \ 484 md->hitend = TRUE; \ 485 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \ 486 } 487 488 489 /* Performance note: It might be tempting to extract commonly used fields from 490 the md structure (e.g. utf, end_subject) into individual variables to improve 491 performance. Tests using gcc on a SPARC disproved this; in the first case, it 492 made performance worse. 
493 494 Arguments: 495 eptr pointer to current character in subject 496 ecode pointer to current position in compiled code 497 mstart pointer to the current match start position (can be modified 498 by encountering \K) 499 offset_top current top pointer 500 md pointer to "static" info for the match 501 eptrb pointer to chain of blocks containing eptr at start of 502 brackets - for testing for empty matches 503 rdepth the recursion depth 504 505 Returns: MATCH_MATCH if matched ) these values are >= 0 506 MATCH_NOMATCH if failed to match ) 507 a negative MATCH_xxx value for PRUNE, SKIP, etc 508 a negative PCRE_ERROR_xxx value if aborted by an error condition 509 (e.g. stopped by repeated call or recursion limit) 510 */ 511 512 static int 513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode, 514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb, 515 unsigned int rdepth) 516 { 517 /* These variables do not need to be preserved over recursion in this function, 518 so they can be ordinary variables in all cases. Mark some of them with 519 "register" because they are used a lot in loops. */ 520 521 register int rrc; /* Returns from recursive calls */ 522 register int i; /* Used for loops not involving calls to RMATCH() */ 523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */ 524 register BOOL utf; /* Local copy of UTF flag for speed */ 525 526 BOOL minimize, possessive; /* Quantifier options */ 527 BOOL caseless; 528 int condcode; 529 530 /* When recursion is not being used, all "local" variables that have to be 531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level 532 frame on the stack here; subsequent instantiations are obtained from the heap 533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting 534 the top-level on the stack rather than malloc-ing them all gives a performance 535 boost in many cases where there is not much "recursion". 
*/ 536 537 #ifdef NO_RECURSE 538 heapframe *frame = (heapframe *)md->match_frames_base; 539 540 /* Copy in the original argument variables */ 541 542 frame->Xeptr = eptr; 543 frame->Xecode = ecode; 544 frame->Xmstart = mstart; 545 frame->Xoffset_top = offset_top; 546 frame->Xeptrb = eptrb; 547 frame->Xrdepth = rdepth; 548 549 /* This is where control jumps back to to effect "recursion" */ 550 551 HEAP_RECURSE: 552 553 /* Macros make the argument variables come from the current frame */ 554 555 #define eptr frame->Xeptr 556 #define ecode frame->Xecode 557 #define mstart frame->Xmstart 558 #define offset_top frame->Xoffset_top 559 #define eptrb frame->Xeptrb 560 #define rdepth frame->Xrdepth 561 562 /* Ditto for the local variables */ 563 564 #ifdef SUPPORT_UTF 565 #define charptr frame->Xcharptr 566 #endif 567 #define callpat frame->Xcallpat 568 #define codelink frame->Xcodelink 569 #define data frame->Xdata 570 #define next frame->Xnext 571 #define pp frame->Xpp 572 #define prev frame->Xprev 573 #define saved_eptr frame->Xsaved_eptr 574 575 #define new_recursive frame->Xnew_recursive 576 577 #define cur_is_word frame->Xcur_is_word 578 #define condition frame->Xcondition 579 #define prev_is_word frame->Xprev_is_word 580 581 #ifdef SUPPORT_UCP 582 #define prop_type frame->Xprop_type 583 #define prop_value frame->Xprop_value 584 #define prop_fail_result frame->Xprop_fail_result 585 #define oclength frame->Xoclength 586 #define occhars frame->Xocchars 587 #endif 588 589 #define ctype frame->Xctype 590 #define fc frame->Xfc 591 #define fi frame->Xfi 592 #define length frame->Xlength 593 #define max frame->Xmax 594 #define min frame->Xmin 595 #define number frame->Xnumber 596 #define offset frame->Xoffset 597 #define op frame->Xop 598 #define save_capture_last frame->Xsave_capture_last 599 #define save_offset1 frame->Xsave_offset1 600 #define save_offset2 frame->Xsave_offset2 601 #define save_offset3 frame->Xsave_offset3 602 #define stacksave frame->Xstacksave 603 604 
#define newptrb frame->Xnewptrb 605 606 /* When recursion is being used, local variables are allocated on the stack and 607 get preserved during recursion in the normal way. In this environment, fi and 608 i, and fc and c, can be the same variables. */ 609 610 #else /* NO_RECURSE not defined */ 611 #define fi i 612 #define fc c 613 614 /* Many of the following variables are used only in small blocks of the code. 615 My normal style of coding would have declared them within each of those blocks. 616 However, in order to accommodate the version of this code that uses an external 617 "stack" implemented on the heap, it is easier to declare them all here, so the 618 declarations can be cut out in a block. The only declarations within blocks 619 below are for variables that do not have to be preserved over a recursive call 620 to RMATCH(). */ 621 622 #ifdef SUPPORT_UTF 623 const pcre_uchar *charptr; 624 #endif 625 const pcre_uchar *callpat; 626 const pcre_uchar *data; 627 const pcre_uchar *next; 628 PCRE_PUCHAR pp; 629 const pcre_uchar *prev; 630 PCRE_PUCHAR saved_eptr; 631 632 recursion_info new_recursive; 633 634 BOOL cur_is_word; 635 BOOL condition; 636 BOOL prev_is_word; 637 638 #ifdef SUPPORT_UCP 639 int prop_type; 640 unsigned int prop_value; 641 int prop_fail_result; 642 int oclength; 643 pcre_uchar occhars[6]; 644 #endif 645 646 int codelink; 647 int ctype; 648 int length; 649 int max; 650 int min; 651 unsigned int number; 652 int offset; 653 unsigned int op; 654 pcre_int32 save_capture_last; 655 int save_offset1, save_offset2, save_offset3; 656 int stacksave[REC_STACK_SAVE_MAX]; 657 658 eptrblock newptrb; 659 660 /* There is a special fudge for calling match() in a way that causes it to 661 measure the size of its basic stack frame when the stack is being used for 662 recursion. The second argument (ecode) being NULL triggers this behaviour. It 663 cannot normally ever be NULL. The return is the negated value of the frame 664 size. 
*/ 665 666 if (ecode == NULL) 667 { 668 if (rdepth == 0) 669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1); 670 else 671 { 672 int len = (char *)&rdepth - (char *)eptr; 673 return (len > 0)? -len : len; 674 } 675 } 676 #endif /* NO_RECURSE */ 677 678 /* To save space on the stack and in the heap frame, I have doubled up on some 679 of the local variables that are used only in localised parts of the code, but 680 still need to be preserved over recursive calls of match(). These macros define 681 the alternative names that are used. */ 682 683 #define allow_zero cur_is_word 684 #define cbegroup condition 685 #define code_offset codelink 686 #define condassert condition 687 #define matched_once prev_is_word 688 #define foc number 689 #define save_mark data 690 691 /* These statements are here to stop the compiler complaining about unitialized 692 variables. */ 693 694 #ifdef SUPPORT_UCP 695 prop_value = 0; 696 prop_fail_result = 0; 697 #endif 698 699 700 /* This label is used for tail recursion, which is used in a few cases even 701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is 702 used. Thanks to Ian Taylor for noticing this possibility and sending the 703 original patch. */ 704 705 TAIL_RECURSE: 706 707 /* OK, now we can get on with the real code of the function. Recursive calls 708 are specified by the macro RMATCH and RRETURN is used to return. When 709 NO_RECURSE is *not* defined, these just turn into a recursive call to match() 710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is 711 defined). However, RMATCH isn't like a function call because it's quite a 712 complicated macro. It has to be used in one particular way. This shouldn't, 713 however, impact performance when true recursion is being used. 
*/ 714 715 #ifdef SUPPORT_UTF 716 utf = md->utf; /* Local copy of the flag */ 717 #else 718 utf = FALSE; 719 #endif 720 721 /* First check that we haven't called match() too many times, or that we 722 haven't exceeded the recursive call limit. */ 723 724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); 726 727 /* At the start of a group with an unlimited repeat that may match an empty 728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is 729 done this way to save having to use another function argument, which would take 730 up space on the stack. See also MATCH_CONDASSERT below. 731 732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of 733 such remembered pointers, to be checked when we hit the closing ket, in order 734 to break infinite loops that match no characters. When match() is called in 735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must 736 NOT be used with tail recursion, because the memory block that is used is on 737 the stack, so a new one may be required for each match(). */ 738 739 if (md->match_function_type == MATCH_CBEGROUP) 740 { 741 newptrb.epb_saved_eptr = eptr; 742 newptrb.epb_prev = eptrb; 743 eptrb = &newptrb; 744 md->match_function_type = 0; 745 } 746 747 /* Now start processing the opcodes. 
*/ 748 749 for (;;) 750 { 751 minimize = possessive = FALSE; 752 op = *ecode; 753 754 switch(op) 755 { 756 case OP_MARK: 757 md->nomatch_mark = ecode + 2; 758 md->mark = NULL; /* In case previously set by assertion */ 759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 760 eptrb, RM55); 761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 762 md->mark == NULL) md->mark = ecode + 2; 763 764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an 765 argument, and we must check whether that argument matches this MARK's 766 argument. It is passed back in md->start_match_ptr (an overloading of that 767 variable). If it does match, we reset that variable to the current subject 768 position and return MATCH_SKIP. Otherwise, pass back the return code 769 unaltered. */ 770 771 else if (rrc == MATCH_SKIP_ARG && 772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0) 773 { 774 md->start_match_ptr = eptr; 775 RRETURN(MATCH_SKIP); 776 } 777 RRETURN(rrc); 778 779 case OP_FAIL: 780 RRETURN(MATCH_NOMATCH); 781 782 case OP_COMMIT: 783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 784 eptrb, RM52); 785 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 786 RRETURN(MATCH_COMMIT); 787 788 case OP_PRUNE: 789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 790 eptrb, RM51); 791 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 792 RRETURN(MATCH_PRUNE); 793 794 case OP_PRUNE_ARG: 795 md->nomatch_mark = ecode + 2; 796 md->mark = NULL; /* In case previously set by assertion */ 797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 798 eptrb, RM56); 799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 800 md->mark == NULL) md->mark = ecode + 2; 801 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 802 RRETURN(MATCH_PRUNE); 803 804 case OP_SKIP: 805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 806 eptrb, RM53); 807 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 808 md->start_match_ptr = eptr; /* Pass back 
current position */ 809 RRETURN(MATCH_SKIP); 810 811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set 812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was 813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG 814 that failed and any that precede it (either they also failed, or were not 815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a 816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg 817 set to the count of the one that failed. */ 818 819 case OP_SKIP_ARG: 820 md->skip_arg_count++; 821 if (md->skip_arg_count <= md->ignore_skip_arg) 822 { 823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; 824 break; 825 } 826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 827 eptrb, RM57); 828 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 829 830 /* Pass back the current skip name by overloading md->start_match_ptr and 831 returning the special MATCH_SKIP_ARG return code. This will either be 832 caught by a matching MARK, or get to the top, where it causes a rematch 833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */ 834 835 md->start_match_ptr = ecode + 2; 836 RRETURN(MATCH_SKIP_ARG); 837 838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that 839 the branch in which it occurs can be determined. Overload the start of 840 match pointer to do this. 
*/ 841 842 case OP_THEN: 843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 844 eptrb, RM54); 845 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 846 md->start_match_ptr = ecode; 847 RRETURN(MATCH_THEN); 848 849 case OP_THEN_ARG: 850 md->nomatch_mark = ecode + 2; 851 md->mark = NULL; /* In case previously set by assertion */ 852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, 853 md, eptrb, RM58); 854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 855 md->mark == NULL) md->mark = ecode + 2; 856 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 857 md->start_match_ptr = ecode; 858 RRETURN(MATCH_THEN); 859 860 /* Handle an atomic group that does not contain any capturing parentheses. 861 This can be handled like an assertion. Prior to 8.13, all atomic groups 862 were handled this way. In 8.13, the code was changed as below for ONCE, so 863 that backups pass through the group and thereby reset captured values. 864 However, this uses a lot more stack, so in 8.20, atomic groups that do not 865 contain any captures generate OP_ONCE_NC, which can be handled in the old, 866 less stack intensive way. 867 868 Check the alternative branches in turn - the matching won't pass the KET 869 for this kind of subpattern. If any one branch matches, we carry on as at 870 the end of a normal bracket, leaving the subject pointer, but resetting 871 the start-of-match value in case it was changed by \K. 
*/ 872 873 case OP_ONCE_NC: 874 prev = ecode; 875 saved_eptr = eptr; 876 save_mark = md->mark; 877 do 878 { 879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64); 880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ 881 { 882 mstart = md->start_match_ptr; 883 break; 884 } 885 if (rrc == MATCH_THEN) 886 { 887 next = ecode + GET(ecode,1); 888 if (md->start_match_ptr < next && 889 (*ecode == OP_ALT || *next == OP_ALT)) 890 rrc = MATCH_NOMATCH; 891 } 892 893 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 894 ecode += GET(ecode,1); 895 md->mark = save_mark; 896 } 897 while (*ecode == OP_ALT); 898 899 /* If hit the end of the group (which could be repeated), fail */ 900 901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 902 903 /* Continue as from after the group, updating the offsets high water 904 mark, since extracts may have been taken. */ 905 906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 907 908 offset_top = md->end_offset_top; 909 eptr = md->end_match_ptr; 910 911 /* For a non-repeating ket, just continue at this level. This also 912 happens for a repeating ket if no characters were matched in the group. 913 This is the forcible breaking of infinite loops as implemented in Perl 914 5.005. */ 915 916 if (*ecode == OP_KET || eptr == saved_eptr) 917 { 918 ecode += 1+LINK_SIZE; 919 break; 920 } 921 922 /* The repeating kets try the rest of the pattern or restart from the 923 preceding bracket, in the appropriate order. The second "call" of match() 924 uses tail recursion, to avoid using another stack frame. 
*/ 925 926 if (*ecode == OP_KETRMIN) 927 { 928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65); 929 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 930 ecode = prev; 931 goto TAIL_RECURSE; 932 } 933 else /* OP_KETRMAX */ 934 { 935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66); 936 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 937 ecode += 1 + LINK_SIZE; 938 goto TAIL_RECURSE; 939 } 940 /* Control never gets here */ 941 942 /* Handle a capturing bracket, other than those that are possessive with an 943 unlimited repeat. If there is space in the offset vector, save the current 944 subject position in the working slot at the top of the vector. We mustn't 945 change the current values of the data slot, because they may be set from a 946 previous iteration of this group, and be referred to by a reference inside 947 the group. A failure to match might occur after the group has succeeded, 948 if something later on doesn't match. For this reason, we need to restore 949 the working value and also the values of the final offsets, in case they 950 were set by a previous iteration of the same bracket. 951 952 If there isn't enough space in the offset vector, treat this as if it were 953 a non-capturing bracket. Don't worry about setting the flag for the error 954 case here; that is handled in the code for KET. 
*/ 955 956 case OP_CBRA: 957 case OP_SCBRA: 958 number = GET2(ecode, 1+LINK_SIZE); 959 offset = number << 1; 960 961 #ifdef PCRE_DEBUG 962 printf("start bracket %d\n", number); 963 printf("subject="); 964 pchars(eptr, 16, TRUE, md); 965 printf("\n"); 966 #endif 967 968 if (offset < md->offset_max) 969 { 970 save_offset1 = md->offset_vector[offset]; 971 save_offset2 = md->offset_vector[offset+1]; 972 save_offset3 = md->offset_vector[md->offset_end - number]; 973 save_capture_last = md->capture_last; 974 save_mark = md->mark; 975 976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 977 md->offset_vector[md->offset_end - number] = 978 (int)(eptr - md->start_subject); 979 980 for (;;) 981 { 982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 984 eptrb, RM1); 985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ 986 987 /* If we backed up to a THEN, check whether it is within the current 988 branch by comparing the address of the THEN that is passed back with 989 the end of the branch. If it is within the current branch, and the 990 branch is one of two or more alternatives (it either starts or ends 991 with OP_ALT), we have reached the limit of THEN's action, so convert 992 the return code to NOMATCH, which will cause normal backtracking to 993 happen from now on. Otherwise, THEN is passed back to an outer 994 alternative. This implements Perl's treatment of parenthesized groups, 995 where a group not containing | does not affect the current alternative, 996 that is, (X) is NOT the same as (X|(*F)). */ 997 998 if (rrc == MATCH_THEN) 999 { 1000 next = ecode + GET(ecode,1); 1001 if (md->start_match_ptr < next && 1002 (*ecode == OP_ALT || *next == OP_ALT)) 1003 rrc = MATCH_NOMATCH; 1004 } 1005 1006 /* Anything other than NOMATCH is passed back. 
*/ 1007 1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1009 md->capture_last = save_capture_last; 1010 ecode += GET(ecode, 1); 1011 md->mark = save_mark; 1012 if (*ecode != OP_ALT) break; 1013 } 1014 1015 DPRINTF(("bracket %d failed\n", number)); 1016 md->offset_vector[offset] = save_offset1; 1017 md->offset_vector[offset+1] = save_offset2; 1018 md->offset_vector[md->offset_end - number] = save_offset3; 1019 1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */ 1021 1022 RRETURN(rrc); 1023 } 1024 1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1026 as a non-capturing bracket. */ 1027 1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1030 1031 DPRINTF(("insufficient capture room: treat as non-capturing\n")); 1032 1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1035 1036 /* Non-capturing or atomic group, except for possessive with unlimited 1037 repeat and ONCE group with no captures. Loop for all the alternatives. 1038 1039 When we get to the final alternative within the brackets, we used to return 1040 the result of a recursive call to match() whatever happened so it was 1041 possible to reduce stack usage by turning this into a tail recursion, 1042 except in the case of a possibly empty group. However, now that there is 1043 the possiblity of (*THEN) occurring in the final alternative, this 1044 optimization is no longer always possible. 1045 1046 We can optimize if we know there are no (*THEN)s in the pattern; at present 1047 this is the best that can be done. 1048 1049 MATCH_ONCE is returned when the end of an atomic group is successfully 1050 reached, but subsequent matching fails. It passes back up the tree (causing 1051 captured values to be reset) until the original atomic group level is 1052 reached. This is tested by comparing md->once_target with the start of the 1053 group. 
At this point, the return is converted into MATCH_NOMATCH so that 1054 previous backup points can be taken. */ 1055 1056 case OP_ONCE: 1057 case OP_BRA: 1058 case OP_SBRA: 1059 DPRINTF(("start non-capturing bracket\n")); 1060 1061 for (;;) 1062 { 1063 if (op >= OP_SBRA || op == OP_ONCE) 1064 md->match_function_type = MATCH_CBEGROUP; 1065 1066 /* If this is not a possibly empty group, and there are no (*THEN)s in 1067 the pattern, and this is the final alternative, optimize as described 1068 above. */ 1069 1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT) 1071 { 1072 ecode += PRIV(OP_lengths)[*ecode]; 1073 goto TAIL_RECURSE; 1074 } 1075 1076 /* In all other cases, we have to make another call to match(). */ 1077 1078 save_mark = md->mark; 1079 save_capture_last = md->capture_last; 1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, 1081 RM2); 1082 1083 /* See comment in the code for capturing groups above about handling 1084 THEN. */ 1085 1086 if (rrc == MATCH_THEN) 1087 { 1088 next = ecode + GET(ecode,1); 1089 if (md->start_match_ptr < next && 1090 (*ecode == OP_ALT || *next == OP_ALT)) 1091 rrc = MATCH_NOMATCH; 1092 } 1093 1094 if (rrc != MATCH_NOMATCH) 1095 { 1096 if (rrc == MATCH_ONCE) 1097 { 1098 const pcre_uchar *scode = ecode; 1099 if (*scode != OP_ONCE) /* If not at start, find it */ 1100 { 1101 while (*scode == OP_ALT) scode += GET(scode, 1); 1102 scode -= GET(scode, 1); 1103 } 1104 if (md->once_target == scode) rrc = MATCH_NOMATCH; 1105 } 1106 RRETURN(rrc); 1107 } 1108 ecode += GET(ecode, 1); 1109 md->mark = save_mark; 1110 if (*ecode != OP_ALT) break; 1111 md->capture_last = save_capture_last; 1112 } 1113 1114 RRETURN(MATCH_NOMATCH); 1115 1116 /* Handle possessive capturing brackets with an unlimited repeat. We come 1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are 1118 handled similarly to the normal case above. However, the matching is 1119 different. 
The end of these brackets will always be OP_KETRPOS, which 1120 returns MATCH_KETRPOS without going further in the pattern. By this means 1121 we can handle the group by iteration rather than recursion, thereby 1122 reducing the amount of stack needed. */ 1123 1124 case OP_CBRAPOS: 1125 case OP_SCBRAPOS: 1126 allow_zero = FALSE; 1127 1128 POSSESSIVE_CAPTURE: 1129 number = GET2(ecode, 1+LINK_SIZE); 1130 offset = number << 1; 1131 1132 #ifdef PCRE_DEBUG 1133 printf("start possessive bracket %d\n", number); 1134 printf("subject="); 1135 pchars(eptr, 16, TRUE, md); 1136 printf("\n"); 1137 #endif 1138 1139 if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE; 1140 1141 matched_once = FALSE; 1142 code_offset = (int)(ecode - md->start_code); 1143 1144 save_offset1 = md->offset_vector[offset]; 1145 save_offset2 = md->offset_vector[offset+1]; 1146 save_offset3 = md->offset_vector[md->offset_end - number]; 1147 save_capture_last = md->capture_last; 1148 1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 1150 1151 /* Each time round the loop, save the current subject position for use 1152 when the group matches. For MATCH_MATCH, the group has matched, so we 1153 restart it with a new subject starting position, remembering that we had 1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as 1155 usual. If we haven't matched any alternatives in any iteration, check to 1156 see if a previous iteration matched. If so, the group has matched; 1157 continue from afterwards. Otherwise it has failed; restore the previous 1158 capture values before returning NOMATCH. 
*/ 1159 1160 for (;;) 1161 { 1162 md->offset_vector[md->offset_end - number] = 1163 (int)(eptr - md->start_subject); 1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1166 eptrb, RM63); 1167 if (rrc == MATCH_KETRPOS) 1168 { 1169 offset_top = md->end_offset_top; 1170 ecode = md->start_code + code_offset; 1171 save_capture_last = md->capture_last; 1172 matched_once = TRUE; 1173 mstart = md->start_match_ptr; /* In case \K changed it */ 1174 if (eptr == md->end_match_ptr) /* Matched an empty string */ 1175 { 1176 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1177 break; 1178 } 1179 eptr = md->end_match_ptr; 1180 continue; 1181 } 1182 1183 /* See comment in the code for capturing groups above about handling 1184 THEN. */ 1185 1186 if (rrc == MATCH_THEN) 1187 { 1188 next = ecode + GET(ecode,1); 1189 if (md->start_match_ptr < next && 1190 (*ecode == OP_ALT || *next == OP_ALT)) 1191 rrc = MATCH_NOMATCH; 1192 } 1193 1194 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1195 md->capture_last = save_capture_last; 1196 ecode += GET(ecode, 1); 1197 if (*ecode != OP_ALT) break; 1198 } 1199 1200 if (!matched_once) 1201 { 1202 md->offset_vector[offset] = save_offset1; 1203 md->offset_vector[offset+1] = save_offset2; 1204 md->offset_vector[md->offset_end - number] = save_offset3; 1205 } 1206 1207 if (allow_zero || matched_once) 1208 { 1209 ecode += 1 + LINK_SIZE; 1210 break; 1211 } 1212 1213 RRETURN(MATCH_NOMATCH); 1214 1215 /* Non-capturing possessive bracket with unlimited repeat. We come here 1216 from BRAZERO with allow_zero = TRUE. The code is similar to the above, 1217 without the capturing complication. It is written out separately for speed 1218 and cleanliness. 
*/ 1219 1220 case OP_BRAPOS: 1221 case OP_SBRAPOS: 1222 allow_zero = FALSE; 1223 1224 POSSESSIVE_NON_CAPTURE: 1225 matched_once = FALSE; 1226 code_offset = (int)(ecode - md->start_code); 1227 save_capture_last = md->capture_last; 1228 1229 for (;;) 1230 { 1231 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1232 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1233 eptrb, RM48); 1234 if (rrc == MATCH_KETRPOS) 1235 { 1236 offset_top = md->end_offset_top; 1237 ecode = md->start_code + code_offset; 1238 matched_once = TRUE; 1239 mstart = md->start_match_ptr; /* In case \K reset it */ 1240 if (eptr == md->end_match_ptr) /* Matched an empty string */ 1241 { 1242 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1243 break; 1244 } 1245 eptr = md->end_match_ptr; 1246 continue; 1247 } 1248 1249 /* See comment in the code for capturing groups above about handling 1250 THEN. */ 1251 1252 if (rrc == MATCH_THEN) 1253 { 1254 next = ecode + GET(ecode,1); 1255 if (md->start_match_ptr < next && 1256 (*ecode == OP_ALT || *next == OP_ALT)) 1257 rrc = MATCH_NOMATCH; 1258 } 1259 1260 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1261 ecode += GET(ecode, 1); 1262 if (*ecode != OP_ALT) break; 1263 md->capture_last = save_capture_last; 1264 } 1265 1266 if (matched_once || allow_zero) 1267 { 1268 ecode += 1 + LINK_SIZE; 1269 break; 1270 } 1271 RRETURN(MATCH_NOMATCH); 1272 1273 /* Control never reaches here. */ 1274 1275 /* Conditional group: compilation checked that there are no more than two 1276 branches. If the condition is false, skipping the first branch takes us 1277 past the end of the item if there is only one branch, but that's exactly 1278 what we want. */ 1279 1280 case OP_COND: 1281 case OP_SCOND: 1282 1283 /* The variable codelink will be added to ecode when the condition is 1284 false, to get to the second branch. Setting it to the offset to the ALT 1285 or KET, then incrementing ecode achieves this effect. 
We now have ecode 1286 pointing to the condition or callout. */ 1287 1288 codelink = GET(ecode, 1); /* Offset to the second branch */ 1289 ecode += 1 + LINK_SIZE; /* From this opcode */ 1290 1291 /* Because of the way auto-callout works during compile, a callout item is 1292 inserted between OP_COND and an assertion condition. */ 1293 1294 if (*ecode == OP_CALLOUT) 1295 { 1296 if (PUBL(callout) != NULL) 1297 { 1298 PUBL(callout_block) cb; 1299 cb.version = 2; /* Version 1 of the callout block */ 1300 cb.callout_number = ecode[1]; 1301 cb.offset_vector = md->offset_vector; 1302 #if defined COMPILE_PCRE8 1303 cb.subject = (PCRE_SPTR)md->start_subject; 1304 #elif defined COMPILE_PCRE16 1305 cb.subject = (PCRE_SPTR16)md->start_subject; 1306 #elif defined COMPILE_PCRE32 1307 cb.subject = (PCRE_SPTR32)md->start_subject; 1308 #endif 1309 cb.subject_length = (int)(md->end_subject - md->start_subject); 1310 cb.start_match = (int)(mstart - md->start_subject); 1311 cb.current_position = (int)(eptr - md->start_subject); 1312 cb.pattern_position = GET(ecode, 2); 1313 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1314 cb.capture_top = offset_top/2; 1315 cb.capture_last = md->capture_last & CAPLMASK; 1316 /* Internal change requires this for API compatibility. */ 1317 if (cb.capture_last == 0) cb.capture_last = -1; 1318 cb.callout_data = md->callout_data; 1319 cb.mark = md->nomatch_mark; 1320 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1321 if (rrc < 0) RRETURN(rrc); 1322 } 1323 1324 /* Advance ecode past the callout, so it now points to the condition. We 1325 must adjust codelink so that the value of ecode+codelink is unchanged. 
*/ 1326 1327 ecode += PRIV(OP_lengths)[OP_CALLOUT]; 1328 codelink -= PRIV(OP_lengths)[OP_CALLOUT]; 1329 } 1330 1331 /* Test the various possible conditions */ 1332 1333 condition = FALSE; 1334 switch(condcode = *ecode) 1335 { 1336 case OP_RREF: /* Numbered group recursion test */ 1337 if (md->recursive != NULL) /* Not recursing => FALSE */ 1338 { 1339 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/ 1340 condition = (recno == RREF_ANY || recno == md->recursive->group_num); 1341 } 1342 break; 1343 1344 case OP_DNRREF: /* Duplicate named group recursion test */ 1345 if (md->recursive != NULL) 1346 { 1347 int count = GET2(ecode, 1 + IMM2_SIZE); 1348 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 1349 while (count-- > 0) 1350 { 1351 unsigned int recno = GET2(slot, 0); 1352 condition = recno == md->recursive->group_num; 1353 if (condition) break; 1354 slot += md->name_entry_size; 1355 } 1356 } 1357 break; 1358 1359 case OP_CREF: /* Numbered group used test */ 1360 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 1361 condition = offset < offset_top && md->offset_vector[offset] >= 0; 1362 break; 1363 1364 case OP_DNCREF: /* Duplicate named group used test */ 1365 { 1366 int count = GET2(ecode, 1 + IMM2_SIZE); 1367 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 1368 while (count-- > 0) 1369 { 1370 offset = GET2(slot, 0) << 1; 1371 condition = offset < offset_top && md->offset_vector[offset] >= 0; 1372 if (condition) break; 1373 slot += md->name_entry_size; 1374 } 1375 } 1376 break; 1377 1378 case OP_DEF: /* DEFINE - always false */ 1379 case OP_FAIL: /* From optimized (?!) condition */ 1380 break; 1381 1382 /* The condition is an assertion. Call match() to evaluate it - setting 1383 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end 1384 of an assertion. 
*/ 1385 1386 default: 1387 md->match_function_type = MATCH_CONDASSERT; 1388 RMATCH(eptr, ecode, offset_top, md, NULL, RM3); 1389 if (rrc == MATCH_MATCH) 1390 { 1391 if (md->end_offset_top > offset_top) 1392 offset_top = md->end_offset_top; /* Captures may have happened */ 1393 condition = TRUE; 1394 1395 /* Advance ecode past the assertion to the start of the first branch, 1396 but adjust it so that the general choosing code below works. If the 1397 assertion has a quantifier that allows zero repeats we must skip over 1398 the BRAZERO. This is a lunatic thing to do, but somebody did! */ 1399 1400 if (*ecode == OP_BRAZERO) ecode++; 1401 ecode += GET(ecode, 1); 1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 1403 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; 1404 } 1405 1406 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an 1407 assertion; it is therefore treated as NOMATCH. Any other return is an 1408 error. */ 1409 1410 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) 1411 { 1412 RRETURN(rrc); /* Need braces because of following else */ 1413 } 1414 break; 1415 } 1416 1417 /* Choose branch according to the condition */ 1418 1419 ecode += condition? PRIV(OP_lengths)[condcode] : codelink; 1420 1421 /* We are now at the branch that is to be obeyed. As there is only one, we 1422 can use tail recursion to avoid using another stack frame, except when 1423 there is unlimited repeat of a possibly empty group. In the latter case, a 1424 recursive call to match() is always required, unless the second alternative 1425 doesn't exist, in which case we can just plough on. Note that, for 1426 compatibility with Perl, the | in a conditional group is NOT treated as 1427 creating two alternatives. If a THEN is encountered in the branch, it 1428 propagates out to the enclosing alternative (unless nested in a deeper set 1429 of alternatives, of course). 
*/ 1430 1431 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) 1432 { 1433 if (op != OP_SCOND) 1434 { 1435 goto TAIL_RECURSE; 1436 } 1437 1438 md->match_function_type = MATCH_CBEGROUP; 1439 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49); 1440 RRETURN(rrc); 1441 } 1442 1443 /* Condition false & no alternative; continue after the group. */ 1444 1445 else 1446 { 1447 } 1448 break; 1449 1450 1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, 1452 to close any currently open capturing brackets. */ 1453 1454 case OP_CLOSE: 1455 number = GET2(ecode, 1); /* Must be less than 65536 */ 1456 offset = number << 1; 1457 1458 #ifdef PCRE_DEBUG 1459 printf("end bracket %d at *ACCEPT", number); 1460 printf("\n"); 1461 #endif 1462 1463 md->capture_last = (md->capture_last & OVFLMASK) | number; 1464 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else 1465 { 1466 md->offset_vector[offset] = 1467 md->offset_vector[md->offset_end - number]; 1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1469 1470 /* If this group is at or above the current highwater mark, ensure that 1471 any groups between the current high water mark and this group are marked 1472 unset and then update the high water mark. */ 1473 1474 if (offset >= offset_top) 1475 { 1476 register int *iptr = md->offset_vector + offset_top; 1477 register int *iend = md->offset_vector + offset; 1478 while (iptr < iend) *iptr++ = -1; 1479 offset_top = offset + 2; 1480 } 1481 } 1482 ecode += 1 + IMM2_SIZE; 1483 break; 1484 1485 1486 /* End of the pattern, either real or forced. */ 1487 1488 case OP_END: 1489 case OP_ACCEPT: 1490 case OP_ASSERT_ACCEPT: 1491 1492 /* If we have matched an empty string, fail if not in an assertion and not 1493 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART 1494 is set and we have matched at the start of the subject. In both cases, 1495 backtracking will then try other alternatives, if any. 
*/ 1496 1497 if (eptr == mstart && op != OP_ASSERT_ACCEPT && 1498 md->recursive == NULL && 1499 (md->notempty || 1500 (md->notempty_atstart && 1501 mstart == md->start_subject + md->start_offset))) 1502 RRETURN(MATCH_NOMATCH); 1503 1504 /* Otherwise, we have a match. */ 1505 1506 md->end_match_ptr = eptr; /* Record where we ended */ 1507 md->end_offset_top = offset_top; /* and how many extracts were taken */ 1508 md->start_match_ptr = mstart; /* and the start (\K can modify) */ 1509 1510 /* For some reason, the macros don't work properly if an expression is 1511 given as the argument to RRETURN when the heap is in use. */ 1512 1513 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; 1514 RRETURN(rrc); 1515 1516 /* Assertion brackets. Check the alternative branches in turn - the 1517 matching won't pass the KET for an assertion. If any one branch matches, 1518 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 1519 start of each branch to move the current point backwards, so the code at 1520 this level is identical to the lookahead case. When the assertion is part 1521 of a condition, we want to return immediately afterwards. The caller of 1522 this incarnation of the match() function will have set MATCH_CONDASSERT in 1523 md->match_function type, and one of these opcodes will be the first opcode 1524 that is processed. We use a local variable that is preserved over calls to 1525 match() to remember this case. */ 1526 1527 case OP_ASSERT: 1528 case OP_ASSERTBACK: 1529 save_mark = md->mark; 1530 if (md->match_function_type == MATCH_CONDASSERT) 1531 { 1532 condassert = TRUE; 1533 md->match_function_type = 0; 1534 } 1535 else condassert = FALSE; 1536 1537 /* Loop for each branch */ 1538 1539 do 1540 { 1541 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); 1542 1543 /* A match means that the assertion is true; break out of the loop 1544 that matches its alternatives. 
*/ 1545 1546 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1547 { 1548 mstart = md->start_match_ptr; /* In case \K reset it */ 1549 break; 1550 } 1551 1552 /* If not matched, restore the previous mark setting. */ 1553 1554 md->mark = save_mark; 1555 1556 /* See comment in the code for capturing groups above about handling 1557 THEN. */ 1558 1559 if (rrc == MATCH_THEN) 1560 { 1561 next = ecode + GET(ecode,1); 1562 if (md->start_match_ptr < next && 1563 (*ecode == OP_ALT || *next == OP_ALT)) 1564 rrc = MATCH_NOMATCH; 1565 } 1566 1567 /* Anything other than NOMATCH causes the entire assertion to fail, 1568 passing back the return code. This includes COMMIT, SKIP, PRUNE and an 1569 uncaptured THEN, which means they take their normal effect. This 1570 consistent approach does not always have exactly the same effect as in 1571 Perl. */ 1572 1573 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1574 ecode += GET(ecode, 1); 1575 } 1576 while (*ecode == OP_ALT); /* Continue for next alternative */ 1577 1578 /* If we have tried all the alternative branches, the assertion has 1579 failed. If not, we broke out after a match. */ 1580 1581 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 1582 1583 /* If checking an assertion for a condition, return MATCH_MATCH. */ 1584 1585 if (condassert) RRETURN(MATCH_MATCH); 1586 1587 /* Continue from after a successful assertion, updating the offsets high 1588 water mark, since extracts may have been taken during the assertion. */ 1589 1590 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1591 ecode += 1 + LINK_SIZE; 1592 offset_top = md->end_offset_top; 1593 continue; 1594 1595 /* Negative assertion: all branches must fail to match for the assertion to 1596 succeed. 
*/ 1597 1598 case OP_ASSERT_NOT: 1599 case OP_ASSERTBACK_NOT: 1600 save_mark = md->mark; 1601 if (md->match_function_type == MATCH_CONDASSERT) 1602 { 1603 condassert = TRUE; 1604 md->match_function_type = 0; 1605 } 1606 else condassert = FALSE; 1607 1608 /* Loop for each alternative branch. */ 1609 1610 do 1611 { 1612 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); 1613 md->mark = save_mark; /* Always restore the mark setting */ 1614 1615 switch(rrc) 1616 { 1617 case MATCH_MATCH: /* A successful match means */ 1618 case MATCH_ACCEPT: /* the assertion has failed. */ 1619 RRETURN(MATCH_NOMATCH); 1620 1621 case MATCH_NOMATCH: /* Carry on with next branch */ 1622 break; 1623 1624 /* See comment in the code for capturing groups above about handling 1625 THEN. */ 1626 1627 case MATCH_THEN: 1628 next = ecode + GET(ecode,1); 1629 if (md->start_match_ptr < next && 1630 (*ecode == OP_ALT || *next == OP_ALT)) 1631 { 1632 rrc = MATCH_NOMATCH; 1633 break; 1634 } 1635 /* Otherwise fall through. */ 1636 1637 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole 1638 assertion to fail to match, without considering any more alternatives. 1639 Failing to match means the assertion is true. This is a consistent 1640 approach, but does not always have the same effect as in Perl. */ 1641 1642 case MATCH_COMMIT: 1643 case MATCH_SKIP: 1644 case MATCH_SKIP_ARG: 1645 case MATCH_PRUNE: 1646 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1647 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ 1648 1649 /* Anything else is an error */ 1650 1651 default: 1652 RRETURN(rrc); 1653 } 1654 1655 /* Continue with next branch */ 1656 1657 ecode += GET(ecode,1); 1658 } 1659 while (*ecode == OP_ALT); 1660 1661 /* All branches in the assertion failed to match. 
*/ 1662 1663 NEG_ASSERT_TRUE: 1664 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ 1665 ecode += 1 + LINK_SIZE; /* Continue with current branch */ 1666 continue; 1667 1668 /* Move the subject pointer back. This occurs only at the start of 1669 each branch of a lookbehind assertion. If we are too close to the start to 1670 move back, this match function fails. When working with UTF-8 we move 1671 back a number of characters, not bytes. */ 1672 1673 case OP_REVERSE: 1674 #ifdef SUPPORT_UTF 1675 if (utf) 1676 { 1677 i = GET(ecode, 1); 1678 while (i-- > 0) 1679 { 1680 eptr--; 1681 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1682 BACKCHAR(eptr); 1683 } 1684 } 1685 else 1686 #endif 1687 1688 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 1689 1690 { 1691 eptr -= GET(ecode, 1); 1692 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1693 } 1694 1695 /* Save the earliest consulted character, then skip to next op code */ 1696 1697 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; 1698 ecode += 1 + LINK_SIZE; 1699 break; 1700 1701 /* The callout item calls an external function, if one is provided, passing 1702 details of the match so far. This is mainly for debugging, though the 1703 function is able to force a failure. 
*/ 1704 1705 case OP_CALLOUT: 1706 if (PUBL(callout) != NULL) 1707 { 1708 PUBL(callout_block) cb; 1709 cb.version = 2; /* Version 1 of the callout block */ 1710 cb.callout_number = ecode[1]; 1711 cb.offset_vector = md->offset_vector; 1712 #if defined COMPILE_PCRE8 1713 cb.subject = (PCRE_SPTR)md->start_subject; 1714 #elif defined COMPILE_PCRE16 1715 cb.subject = (PCRE_SPTR16)md->start_subject; 1716 #elif defined COMPILE_PCRE32 1717 cb.subject = (PCRE_SPTR32)md->start_subject; 1718 #endif 1719 cb.subject_length = (int)(md->end_subject - md->start_subject); 1720 cb.start_match = (int)(mstart - md->start_subject); 1721 cb.current_position = (int)(eptr - md->start_subject); 1722 cb.pattern_position = GET(ecode, 2); 1723 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1724 cb.capture_top = offset_top/2; 1725 cb.capture_last = md->capture_last & CAPLMASK; 1726 /* Internal change requires this for API compatibility. */ 1727 if (cb.capture_last == 0) cb.capture_last = -1; 1728 cb.callout_data = md->callout_data; 1729 cb.mark = md->nomatch_mark; 1730 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1731 if (rrc < 0) RRETURN(rrc); 1732 } 1733 ecode += 2 + 2*LINK_SIZE; 1734 break; 1735 1736 /* Recursion either matches the current regex, or some subexpression. The 1737 offset data is the offset to the starting bracket from the start of the 1738 whole pattern. (This is so that it works from duplicated subpatterns.) 1739 1740 The state of the capturing groups is preserved over recursion, and 1741 re-instated afterwards. We don't know how many are started and not yet 1742 finished (offset_top records the completed total) so we just have to save 1743 all the potential data. There may be up to 65535 such values, which is too 1744 large to put on the stack, but using malloc for small numbers seems 1745 expensive. As a compromise, the stack is used when there are no more than 1746 REC_STACK_SAVE_MAX values to store; otherwise malloc is used. 
1747 1748 There are also other values that have to be saved. We use a chained 1749 sequence of blocks that actually live on the stack. Thanks to Robin Houston 1750 for the original version of this logic. It has, however, been hacked around 1751 a lot, so he is not to blame for the current way it works. */ 1752 1753 case OP_RECURSE: 1754 { 1755 recursion_info *ri; 1756 unsigned int recno; 1757 1758 callpat = md->start_code + GET(ecode, 1); 1759 recno = (callpat == md->start_code)? 0 : 1760 GET2(callpat, 1 + LINK_SIZE); 1761 1762 /* Check for repeating a recursion without advancing the subject pointer. 1763 This should catch convoluted mutual recursions. (Some simple cases are 1764 caught at compile time.) */ 1765 1766 for (ri = md->recursive; ri != NULL; ri = ri->prevrec) 1767 if (recno == ri->group_num && eptr == ri->subject_position) 1768 RRETURN(PCRE_ERROR_RECURSELOOP); 1769 1770 /* Add to "recursing stack" */ 1771 1772 new_recursive.group_num = recno; 1773 new_recursive.saved_capture_last = md->capture_last; 1774 new_recursive.subject_position = eptr; 1775 new_recursive.prevrec = md->recursive; 1776 md->recursive = &new_recursive; 1777 1778 /* Where to continue from afterwards */ 1779 1780 ecode += 1 + LINK_SIZE; 1781 1782 /* Now save the offset data */ 1783 1784 new_recursive.saved_max = md->offset_end; 1785 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 1786 new_recursive.offset_save = stacksave; 1787 else 1788 { 1789 new_recursive.offset_save = 1790 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int)); 1791 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 1792 } 1793 memcpy(new_recursive.offset_save, md->offset_vector, 1794 new_recursive.saved_max * sizeof(int)); 1795 1796 /* OK, now we can do the recursion. After processing each alternative, 1797 restore the offset data and the last captured value. If there were nested 1798 recursions, md->recursive might be changed, so reset it before looping. 
1799 */ 1800 1801 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 1802 cbegroup = (*callpat >= OP_SBRA); 1803 do 1804 { 1805 if (cbegroup) md->match_function_type = MATCH_CBEGROUP; 1806 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, 1807 md, eptrb, RM6); 1808 memcpy(md->offset_vector, new_recursive.offset_save, 1809 new_recursive.saved_max * sizeof(int)); 1810 md->capture_last = new_recursive.saved_capture_last; 1811 md->recursive = new_recursive.prevrec; 1812 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1813 { 1814 DPRINTF(("Recursion matched\n")); 1815 if (new_recursive.offset_save != stacksave) 1816 (PUBL(free))(new_recursive.offset_save); 1817 1818 /* Set where we got to in the subject, and reset the start in case 1819 it was changed by \K. This *is* propagated back out of a recursion, 1820 for Perl compatibility. */ 1821 1822 eptr = md->end_match_ptr; 1823 mstart = md->start_match_ptr; 1824 goto RECURSION_MATCHED; /* Exit loop; end processing */ 1825 } 1826 1827 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a 1828 recursion; they cause a NOMATCH for the entire recursion. These codes 1829 are defined in a range that can be tested for. */ 1830 1831 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) 1832 { 1833 if (new_recursive.offset_save != stacksave) 1834 (PUBL(free))(new_recursive.offset_save); 1835 RRETURN(MATCH_NOMATCH); 1836 } 1837 1838 /* Any return code other than NOMATCH is an error. 
*/ 1839 1840 if (rrc != MATCH_NOMATCH) 1841 { 1842 DPRINTF(("Recursion gave error %d\n", rrc)); 1843 if (new_recursive.offset_save != stacksave) 1844 (PUBL(free))(new_recursive.offset_save); 1845 RRETURN(rrc); 1846 } 1847 1848 md->recursive = &new_recursive; 1849 callpat += GET(callpat, 1); 1850 } 1851 while (*callpat == OP_ALT); 1852 1853 DPRINTF(("Recursion didn't match\n")); 1854 md->recursive = new_recursive.prevrec; 1855 if (new_recursive.offset_save != stacksave) 1856 (PUBL(free))(new_recursive.offset_save); 1857 RRETURN(MATCH_NOMATCH); 1858 } 1859 1860 RECURSION_MATCHED: 1861 break; 1862 1863 /* An alternation is the end of a branch; scan along to find the end of the 1864 bracketed group and go to there. */ 1865 1866 case OP_ALT: 1867 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1868 break; 1869 1870 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, 1871 indicating that it may occur zero times. It may repeat infinitely, or not 1872 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets 1873 with fixed upper repeat limits are compiled as a number of copies, with the 1874 optional ones preceded by BRAZERO or BRAMINZERO. */ 1875 1876 case OP_BRAZERO: 1877 next = ecode + 1; 1878 RMATCH(eptr, next, offset_top, md, eptrb, RM10); 1879 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1880 do next += GET(next, 1); while (*next == OP_ALT); 1881 ecode = next + 1 + LINK_SIZE; 1882 break; 1883 1884 case OP_BRAMINZERO: 1885 next = ecode + 1; 1886 do next += GET(next, 1); while (*next == OP_ALT); 1887 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11); 1888 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1889 ecode++; 1890 break; 1891 1892 case OP_SKIPZERO: 1893 next = ecode+1; 1894 do next += GET(next,1); while (*next == OP_ALT); 1895 ecode = next + 1 + LINK_SIZE; 1896 break; 1897 1898 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything 1899 here; just jump to the group, with allow_zero set TRUE. 
*/ 1900 1901 case OP_BRAPOSZERO: 1902 op = *(++ecode); 1903 allow_zero = TRUE; 1904 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE; 1905 goto POSSESSIVE_NON_CAPTURE; 1906 1907 /* End of a group, repeated or non-repeating. */ 1908 1909 case OP_KET: 1910 case OP_KETRMIN: 1911 case OP_KETRMAX: 1912 case OP_KETRPOS: 1913 prev = ecode - GET(ecode, 1); 1914 1915 /* If this was a group that remembered the subject start, in order to break 1916 infinite repeats of empty string matches, retrieve the subject start from 1917 the chain. Otherwise, set it NULL. */ 1918 1919 if (*prev >= OP_SBRA || *prev == OP_ONCE) 1920 { 1921 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ 1922 eptrb = eptrb->epb_prev; /* Backup to previous group */ 1923 } 1924 else saved_eptr = NULL; 1925 1926 /* If we are at the end of an assertion group or a non-capturing atomic 1927 group, stop matching and return MATCH_MATCH, but record the current high 1928 water mark for use by positive assertions. We also need to record the match 1929 start in case it was changed by \K. */ 1930 1931 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) || 1932 *prev == OP_ONCE_NC) 1933 { 1934 md->end_match_ptr = eptr; /* For ONCE_NC */ 1935 md->end_offset_top = offset_top; 1936 md->start_match_ptr = mstart; 1937 RRETURN(MATCH_MATCH); /* Sets md->mark */ 1938 } 1939 1940 /* For capturing groups we have to check the group number back at the start 1941 and if necessary complete handling an extraction by setting the offsets and 1942 bumping the high water mark. Whole-pattern recursion is coded as a recurse 1943 into group 0, so it won't be picked up here. Instead, we catch it when the 1944 OP_END is reached. Other recursion is handled here. We just have to record 1945 the current subject position and start match pointer and give a MATCH 1946 return. 
*/ 1947 1948 if (*prev == OP_CBRA || *prev == OP_SCBRA || 1949 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS) 1950 { 1951 number = GET2(prev, 1+LINK_SIZE); 1952 offset = number << 1; 1953 1954 #ifdef PCRE_DEBUG 1955 printf("end bracket %d", number); 1956 printf("\n"); 1957 #endif 1958 1959 /* Handle a recursively called group. */ 1960 1961 if (md->recursive != NULL && md->recursive->group_num == number) 1962 { 1963 md->end_match_ptr = eptr; 1964 md->start_match_ptr = mstart; 1965 RRETURN(MATCH_MATCH); 1966 } 1967 1968 /* Deal with capturing */ 1969 1970 md->capture_last = (md->capture_last & OVFLMASK) | number; 1971 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else 1972 { 1973 /* If offset is greater than offset_top, it means that we are 1974 "skipping" a capturing group, and that group's offsets must be marked 1975 unset. In earlier versions of PCRE, all the offsets were unset at the 1976 start of matching, but this doesn't work because atomic groups and 1977 assertions can cause a value to be set that should later be unset. 1978 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as 1979 part of the atomic group, but this is not on the final matching path, 1980 so must be unset when 2 is set. (If there is no group 2, there is no 1981 problem, because offset_top will then be 2, indicating no capture.) */ 1982 1983 if (offset > offset_top) 1984 { 1985 register int *iptr = md->offset_vector + offset_top; 1986 register int *iend = md->offset_vector + offset; 1987 while (iptr < iend) *iptr++ = -1; 1988 } 1989 1990 /* Now make the extraction */ 1991 1992 md->offset_vector[offset] = 1993 md->offset_vector[md->offset_end - number]; 1994 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1995 if (offset_top <= offset) offset_top = offset + 2; 1996 } 1997 } 1998 1999 /* OP_KETRPOS is a possessive repeating ket. Remember the current position, 2000 and return the MATCH_KETRPOS. 
This makes it possible to do the repeats one 2001 at a time from the outer level, thus saving stack. This must precede the 2002 empty string test - in this case that test is done at the outer level. */ 2003 2004 if (*ecode == OP_KETRPOS) 2005 { 2006 md->start_match_ptr = mstart; /* In case \K reset it */ 2007 md->end_match_ptr = eptr; 2008 md->end_offset_top = offset_top; 2009 RRETURN(MATCH_KETRPOS); 2010 } 2011 2012 /* For an ordinary non-repeating ket, just continue at this level. This 2013 also happens for a repeating ket if no characters were matched in the 2014 group. This is the forcible breaking of infinite loops as implemented in 2015 Perl 5.005. For a non-repeating atomic group that includes captures, 2016 establish a backup point by processing the rest of the pattern at a lower 2017 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the 2018 original OP_ONCE level, thereby bypassing intermediate backup points, but 2019 resetting any captures that happened along the way. */ 2020 2021 if (*ecode == OP_KET || eptr == saved_eptr) 2022 { 2023 if (*prev == OP_ONCE) 2024 { 2025 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12); 2026 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2027 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2028 RRETURN(MATCH_ONCE); 2029 } 2030 ecode += 1 + LINK_SIZE; /* Carry on at this level */ 2031 break; 2032 } 2033 2034 /* The normal repeating kets try the rest of the pattern or restart from 2035 the preceding bracket, in the appropriate order. In the second case, we can 2036 use tail recursion to avoid using another stack frame, unless we have an 2037 an atomic group or an unlimited repeat of a group that can match an empty 2038 string. 
*/ 2039 2040 if (*ecode == OP_KETRMIN) 2041 { 2042 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7); 2043 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2044 if (*prev == OP_ONCE) 2045 { 2046 RMATCH(eptr, prev, offset_top, md, eptrb, RM8); 2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2048 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2049 RRETURN(MATCH_ONCE); 2050 } 2051 if (*prev >= OP_SBRA) /* Could match an empty string */ 2052 { 2053 RMATCH(eptr, prev, offset_top, md, eptrb, RM50); 2054 RRETURN(rrc); 2055 } 2056 ecode = prev; 2057 goto TAIL_RECURSE; 2058 } 2059 else /* OP_KETRMAX */ 2060 { 2061 RMATCH(eptr, prev, offset_top, md, eptrb, RM13); 2062 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH; 2063 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2064 if (*prev == OP_ONCE) 2065 { 2066 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9); 2067 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2068 md->once_target = prev; 2069 RRETURN(MATCH_ONCE); 2070 } 2071 ecode += 1 + LINK_SIZE; 2072 goto TAIL_RECURSE; 2073 } 2074 /* Control never gets here */ 2075 2076 /* Not multiline mode: start of subject assertion, unless notbol. */ 2077 2078 case OP_CIRC: 2079 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2080 2081 /* Start of subject assertion */ 2082 2083 case OP_SOD: 2084 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); 2085 ecode++; 2086 break; 2087 2088 /* Multiline mode: start of subject unless notbol, or after any newline. 
*/ 2089 2090 case OP_CIRCM: 2091 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2092 if (eptr != md->start_subject && 2093 (eptr == md->end_subject || !WAS_NEWLINE(eptr))) 2094 RRETURN(MATCH_NOMATCH); 2095 ecode++; 2096 break; 2097 2098 /* Start of match assertion */ 2099 2100 case OP_SOM: 2101 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); 2102 ecode++; 2103 break; 2104 2105 /* Reset the start of match point */ 2106 2107 case OP_SET_SOM: 2108 mstart = eptr; 2109 ecode++; 2110 break; 2111 2112 /* Multiline mode: assert before any newline, or before end of subject 2113 unless noteol is set. */ 2114 2115 case OP_DOLLM: 2116 if (eptr < md->end_subject) 2117 { 2118 if (!IS_NEWLINE(eptr)) 2119 { 2120 if (md->partial != 0 && 2121 eptr + 1 >= md->end_subject && 2122 NLBLOCK->nltype == NLTYPE_FIXED && 2123 NLBLOCK->nllen == 2 && 2124 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2125 { 2126 md->hitend = TRUE; 2127 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2128 } 2129 RRETURN(MATCH_NOMATCH); 2130 } 2131 } 2132 else 2133 { 2134 if (md->noteol) RRETURN(MATCH_NOMATCH); 2135 SCHECK_PARTIAL(); 2136 } 2137 ecode++; 2138 break; 2139 2140 /* Not multiline mode: assert before a terminating newline or before end of 2141 subject unless noteol is set. */ 2142 2143 case OP_DOLL: 2144 if (md->noteol) RRETURN(MATCH_NOMATCH); 2145 if (!md->endonly) goto ASSERT_NL_OR_EOS; 2146 2147 /* ... 
else fall through for endonly */ 2148 2149 /* End of subject assertion (\z) */ 2150 2151 case OP_EOD: 2152 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); 2153 SCHECK_PARTIAL(); 2154 ecode++; 2155 break; 2156 2157 /* End of subject or ending \n assertion (\Z) */ 2158 2159 case OP_EODN: 2160 ASSERT_NL_OR_EOS: 2161 if (eptr < md->end_subject && 2162 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) 2163 { 2164 if (md->partial != 0 && 2165 eptr + 1 >= md->end_subject && 2166 NLBLOCK->nltype == NLTYPE_FIXED && 2167 NLBLOCK->nllen == 2 && 2168 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2169 { 2170 md->hitend = TRUE; 2171 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2172 } 2173 RRETURN(MATCH_NOMATCH); 2174 } 2175 2176 /* Either at end of string or \n before end. */ 2177 2178 SCHECK_PARTIAL(); 2179 ecode++; 2180 break; 2181 2182 /* Word boundary assertions */ 2183 2184 case OP_NOT_WORD_BOUNDARY: 2185 case OP_WORD_BOUNDARY: 2186 { 2187 2188 /* Find out if the previous and current characters are "word" characters. 2189 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 2190 be "non-word" characters. Remember the earliest consulted character for 2191 partial matching. 
*/ 2192 2193 #ifdef SUPPORT_UTF 2194 if (utf) 2195 { 2196 /* Get status of previous character */ 2197 2198 if (eptr == md->start_subject) prev_is_word = FALSE; else 2199 { 2200 PCRE_PUCHAR lastptr = eptr - 1; 2201 BACKCHAR(lastptr); 2202 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; 2203 GETCHAR(c, lastptr); 2204 #ifdef SUPPORT_UCP 2205 if (md->use_ucp) 2206 { 2207 if (c == '_') prev_is_word = TRUE; else 2208 { 2209 int cat = UCD_CATEGORY(c); 2210 prev_is_word = (cat == ucp_L || cat == ucp_N); 2211 } 2212 } 2213 else 2214 #endif 2215 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2216 } 2217 2218 /* Get status of next character */ 2219 2220 if (eptr >= md->end_subject) 2221 { 2222 SCHECK_PARTIAL(); 2223 cur_is_word = FALSE; 2224 } 2225 else 2226 { 2227 GETCHAR(c, eptr); 2228 #ifdef SUPPORT_UCP 2229 if (md->use_ucp) 2230 { 2231 if (c == '_') cur_is_word = TRUE; else 2232 { 2233 int cat = UCD_CATEGORY(c); 2234 cur_is_word = (cat == ucp_L || cat == ucp_N); 2235 } 2236 } 2237 else 2238 #endif 2239 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2240 } 2241 } 2242 else 2243 #endif 2244 2245 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for 2246 consistency with the behaviour of \w we do use it in this case. 
*/ 2247 2248 { 2249 /* Get status of previous character */ 2250 2251 if (eptr == md->start_subject) prev_is_word = FALSE; else 2252 { 2253 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; 2254 #ifdef SUPPORT_UCP 2255 if (md->use_ucp) 2256 { 2257 c = eptr[-1]; 2258 if (c == '_') prev_is_word = TRUE; else 2259 { 2260 int cat = UCD_CATEGORY(c); 2261 prev_is_word = (cat == ucp_L || cat == ucp_N); 2262 } 2263 } 2264 else 2265 #endif 2266 prev_is_word = MAX_255(eptr[-1]) 2267 && ((md->ctypes[eptr[-1]] & ctype_word) != 0); 2268 } 2269 2270 /* Get status of next character */ 2271 2272 if (eptr >= md->end_subject) 2273 { 2274 SCHECK_PARTIAL(); 2275 cur_is_word = FALSE; 2276 } 2277 else 2278 #ifdef SUPPORT_UCP 2279 if (md->use_ucp) 2280 { 2281 c = *eptr; 2282 if (c == '_') cur_is_word = TRUE; else 2283 { 2284 int cat = UCD_CATEGORY(c); 2285 cur_is_word = (cat == ucp_L || cat == ucp_N); 2286 } 2287 } 2288 else 2289 #endif 2290 cur_is_word = MAX_255(*eptr) 2291 && ((md->ctypes[*eptr] & ctype_word) != 0); 2292 } 2293 2294 /* Now see if the situation is what we want */ 2295 2296 if ((*ecode++ == OP_WORD_BOUNDARY)? 2297 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 2298 RRETURN(MATCH_NOMATCH); 2299 } 2300 break; 2301 2302 /* Match any single character type except newline; have to take care with 2303 CRLF newlines and partial matching. */ 2304 2305 case OP_ANY: 2306 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 2307 if (md->partial != 0 && 2308 eptr + 1 >= md->end_subject && 2309 NLBLOCK->nltype == NLTYPE_FIXED && 2310 NLBLOCK->nllen == 2 && 2311 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2312 { 2313 md->hitend = TRUE; 2314 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2315 } 2316 2317 /* Fall through */ 2318 2319 /* Match any single character whatsoever. */ 2320 2321 case OP_ALLANY: 2322 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2323 { /* not be updated before SCHECK_PARTIAL. 
*/ 2324 SCHECK_PARTIAL(); 2325 RRETURN(MATCH_NOMATCH); 2326 } 2327 eptr++; 2328 #ifdef SUPPORT_UTF 2329 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 2330 #endif 2331 ecode++; 2332 break; 2333 2334 /* Match a single byte, even in UTF-8 mode. This opcode really does match 2335 any byte, even newline, independent of the setting of PCRE_DOTALL. */ 2336 2337 case OP_ANYBYTE: 2338 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2339 { /* not be updated before SCHECK_PARTIAL. */ 2340 SCHECK_PARTIAL(); 2341 RRETURN(MATCH_NOMATCH); 2342 } 2343 eptr++; 2344 ecode++; 2345 break; 2346 2347 case OP_NOT_DIGIT: 2348 if (eptr >= md->end_subject) 2349 { 2350 SCHECK_PARTIAL(); 2351 RRETURN(MATCH_NOMATCH); 2352 } 2353 GETCHARINCTEST(c, eptr); 2354 if ( 2355 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2356 c < 256 && 2357 #endif 2358 (md->ctypes[c] & ctype_digit) != 0 2359 ) 2360 RRETURN(MATCH_NOMATCH); 2361 ecode++; 2362 break; 2363 2364 case OP_DIGIT: 2365 if (eptr >= md->end_subject) 2366 { 2367 SCHECK_PARTIAL(); 2368 RRETURN(MATCH_NOMATCH); 2369 } 2370 GETCHARINCTEST(c, eptr); 2371 if ( 2372 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2373 c > 255 || 2374 #endif 2375 (md->ctypes[c] & ctype_digit) == 0 2376 ) 2377 RRETURN(MATCH_NOMATCH); 2378 ecode++; 2379 break; 2380 2381 case OP_NOT_WHITESPACE: 2382 if (eptr >= md->end_subject) 2383 { 2384 SCHECK_PARTIAL(); 2385 RRETURN(MATCH_NOMATCH); 2386 } 2387 GETCHARINCTEST(c, eptr); 2388 if ( 2389 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2390 c < 256 && 2391 #endif 2392 (md->ctypes[c] & ctype_space) != 0 2393 ) 2394 RRETURN(MATCH_NOMATCH); 2395 ecode++; 2396 break; 2397 2398 case OP_WHITESPACE: 2399 if (eptr >= md->end_subject) 2400 { 2401 SCHECK_PARTIAL(); 2402 RRETURN(MATCH_NOMATCH); 2403 } 2404 GETCHARINCTEST(c, eptr); 2405 if ( 2406 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2407 c > 255 || 2408 #endif 2409 (md->ctypes[c] & ctype_space) == 0 2410 ) 2411 
RRETURN(MATCH_NOMATCH); 2412 ecode++; 2413 break; 2414 2415 case OP_NOT_WORDCHAR: 2416 if (eptr >= md->end_subject) 2417 { 2418 SCHECK_PARTIAL(); 2419 RRETURN(MATCH_NOMATCH); 2420 } 2421 GETCHARINCTEST(c, eptr); 2422 if ( 2423 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2424 c < 256 && 2425 #endif 2426 (md->ctypes[c] & ctype_word) != 0 2427 ) 2428 RRETURN(MATCH_NOMATCH); 2429 ecode++; 2430 break; 2431 2432 case OP_WORDCHAR: 2433 if (eptr >= md->end_subject) 2434 { 2435 SCHECK_PARTIAL(); 2436 RRETURN(MATCH_NOMATCH); 2437 } 2438 GETCHARINCTEST(c, eptr); 2439 if ( 2440 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2441 c > 255 || 2442 #endif 2443 (md->ctypes[c] & ctype_word) == 0 2444 ) 2445 RRETURN(MATCH_NOMATCH); 2446 ecode++; 2447 break; 2448 2449 case OP_ANYNL: 2450 if (eptr >= md->end_subject) 2451 { 2452 SCHECK_PARTIAL(); 2453 RRETURN(MATCH_NOMATCH); 2454 } 2455 GETCHARINCTEST(c, eptr); 2456 switch(c) 2457 { 2458 default: RRETURN(MATCH_NOMATCH); 2459 2460 case CHAR_CR: 2461 if (eptr >= md->end_subject) 2462 { 2463 SCHECK_PARTIAL(); 2464 } 2465 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++; 2466 break; 2467 2468 case CHAR_LF: 2469 break; 2470 2471 case CHAR_VT: 2472 case CHAR_FF: 2473 case CHAR_NEL: 2474 #ifndef EBCDIC 2475 case 0x2028: 2476 case 0x2029: 2477 #endif /* Not EBCDIC */ 2478 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 2479 break; 2480 } 2481 ecode++; 2482 break; 2483 2484 case OP_NOT_HSPACE: 2485 if (eptr >= md->end_subject) 2486 { 2487 SCHECK_PARTIAL(); 2488 RRETURN(MATCH_NOMATCH); 2489 } 2490 GETCHARINCTEST(c, eptr); 2491 switch(c) 2492 { 2493 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 2494 default: break; 2495 } 2496 ecode++; 2497 break; 2498 2499 case OP_HSPACE: 2500 if (eptr >= md->end_subject) 2501 { 2502 SCHECK_PARTIAL(); 2503 RRETURN(MATCH_NOMATCH); 2504 } 2505 GETCHARINCTEST(c, eptr); 2506 switch(c) 2507 { 2508 HSPACE_CASES: break; /* Byte and multibyte cases */ 2509 default: RRETURN(MATCH_NOMATCH); 
2510 } 2511 ecode++; 2512 break; 2513 2514 case OP_NOT_VSPACE: 2515 if (eptr >= md->end_subject) 2516 { 2517 SCHECK_PARTIAL(); 2518 RRETURN(MATCH_NOMATCH); 2519 } 2520 GETCHARINCTEST(c, eptr); 2521 switch(c) 2522 { 2523 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 2524 default: break; 2525 } 2526 ecode++; 2527 break; 2528 2529 case OP_VSPACE: 2530 if (eptr >= md->end_subject) 2531 { 2532 SCHECK_PARTIAL(); 2533 RRETURN(MATCH_NOMATCH); 2534 } 2535 GETCHARINCTEST(c, eptr); 2536 switch(c) 2537 { 2538 VSPACE_CASES: break; 2539 default: RRETURN(MATCH_NOMATCH); 2540 } 2541 ecode++; 2542 break; 2543 2544 #ifdef SUPPORT_UCP 2545 /* Check the next character by Unicode property. We will get here only 2546 if the support is in the binary; otherwise a compile-time error occurs. */ 2547 2548 case OP_PROP: 2549 case OP_NOTPROP: 2550 if (eptr >= md->end_subject) 2551 { 2552 SCHECK_PARTIAL(); 2553 RRETURN(MATCH_NOMATCH); 2554 } 2555 GETCHARINCTEST(c, eptr); 2556 { 2557 const pcre_uint32 *cp; 2558 const ucd_record *prop = GET_UCD(c); 2559 2560 switch(ecode[1]) 2561 { 2562 case PT_ANY: 2563 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2564 break; 2565 2566 case PT_LAMP: 2567 if ((prop->chartype == ucp_Lu || 2568 prop->chartype == ucp_Ll || 2569 prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) 2570 RRETURN(MATCH_NOMATCH); 2571 break; 2572 2573 case PT_GC: 2574 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) 2575 RRETURN(MATCH_NOMATCH); 2576 break; 2577 2578 case PT_PC: 2579 if ((ecode[2] != prop->chartype) == (op == OP_PROP)) 2580 RRETURN(MATCH_NOMATCH); 2581 break; 2582 2583 case PT_SC: 2584 if ((ecode[2] != prop->script) == (op == OP_PROP)) 2585 RRETURN(MATCH_NOMATCH); 2586 break; 2587 2588 /* These are specials */ 2589 2590 case PT_ALNUM: 2591 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2592 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) 2593 RRETURN(MATCH_NOMATCH); 2594 break; 2595 2596 /* Perl space used to exclude VT, but from Perl 
5.18 it is included, 2597 which means that Perl space and POSIX space are now identical. PCRE 2598 was changed at release 8.34. */ 2599 2600 case PT_SPACE: /* Perl space */ 2601 case PT_PXSPACE: /* POSIX space */ 2602 switch(c) 2603 { 2604 HSPACE_CASES: 2605 VSPACE_CASES: 2606 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2607 break; 2608 2609 default: 2610 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == 2611 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); 2612 break; 2613 } 2614 break; 2615 2616 case PT_WORD: 2617 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2618 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 2619 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) 2620 RRETURN(MATCH_NOMATCH); 2621 break; 2622 2623 case PT_CLIST: 2624 cp = PRIV(ucd_caseless_sets) + ecode[2]; 2625 for (;;) 2626 { 2627 if (c < *cp) 2628 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } 2629 if (c == *cp++) 2630 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } 2631 } 2632 break; 2633 2634 case PT_UCNC: 2635 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 2636 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 2637 c >= 0xe000) == (op == OP_NOTPROP)) 2638 RRETURN(MATCH_NOMATCH); 2639 break; 2640 2641 /* This should never occur */ 2642 2643 default: 2644 RRETURN(PCRE_ERROR_INTERNAL); 2645 } 2646 2647 ecode += 3; 2648 } 2649 break; 2650 2651 /* Match an extended Unicode sequence. We will get here only if the support 2652 is in the binary; otherwise a compile-time error occurs. 
*/ 2653 2654 case OP_EXTUNI: 2655 if (eptr >= md->end_subject) 2656 { 2657 SCHECK_PARTIAL(); 2658 RRETURN(MATCH_NOMATCH); 2659 } 2660 else 2661 { 2662 int lgb, rgb; 2663 GETCHARINCTEST(c, eptr); 2664 lgb = UCD_GRAPHBREAK(c); 2665 while (eptr < md->end_subject) 2666 { 2667 int len = 1; 2668 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 2669 rgb = UCD_GRAPHBREAK(c); 2670 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 2671 lgb = rgb; 2672 eptr += len; 2673 } 2674 } 2675 CHECK_PARTIAL(); 2676 ecode++; 2677 break; 2678 #endif /* SUPPORT_UCP */ 2679 2680 2681 /* Match a back reference, possibly repeatedly. Look past the end of the 2682 item to see if there is repeat information following. The code is similar 2683 to that for character classes, but repeated for efficiency. Then obey 2684 similar code to character type repeats - written out again for speed. 2685 However, if the referenced string is the empty string, always treat 2686 it as matched, any number of times (otherwise there could be infinite 2687 loops). If the reference is unset, there are two possibilities: 2688 2689 (a) In the default, Perl-compatible state, set the length negative; 2690 this ensures that every attempt at a match fails. We can't just fail 2691 here, because of the possibility of quantifiers with zero minima. 2692 2693 (b) If the JavaScript compatibility flag is set, set the length to zero 2694 so that the back reference matches an empty string. 2695 2696 Otherwise, set the length to the length of what was matched by the 2697 referenced subpattern. 2698 2699 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group 2700 or to a non-duplicated named group. For a duplicated named group, OP_DNREF 2701 and OP_DNREFI are used. In this case we must scan the list of groups to 2702 which the name refers, and use the first one that is set. 
*/ 2703 2704 case OP_DNREF: 2705 case OP_DNREFI: 2706 caseless = op == OP_DNREFI; 2707 { 2708 int count = GET2(ecode, 1+IMM2_SIZE); 2709 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 2710 ecode += 1 + 2*IMM2_SIZE; 2711 2712 /* Setting the default length first and initializing 'offset' avoids 2713 compiler warnings in the REF_REPEAT code. */ 2714 2715 length = (md->jscript_compat)? 0 : -1; 2716 offset = 0; 2717 2718 while (count-- > 0) 2719 { 2720 offset = GET2(slot, 0) << 1; 2721 if (offset < offset_top && md->offset_vector[offset] >= 0) 2722 { 2723 length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2724 break; 2725 } 2726 slot += md->name_entry_size; 2727 } 2728 } 2729 goto REF_REPEAT; 2730 2731 case OP_REF: 2732 case OP_REFI: 2733 caseless = op == OP_REFI; 2734 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 2735 ecode += 1 + IMM2_SIZE; 2736 if (offset >= offset_top || md->offset_vector[offset] < 0) 2737 length = (md->jscript_compat)? 0 : -1; 2738 else 2739 length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2740 2741 /* Set up for repetition, or handle the non-repeated case */ 2742 2743 REF_REPEAT: 2744 switch (*ecode) 2745 { 2746 case OP_CRSTAR: 2747 case OP_CRMINSTAR: 2748 case OP_CRPLUS: 2749 case OP_CRMINPLUS: 2750 case OP_CRQUERY: 2751 case OP_CRMINQUERY: 2752 c = *ecode++ - OP_CRSTAR; 2753 minimize = (c & 1) != 0; 2754 min = rep_min[c]; /* Pick up values from tables; */ 2755 max = rep_max[c]; /* zero for max => infinity */ 2756 if (max == 0) max = INT_MAX; 2757 break; 2758 2759 case OP_CRRANGE: 2760 case OP_CRMINRANGE: 2761 minimize = (*ecode == OP_CRMINRANGE); 2762 min = GET2(ecode, 1); 2763 max = GET2(ecode, 1 + IMM2_SIZE); 2764 if (max == 0) max = INT_MAX; 2765 ecode += 1 + 2 * IMM2_SIZE; 2766 break; 2767 2768 default: /* No repeat follows */ 2769 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0) 2770 { 2771 if (length == -2) eptr = md->end_subject; /* Partial match */ 2772 
CHECK_PARTIAL(); 2773 RRETURN(MATCH_NOMATCH); 2774 } 2775 eptr += length; 2776 continue; /* With the main loop */ 2777 } 2778 2779 /* Handle repeated back references. If the length of the reference is 2780 zero, just continue with the main loop. If the length is negative, it 2781 means the reference is unset in non-Java-compatible mode. If the minimum is 2782 zero, we can continue at the same level without recursion. For any other 2783 minimum, carrying on will result in NOMATCH. */ 2784 2785 if (length == 0) continue; 2786 if (length < 0 && min == 0) continue; 2787 2788 /* First, ensure the minimum number of matches are present. We get back 2789 the length of the reference string explicitly rather than passing the 2790 address of eptr, so that eptr can be a register variable. */ 2791 2792 for (i = 1; i <= min; i++) 2793 { 2794 int slength; 2795 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2796 { 2797 if (slength == -2) eptr = md->end_subject; /* Partial match */ 2798 CHECK_PARTIAL(); 2799 RRETURN(MATCH_NOMATCH); 2800 } 2801 eptr += slength; 2802 } 2803 2804 /* If min = max, continue at the same level without recursion. 2805 They are not both allowed to be zero. 
*/ 2806 2807 if (min == max) continue; 2808 2809 /* If minimizing, keep trying and advancing the pointer */ 2810 2811 if (minimize) 2812 { 2813 for (fi = min;; fi++) 2814 { 2815 int slength; 2816 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14); 2817 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2818 if (fi >= max) RRETURN(MATCH_NOMATCH); 2819 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2820 { 2821 if (slength == -2) eptr = md->end_subject; /* Partial match */ 2822 CHECK_PARTIAL(); 2823 RRETURN(MATCH_NOMATCH); 2824 } 2825 eptr += slength; 2826 } 2827 /* Control never gets here */ 2828 } 2829 2830 /* If maximizing, find the longest string and work backwards */ 2831 2832 else 2833 { 2834 pp = eptr; 2835 for (i = min; i < max; i++) 2836 { 2837 int slength; 2838 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2839 { 2840 /* Can't use CHECK_PARTIAL because we don't want to update eptr in 2841 the soft partial matching case. */ 2842 2843 if (slength == -2 && md->partial != 0 && 2844 md->end_subject > md->start_used_ptr) 2845 { 2846 md->hitend = TRUE; 2847 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2848 } 2849 break; 2850 } 2851 eptr += slength; 2852 } 2853 2854 while (eptr >= pp) 2855 { 2856 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15); 2857 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2858 eptr -= length; 2859 } 2860 RRETURN(MATCH_NOMATCH); 2861 } 2862 /* Control never gets here */ 2863 2864 /* Match a bit-mapped character class, possibly repeatedly. This op code is 2865 used when all the characters in the class have values in the range 0-255, 2866 and either the matching is caseful, or the characters are in the range 2867 0-127 when UTF-8 processing is enabled. The only difference between 2868 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 2869 encountered. 2870 2871 First, look past the end of the item to see if there is repeat information 2872 following. 
Then obey similar code to character type repeats - written out 2873 again for speed. */ 2874 2875 case OP_NCLASS: 2876 case OP_CLASS: 2877 { 2878 /* The data variable is saved across frames, so the byte map needs to 2879 be stored there. */ 2880 #define BYTE_MAP ((pcre_uint8 *)data) 2881 data = ecode + 1; /* Save for matching */ 2882 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */ 2883 2884 switch (*ecode) 2885 { 2886 case OP_CRSTAR: 2887 case OP_CRMINSTAR: 2888 case OP_CRPLUS: 2889 case OP_CRMINPLUS: 2890 case OP_CRQUERY: 2891 case OP_CRMINQUERY: 2892 case OP_CRPOSSTAR: 2893 case OP_CRPOSPLUS: 2894 case OP_CRPOSQUERY: 2895 c = *ecode++ - OP_CRSTAR; 2896 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 2897 else possessive = TRUE; 2898 min = rep_min[c]; /* Pick up values from tables; */ 2899 max = rep_max[c]; /* zero for max => infinity */ 2900 if (max == 0) max = INT_MAX; 2901 break; 2902 2903 case OP_CRRANGE: 2904 case OP_CRMINRANGE: 2905 case OP_CRPOSRANGE: 2906 minimize = (*ecode == OP_CRMINRANGE); 2907 possessive = (*ecode == OP_CRPOSRANGE); 2908 min = GET2(ecode, 1); 2909 max = GET2(ecode, 1 + IMM2_SIZE); 2910 if (max == 0) max = INT_MAX; 2911 ecode += 1 + 2 * IMM2_SIZE; 2912 break; 2913 2914 default: /* No repeat follows */ 2915 min = max = 1; 2916 break; 2917 } 2918 2919 /* First, ensure the minimum number of matches are present. 
*/ 2920 2921 #ifdef SUPPORT_UTF 2922 if (utf) 2923 { 2924 for (i = 1; i <= min; i++) 2925 { 2926 if (eptr >= md->end_subject) 2927 { 2928 SCHECK_PARTIAL(); 2929 RRETURN(MATCH_NOMATCH); 2930 } 2931 GETCHARINC(c, eptr); 2932 if (c > 255) 2933 { 2934 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2935 } 2936 else 2937 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2938 } 2939 } 2940 else 2941 #endif 2942 /* Not UTF mode */ 2943 { 2944 for (i = 1; i <= min; i++) 2945 { 2946 if (eptr >= md->end_subject) 2947 { 2948 SCHECK_PARTIAL(); 2949 RRETURN(MATCH_NOMATCH); 2950 } 2951 c = *eptr++; 2952 #ifndef COMPILE_PCRE8 2953 if (c > 255) 2954 { 2955 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2956 } 2957 else 2958 #endif 2959 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2960 } 2961 } 2962 2963 /* If max == min we can continue with the main loop without the 2964 need to recurse. */ 2965 2966 if (min == max) continue; 2967 2968 /* If minimizing, keep testing the rest of the expression and advancing 2969 the pointer while it matches the class. 
*/ 2970 2971 if (minimize) 2972 { 2973 #ifdef SUPPORT_UTF 2974 if (utf) 2975 { 2976 for (fi = min;; fi++) 2977 { 2978 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16); 2979 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2980 if (fi >= max) RRETURN(MATCH_NOMATCH); 2981 if (eptr >= md->end_subject) 2982 { 2983 SCHECK_PARTIAL(); 2984 RRETURN(MATCH_NOMATCH); 2985 } 2986 GETCHARINC(c, eptr); 2987 if (c > 255) 2988 { 2989 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2990 } 2991 else 2992 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2993 } 2994 } 2995 else 2996 #endif 2997 /* Not UTF mode */ 2998 { 2999 for (fi = min;; fi++) 3000 { 3001 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17); 3002 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3003 if (fi >= max) RRETURN(MATCH_NOMATCH); 3004 if (eptr >= md->end_subject) 3005 { 3006 SCHECK_PARTIAL(); 3007 RRETURN(MATCH_NOMATCH); 3008 } 3009 c = *eptr++; 3010 #ifndef COMPILE_PCRE8 3011 if (c > 255) 3012 { 3013 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 3014 } 3015 else 3016 #endif 3017 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 3018 } 3019 } 3020 /* Control never gets here */ 3021 } 3022 3023 /* If maximizing, find the longest possible run, then work backwards. 
*/ 3024 3025 else 3026 { 3027 pp = eptr; 3028 3029 #ifdef SUPPORT_UTF 3030 if (utf) 3031 { 3032 for (i = min; i < max; i++) 3033 { 3034 int len = 1; 3035 if (eptr >= md->end_subject) 3036 { 3037 SCHECK_PARTIAL(); 3038 break; 3039 } 3040 GETCHARLEN(c, eptr, len); 3041 if (c > 255) 3042 { 3043 if (op == OP_CLASS) break; 3044 } 3045 else 3046 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3047 eptr += len; 3048 } 3049 3050 if (possessive) continue; /* No backtracking */ 3051 3052 for (;;) 3053 { 3054 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); 3055 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3056 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3057 BACKCHAR(eptr); 3058 } 3059 } 3060 else 3061 #endif 3062 /* Not UTF mode */ 3063 { 3064 for (i = min; i < max; i++) 3065 { 3066 if (eptr >= md->end_subject) 3067 { 3068 SCHECK_PARTIAL(); 3069 break; 3070 } 3071 c = *eptr; 3072 #ifndef COMPILE_PCRE8 3073 if (c > 255) 3074 { 3075 if (op == OP_CLASS) break; 3076 } 3077 else 3078 #endif 3079 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3080 eptr++; 3081 } 3082 3083 if (possessive) continue; /* No backtracking */ 3084 3085 while (eptr >= pp) 3086 { 3087 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); 3088 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3089 eptr--; 3090 } 3091 } 3092 3093 RRETURN(MATCH_NOMATCH); 3094 } 3095 #undef BYTE_MAP 3096 } 3097 /* Control never gets here */ 3098 3099 3100 /* Match an extended character class. In the 8-bit library, this opcode is 3101 encountered only when UTF-8 mode mode is supported. In the 16-bit and 3102 32-bit libraries, codepoints greater than 255 may be encountered even when 3103 UTF is not supported. 
*/ 3104 3105 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3106 case OP_XCLASS: 3107 { 3108 data = ecode + 1 + LINK_SIZE; /* Save for matching */ 3109 ecode += GET(ecode, 1); /* Advance past the item */ 3110 3111 switch (*ecode) 3112 { 3113 case OP_CRSTAR: 3114 case OP_CRMINSTAR: 3115 case OP_CRPLUS: 3116 case OP_CRMINPLUS: 3117 case OP_CRQUERY: 3118 case OP_CRMINQUERY: 3119 case OP_CRPOSSTAR: 3120 case OP_CRPOSPLUS: 3121 case OP_CRPOSQUERY: 3122 c = *ecode++ - OP_CRSTAR; 3123 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 3124 else possessive = TRUE; 3125 min = rep_min[c]; /* Pick up values from tables; */ 3126 max = rep_max[c]; /* zero for max => infinity */ 3127 if (max == 0) max = INT_MAX; 3128 break; 3129 3130 case OP_CRRANGE: 3131 case OP_CRMINRANGE: 3132 case OP_CRPOSRANGE: 3133 minimize = (*ecode == OP_CRMINRANGE); 3134 possessive = (*ecode == OP_CRPOSRANGE); 3135 min = GET2(ecode, 1); 3136 max = GET2(ecode, 1 + IMM2_SIZE); 3137 if (max == 0) max = INT_MAX; 3138 ecode += 1 + 2 * IMM2_SIZE; 3139 break; 3140 3141 default: /* No repeat follows */ 3142 min = max = 1; 3143 break; 3144 } 3145 3146 /* First, ensure the minimum number of matches are present. */ 3147 3148 for (i = 1; i <= min; i++) 3149 { 3150 if (eptr >= md->end_subject) 3151 { 3152 SCHECK_PARTIAL(); 3153 RRETURN(MATCH_NOMATCH); 3154 } 3155 GETCHARINCTEST(c, eptr); 3156 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3157 } 3158 3159 /* If max == min we can continue with the main loop without the 3160 need to recurse. */ 3161 3162 if (min == max) continue; 3163 3164 /* If minimizing, keep testing the rest of the expression and advancing 3165 the pointer while it matches the class. 
*/ 3166 3167 if (minimize) 3168 { 3169 for (fi = min;; fi++) 3170 { 3171 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20); 3172 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3173 if (fi >= max) RRETURN(MATCH_NOMATCH); 3174 if (eptr >= md->end_subject) 3175 { 3176 SCHECK_PARTIAL(); 3177 RRETURN(MATCH_NOMATCH); 3178 } 3179 GETCHARINCTEST(c, eptr); 3180 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3181 } 3182 /* Control never gets here */ 3183 } 3184 3185 /* If maximizing, find the longest possible run, then work backwards. */ 3186 3187 else 3188 { 3189 pp = eptr; 3190 for (i = min; i < max; i++) 3191 { 3192 int len = 1; 3193 if (eptr >= md->end_subject) 3194 { 3195 SCHECK_PARTIAL(); 3196 break; 3197 } 3198 #ifdef SUPPORT_UTF 3199 GETCHARLENTEST(c, eptr, len); 3200 #else 3201 c = *eptr; 3202 #endif 3203 if (!PRIV(xclass)(c, data, utf)) break; 3204 eptr += len; 3205 } 3206 3207 if (possessive) continue; /* No backtracking */ 3208 3209 for(;;) 3210 { 3211 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); 3212 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3213 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3214 #ifdef SUPPORT_UTF 3215 if (utf) BACKCHAR(eptr); 3216 #endif 3217 } 3218 RRETURN(MATCH_NOMATCH); 3219 } 3220 3221 /* Control never gets here */ 3222 } 3223 #endif /* End of XCLASS */ 3224 3225 /* Match a single character, casefully */ 3226 3227 case OP_CHAR: 3228 #ifdef SUPPORT_UTF 3229 if (utf) 3230 { 3231 length = 1; 3232 ecode++; 3233 GETCHARLEN(fc, ecode, length); 3234 if (length > md->end_subject - eptr) 3235 { 3236 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 3237 RRETURN(MATCH_NOMATCH); 3238 } 3239 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); 3240 } 3241 else 3242 #endif 3243 /* Not UTF mode */ 3244 { 3245 if (md->end_subject - eptr < 1) 3246 { 3247 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 3248 RRETURN(MATCH_NOMATCH); 3249 } 3250 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); 3251 ecode += 2; 
3252 } 3253 break; 3254 3255 /* Match a single character, caselessly. If we are at the end of the 3256 subject, give up immediately. */ 3257 3258 case OP_CHARI: 3259 if (eptr >= md->end_subject) 3260 { 3261 SCHECK_PARTIAL(); 3262 RRETURN(MATCH_NOMATCH); 3263 } 3264 3265 #ifdef SUPPORT_UTF 3266 if (utf) 3267 { 3268 length = 1; 3269 ecode++; 3270 GETCHARLEN(fc, ecode, length); 3271 3272 /* If the pattern character's value is < 128, we have only one byte, and 3273 we know that its other case must also be one byte long, so we can use the 3274 fast lookup table. We know that there is at least one byte left in the 3275 subject. */ 3276 3277 if (fc < 128) 3278 { 3279 pcre_uint32 cc = UCHAR21(eptr); 3280 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); 3281 ecode++; 3282 eptr++; 3283 } 3284 3285 /* Otherwise we must pick up the subject character. Note that we cannot 3286 use the value of "length" to check for sufficient bytes left, because the 3287 other case of the character may have more or fewer bytes. */ 3288 3289 else 3290 { 3291 pcre_uint32 dc; 3292 GETCHARINC(dc, eptr); 3293 ecode += length; 3294 3295 /* If we have Unicode property support, we can use it to test the other 3296 case of the character, if there is one. */ 3297 3298 if (fc != dc) 3299 { 3300 #ifdef SUPPORT_UCP 3301 if (dc != UCD_OTHERCASE(fc)) 3302 #endif 3303 RRETURN(MATCH_NOMATCH); 3304 } 3305 } 3306 } 3307 else 3308 #endif /* SUPPORT_UTF */ 3309 3310 /* Not UTF mode */ 3311 { 3312 if (TABLE_GET(ecode[1], md->lcc, ecode[1]) 3313 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); 3314 eptr++; 3315 ecode += 2; 3316 } 3317 break; 3318 3319 /* Match a single character repeatedly. 
*/ 3320 3321 case OP_EXACT: 3322 case OP_EXACTI: 3323 min = max = GET2(ecode, 1); 3324 ecode += 1 + IMM2_SIZE; 3325 goto REPEATCHAR; 3326 3327 case OP_POSUPTO: 3328 case OP_POSUPTOI: 3329 possessive = TRUE; 3330 /* Fall through */ 3331 3332 case OP_UPTO: 3333 case OP_UPTOI: 3334 case OP_MINUPTO: 3335 case OP_MINUPTOI: 3336 min = 0; 3337 max = GET2(ecode, 1); 3338 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI; 3339 ecode += 1 + IMM2_SIZE; 3340 goto REPEATCHAR; 3341 3342 case OP_POSSTAR: 3343 case OP_POSSTARI: 3344 possessive = TRUE; 3345 min = 0; 3346 max = INT_MAX; 3347 ecode++; 3348 goto REPEATCHAR; 3349 3350 case OP_POSPLUS: 3351 case OP_POSPLUSI: 3352 possessive = TRUE; 3353 min = 1; 3354 max = INT_MAX; 3355 ecode++; 3356 goto REPEATCHAR; 3357 3358 case OP_POSQUERY: 3359 case OP_POSQUERYI: 3360 possessive = TRUE; 3361 min = 0; 3362 max = 1; 3363 ecode++; 3364 goto REPEATCHAR; 3365 3366 case OP_STAR: 3367 case OP_STARI: 3368 case OP_MINSTAR: 3369 case OP_MINSTARI: 3370 case OP_PLUS: 3371 case OP_PLUSI: 3372 case OP_MINPLUS: 3373 case OP_MINPLUSI: 3374 case OP_QUERY: 3375 case OP_QUERYI: 3376 case OP_MINQUERY: 3377 case OP_MINQUERYI: 3378 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI); 3379 minimize = (c & 1) != 0; 3380 min = rep_min[c]; /* Pick up values from tables; */ 3381 max = rep_max[c]; /* zero for max => infinity */ 3382 if (max == 0) max = INT_MAX; 3383 3384 /* Common code for all repeated single-character matches. We first check 3385 for the minimum number of characters. If the minimum equals the maximum, we 3386 are done. Otherwise, if minimizing, check the rest of the pattern for a 3387 match; if there isn't one, advance up to the maximum, one character at a 3388 time. 3389 3390 If maximizing, advance up to the maximum number of matching characters, 3391 until eptr is past the end of the maximum run. If possessive, we are 3392 then done (no backing up). 
Otherwise, match at this position; anything 3393 other than no match is immediately returned. For nomatch, back up one 3394 character, unless we are matching \R and the last thing matched was 3395 \r\n, in which case, back up two bytes. When we reach the first optional 3396 character position, we can save stack by doing a tail recurse. 3397 3398 The various UTF/non-UTF and caseful/caseless cases are handled separately, 3399 for speed. */ 3400 3401 REPEATCHAR: 3402 #ifdef SUPPORT_UTF 3403 if (utf) 3404 { 3405 length = 1; 3406 charptr = ecode; 3407 GETCHARLEN(fc, ecode, length); 3408 ecode += length; 3409 3410 /* Handle multibyte character matching specially here. There is 3411 support for caseless matching if UCP support is present. */ 3412 3413 if (length > 1) 3414 { 3415 #ifdef SUPPORT_UCP 3416 pcre_uint32 othercase; 3417 if (op >= OP_STARI && /* Caseless */ 3418 (othercase = UCD_OTHERCASE(fc)) != fc) 3419 oclength = PRIV(ord2utf)(othercase, occhars); 3420 else oclength = 0; 3421 #endif /* SUPPORT_UCP */ 3422 3423 for (i = 1; i <= min; i++) 3424 { 3425 if (eptr <= md->end_subject - length && 3426 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3427 #ifdef SUPPORT_UCP 3428 else if (oclength > 0 && 3429 eptr <= md->end_subject - oclength && 3430 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3431 #endif /* SUPPORT_UCP */ 3432 else 3433 { 3434 CHECK_PARTIAL(); 3435 RRETURN(MATCH_NOMATCH); 3436 } 3437 } 3438 3439 if (min == max) continue; 3440 3441 if (minimize) 3442 { 3443 for (fi = min;; fi++) 3444 { 3445 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22); 3446 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3447 if (fi >= max) RRETURN(MATCH_NOMATCH); 3448 if (eptr <= md->end_subject - length && 3449 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3450 #ifdef SUPPORT_UCP 3451 else if (oclength > 0 && 3452 eptr <= md->end_subject - oclength && 3453 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3454 #endif 
/* SUPPORT_UCP */ 3455 else 3456 { 3457 CHECK_PARTIAL(); 3458 RRETURN(MATCH_NOMATCH); 3459 } 3460 } 3461 /* Control never gets here */ 3462 } 3463 3464 else /* Maximize */ 3465 { 3466 pp = eptr; 3467 for (i = min; i < max; i++) 3468 { 3469 if (eptr <= md->end_subject - length && 3470 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3471 #ifdef SUPPORT_UCP 3472 else if (oclength > 0 && 3473 eptr <= md->end_subject - oclength && 3474 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3475 #endif /* SUPPORT_UCP */ 3476 else 3477 { 3478 CHECK_PARTIAL(); 3479 break; 3480 } 3481 } 3482 3483 if (possessive) continue; /* No backtracking */ 3484 for(;;) 3485 { 3486 if (eptr <= pp) goto TAIL_RECURSE; 3487 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); 3488 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3489 #ifdef SUPPORT_UCP 3490 eptr--; 3491 BACKCHAR(eptr); 3492 #else /* without SUPPORT_UCP */ 3493 eptr -= length; 3494 #endif /* SUPPORT_UCP */ 3495 } 3496 } 3497 /* Control never gets here */ 3498 } 3499 3500 /* If the length of a UTF-8 character is 1, we fall through here, and 3501 obey the code as for non-UTF-8 characters below, though in this case the 3502 value of fc will always be < 128. */ 3503 } 3504 else 3505 #endif /* SUPPORT_UTF */ 3506 /* When not in UTF-8 mode, load a single-byte character. */ 3507 fc = *ecode++; 3508 3509 /* The value of fc at this point is always one character, though we may 3510 or may not be in UTF mode. The code is duplicated for the caseless and 3511 caseful cases, for speed, since matching characters is likely to be quite 3512 common. First, ensure the minimum number of matches are present. If min = 3513 max, continue at the same level without recursing. Otherwise, if 3514 minimizing, keep trying the rest of the expression and advancing one 3515 matching character if failing, up to the maximum. Alternatively, if 3516 maximizing, find the maximum number of characters and work backwards. 
*/ 3517 3518 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3519 max, (char *)eptr)); 3520 3521 if (op >= OP_STARI) /* Caseless */ 3522 { 3523 #ifdef COMPILE_PCRE8 3524 /* fc must be < 128 if UTF is enabled. */ 3525 foc = md->fcc[fc]; 3526 #else 3527 #ifdef SUPPORT_UTF 3528 #ifdef SUPPORT_UCP 3529 if (utf && fc > 127) 3530 foc = UCD_OTHERCASE(fc); 3531 #else 3532 if (utf && fc > 127) 3533 foc = fc; 3534 #endif /* SUPPORT_UCP */ 3535 else 3536 #endif /* SUPPORT_UTF */ 3537 foc = TABLE_GET(fc, md->fcc, fc); 3538 #endif /* COMPILE_PCRE8 */ 3539 3540 for (i = 1; i <= min; i++) 3541 { 3542 pcre_uint32 cc; /* Faster than pcre_uchar */ 3543 if (eptr >= md->end_subject) 3544 { 3545 SCHECK_PARTIAL(); 3546 RRETURN(MATCH_NOMATCH); 3547 } 3548 cc = UCHAR21TEST(eptr); 3549 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3550 eptr++; 3551 } 3552 if (min == max) continue; 3553 if (minimize) 3554 { 3555 for (fi = min;; fi++) 3556 { 3557 pcre_uint32 cc; /* Faster than pcre_uchar */ 3558 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); 3559 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3560 if (fi >= max) RRETURN(MATCH_NOMATCH); 3561 if (eptr >= md->end_subject) 3562 { 3563 SCHECK_PARTIAL(); 3564 RRETURN(MATCH_NOMATCH); 3565 } 3566 cc = UCHAR21TEST(eptr); 3567 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3568 eptr++; 3569 } 3570 /* Control never gets here */ 3571 } 3572 else /* Maximize */ 3573 { 3574 pp = eptr; 3575 for (i = min; i < max; i++) 3576 { 3577 pcre_uint32 cc; /* Faster than pcre_uchar */ 3578 if (eptr >= md->end_subject) 3579 { 3580 SCHECK_PARTIAL(); 3581 break; 3582 } 3583 cc = UCHAR21TEST(eptr); 3584 if (fc != cc && foc != cc) break; 3585 eptr++; 3586 } 3587 if (possessive) continue; /* No backtracking */ 3588 for (;;) 3589 { 3590 if (eptr == pp) goto TAIL_RECURSE; 3591 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); 3592 eptr--; 3593 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3594 } 3595 /* Control never gets here */ 3596 } 3597 } 3598 3599 /* 
Caseful comparisons (includes all multi-byte characters) */ 3600 3601 else 3602 { 3603 for (i = 1; i <= min; i++) 3604 { 3605 if (eptr >= md->end_subject) 3606 { 3607 SCHECK_PARTIAL(); 3608 RRETURN(MATCH_NOMATCH); 3609 } 3610 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3611 } 3612 3613 if (min == max) continue; 3614 3615 if (minimize) 3616 { 3617 for (fi = min;; fi++) 3618 { 3619 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26); 3620 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3621 if (fi >= max) RRETURN(MATCH_NOMATCH); 3622 if (eptr >= md->end_subject) 3623 { 3624 SCHECK_PARTIAL(); 3625 RRETURN(MATCH_NOMATCH); 3626 } 3627 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3628 } 3629 /* Control never gets here */ 3630 } 3631 else /* Maximize */ 3632 { 3633 pp = eptr; 3634 for (i = min; i < max; i++) 3635 { 3636 if (eptr >= md->end_subject) 3637 { 3638 SCHECK_PARTIAL(); 3639 break; 3640 } 3641 if (fc != UCHAR21TEST(eptr)) break; 3642 eptr++; 3643 } 3644 if (possessive) continue; /* No backtracking */ 3645 for (;;) 3646 { 3647 if (eptr == pp) goto TAIL_RECURSE; 3648 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); 3649 eptr--; 3650 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3651 } 3652 /* Control never gets here */ 3653 } 3654 } 3655 /* Control never gets here */ 3656 3657 /* Match a negated single one-byte character. The character we are 3658 checking can be multibyte. 
*/ 3659 3660 case OP_NOT: 3661 case OP_NOTI: 3662 if (eptr >= md->end_subject) 3663 { 3664 SCHECK_PARTIAL(); 3665 RRETURN(MATCH_NOMATCH); 3666 } 3667 #ifdef SUPPORT_UTF 3668 if (utf) 3669 { 3670 register pcre_uint32 ch, och; 3671 3672 ecode++; 3673 GETCHARINC(ch, ecode); 3674 GETCHARINC(c, eptr); 3675 3676 if (op == OP_NOT) 3677 { 3678 if (ch == c) RRETURN(MATCH_NOMATCH); 3679 } 3680 else 3681 { 3682 #ifdef SUPPORT_UCP 3683 if (ch > 127) 3684 och = UCD_OTHERCASE(ch); 3685 #else 3686 if (ch > 127) 3687 och = ch; 3688 #endif /* SUPPORT_UCP */ 3689 else 3690 och = TABLE_GET(ch, md->fcc, ch); 3691 if (ch == c || och == c) RRETURN(MATCH_NOMATCH); 3692 } 3693 } 3694 else 3695 #endif 3696 { 3697 register pcre_uint32 ch = ecode[1]; 3698 c = *eptr++; 3699 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c)) 3700 RRETURN(MATCH_NOMATCH); 3701 ecode += 2; 3702 } 3703 break; 3704 3705 /* Match a negated single one-byte character repeatedly. This is almost a 3706 repeat of the code for a repeated single character, but I haven't found a 3707 nice way of commoning these up that doesn't require a test of the 3708 positive/negative option for each character match. Maybe that wouldn't add 3709 very much to the time taken, but character matching *is* what this is all 3710 about... 
*/ 3711 3712 case OP_NOTEXACT: 3713 case OP_NOTEXACTI: 3714 min = max = GET2(ecode, 1); 3715 ecode += 1 + IMM2_SIZE; 3716 goto REPEATNOTCHAR; 3717 3718 case OP_NOTUPTO: 3719 case OP_NOTUPTOI: 3720 case OP_NOTMINUPTO: 3721 case OP_NOTMINUPTOI: 3722 min = 0; 3723 max = GET2(ecode, 1); 3724 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI; 3725 ecode += 1 + IMM2_SIZE; 3726 goto REPEATNOTCHAR; 3727 3728 case OP_NOTPOSSTAR: 3729 case OP_NOTPOSSTARI: 3730 possessive = TRUE; 3731 min = 0; 3732 max = INT_MAX; 3733 ecode++; 3734 goto REPEATNOTCHAR; 3735 3736 case OP_NOTPOSPLUS: 3737 case OP_NOTPOSPLUSI: 3738 possessive = TRUE; 3739 min = 1; 3740 max = INT_MAX; 3741 ecode++; 3742 goto REPEATNOTCHAR; 3743 3744 case OP_NOTPOSQUERY: 3745 case OP_NOTPOSQUERYI: 3746 possessive = TRUE; 3747 min = 0; 3748 max = 1; 3749 ecode++; 3750 goto REPEATNOTCHAR; 3751 3752 case OP_NOTPOSUPTO: 3753 case OP_NOTPOSUPTOI: 3754 possessive = TRUE; 3755 min = 0; 3756 max = GET2(ecode, 1); 3757 ecode += 1 + IMM2_SIZE; 3758 goto REPEATNOTCHAR; 3759 3760 case OP_NOTSTAR: 3761 case OP_NOTSTARI: 3762 case OP_NOTMINSTAR: 3763 case OP_NOTMINSTARI: 3764 case OP_NOTPLUS: 3765 case OP_NOTPLUSI: 3766 case OP_NOTMINPLUS: 3767 case OP_NOTMINPLUSI: 3768 case OP_NOTQUERY: 3769 case OP_NOTQUERYI: 3770 case OP_NOTMINQUERY: 3771 case OP_NOTMINQUERYI: 3772 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); 3773 minimize = (c & 1) != 0; 3774 min = rep_min[c]; /* Pick up values from tables; */ 3775 max = rep_max[c]; /* zero for max => infinity */ 3776 if (max == 0) max = INT_MAX; 3777 3778 /* Common code for all repeated single-byte matches. */ 3779 3780 REPEATNOTCHAR: 3781 GETCHARINCTEST(fc, ecode); 3782 3783 /* The code is duplicated for the caseless and caseful cases, for speed, 3784 since matching characters is likely to be quite common. First, ensure the 3785 minimum number of matches are present. If min = max, continue at the same 3786 level without recursing. 
Otherwise, if minimizing, keep trying the rest of 3787 the expression and advancing one matching character if failing, up to the 3788 maximum. Alternatively, if maximizing, find the maximum number of 3789 characters and work backwards. */ 3790 3791 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3792 max, (char *)eptr)); 3793 3794 if (op >= OP_NOTSTARI) /* Caseless */ 3795 { 3796 #ifdef SUPPORT_UTF 3797 #ifdef SUPPORT_UCP 3798 if (utf && fc > 127) 3799 foc = UCD_OTHERCASE(fc); 3800 #else 3801 if (utf && fc > 127) 3802 foc = fc; 3803 #endif /* SUPPORT_UCP */ 3804 else 3805 #endif /* SUPPORT_UTF */ 3806 foc = TABLE_GET(fc, md->fcc, fc); 3807 3808 #ifdef SUPPORT_UTF 3809 if (utf) 3810 { 3811 register pcre_uint32 d; 3812 for (i = 1; i <= min; i++) 3813 { 3814 if (eptr >= md->end_subject) 3815 { 3816 SCHECK_PARTIAL(); 3817 RRETURN(MATCH_NOMATCH); 3818 } 3819 GETCHARINC(d, eptr); 3820 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3821 } 3822 } 3823 else 3824 #endif /* SUPPORT_UTF */ 3825 /* Not UTF mode */ 3826 { 3827 for (i = 1; i <= min; i++) 3828 { 3829 if (eptr >= md->end_subject) 3830 { 3831 SCHECK_PARTIAL(); 3832 RRETURN(MATCH_NOMATCH); 3833 } 3834 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3835 eptr++; 3836 } 3837 } 3838 3839 if (min == max) continue; 3840 3841 if (minimize) 3842 { 3843 #ifdef SUPPORT_UTF 3844 if (utf) 3845 { 3846 register pcre_uint32 d; 3847 for (fi = min;; fi++) 3848 { 3849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28); 3850 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3851 if (fi >= max) RRETURN(MATCH_NOMATCH); 3852 if (eptr >= md->end_subject) 3853 { 3854 SCHECK_PARTIAL(); 3855 RRETURN(MATCH_NOMATCH); 3856 } 3857 GETCHARINC(d, eptr); 3858 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3859 } 3860 } 3861 else 3862 #endif /*SUPPORT_UTF */ 3863 /* Not UTF mode */ 3864 { 3865 for (fi = min;; fi++) 3866 { 3867 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29); 3868 if (rrc != 
MATCH_NOMATCH) RRETURN(rrc); 3869 if (fi >= max) RRETURN(MATCH_NOMATCH); 3870 if (eptr >= md->end_subject) 3871 { 3872 SCHECK_PARTIAL(); 3873 RRETURN(MATCH_NOMATCH); 3874 } 3875 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3876 eptr++; 3877 } 3878 } 3879 /* Control never gets here */ 3880 } 3881 3882 /* Maximize case */ 3883 3884 else 3885 { 3886 pp = eptr; 3887 3888 #ifdef SUPPORT_UTF 3889 if (utf) 3890 { 3891 register pcre_uint32 d; 3892 for (i = min; i < max; i++) 3893 { 3894 int len = 1; 3895 if (eptr >= md->end_subject) 3896 { 3897 SCHECK_PARTIAL(); 3898 break; 3899 } 3900 GETCHARLEN(d, eptr, len); 3901 if (fc == d || (unsigned int)foc == d) break; 3902 eptr += len; 3903 } 3904 if (possessive) continue; /* No backtracking */ 3905 for(;;) 3906 { 3907 if (eptr <= pp) goto TAIL_RECURSE; 3908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); 3909 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3910 eptr--; 3911 BACKCHAR(eptr); 3912 } 3913 } 3914 else 3915 #endif /* SUPPORT_UTF */ 3916 /* Not UTF mode */ 3917 { 3918 for (i = min; i < max; i++) 3919 { 3920 if (eptr >= md->end_subject) 3921 { 3922 SCHECK_PARTIAL(); 3923 break; 3924 } 3925 if (fc == *eptr || foc == *eptr) break; 3926 eptr++; 3927 } 3928 if (possessive) continue; /* No backtracking */ 3929 for (;;) 3930 { 3931 if (eptr == pp) goto TAIL_RECURSE; 3932 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); 3933 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3934 eptr--; 3935 } 3936 } 3937 /* Control never gets here */ 3938 } 3939 } 3940 3941 /* Caseful comparisons */ 3942 3943 else 3944 { 3945 #ifdef SUPPORT_UTF 3946 if (utf) 3947 { 3948 register pcre_uint32 d; 3949 for (i = 1; i <= min; i++) 3950 { 3951 if (eptr >= md->end_subject) 3952 { 3953 SCHECK_PARTIAL(); 3954 RRETURN(MATCH_NOMATCH); 3955 } 3956 GETCHARINC(d, eptr); 3957 if (fc == d) RRETURN(MATCH_NOMATCH); 3958 } 3959 } 3960 else 3961 #endif 3962 /* Not UTF mode */ 3963 { 3964 for (i = 1; i <= min; i++) 3965 { 3966 if (eptr >= md->end_subject) 3967 { 3968 
SCHECK_PARTIAL(); 3969 RRETURN(MATCH_NOMATCH); 3970 } 3971 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3972 } 3973 } 3974 3975 if (min == max) continue; 3976 3977 if (minimize) 3978 { 3979 #ifdef SUPPORT_UTF 3980 if (utf) 3981 { 3982 register pcre_uint32 d; 3983 for (fi = min;; fi++) 3984 { 3985 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32); 3986 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3987 if (fi >= max) RRETURN(MATCH_NOMATCH); 3988 if (eptr >= md->end_subject) 3989 { 3990 SCHECK_PARTIAL(); 3991 RRETURN(MATCH_NOMATCH); 3992 } 3993 GETCHARINC(d, eptr); 3994 if (fc == d) RRETURN(MATCH_NOMATCH); 3995 } 3996 } 3997 else 3998 #endif 3999 /* Not UTF mode */ 4000 { 4001 for (fi = min;; fi++) 4002 { 4003 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33); 4004 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4005 if (fi >= max) RRETURN(MATCH_NOMATCH); 4006 if (eptr >= md->end_subject) 4007 { 4008 SCHECK_PARTIAL(); 4009 RRETURN(MATCH_NOMATCH); 4010 } 4011 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 4012 } 4013 } 4014 /* Control never gets here */ 4015 } 4016 4017 /* Maximize case */ 4018 4019 else 4020 { 4021 pp = eptr; 4022 4023 #ifdef SUPPORT_UTF 4024 if (utf) 4025 { 4026 register pcre_uint32 d; 4027 for (i = min; i < max; i++) 4028 { 4029 int len = 1; 4030 if (eptr >= md->end_subject) 4031 { 4032 SCHECK_PARTIAL(); 4033 break; 4034 } 4035 GETCHARLEN(d, eptr, len); 4036 if (fc == d) break; 4037 eptr += len; 4038 } 4039 if (possessive) continue; /* No backtracking */ 4040 for(;;) 4041 { 4042 if (eptr <= pp) goto TAIL_RECURSE; 4043 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); 4044 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4045 eptr--; 4046 BACKCHAR(eptr); 4047 } 4048 } 4049 else 4050 #endif 4051 /* Not UTF mode */ 4052 { 4053 for (i = min; i < max; i++) 4054 { 4055 if (eptr >= md->end_subject) 4056 { 4057 SCHECK_PARTIAL(); 4058 break; 4059 } 4060 if (fc == *eptr) break; 4061 eptr++; 4062 } 4063 if (possessive) continue; /* No backtracking */ 4064 for (;;) 4065 { 4066 if (eptr == 
pp) goto TAIL_RECURSE; 4067 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); 4068 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4069 eptr--; 4070 } 4071 } 4072 /* Control never gets here */ 4073 } 4074 } 4075 /* Control never gets here */ 4076 4077 /* Match a single character type repeatedly; several different opcodes 4078 share code. This is very similar to the code for single characters, but we 4079 repeat it in the interests of efficiency. */ 4080 4081 case OP_TYPEEXACT: 4082 min = max = GET2(ecode, 1); 4083 minimize = TRUE; 4084 ecode += 1 + IMM2_SIZE; 4085 goto REPEATTYPE; 4086 4087 case OP_TYPEUPTO: 4088 case OP_TYPEMINUPTO: 4089 min = 0; 4090 max = GET2(ecode, 1); 4091 minimize = *ecode == OP_TYPEMINUPTO; 4092 ecode += 1 + IMM2_SIZE; 4093 goto REPEATTYPE; 4094 4095 case OP_TYPEPOSSTAR: 4096 possessive = TRUE; 4097 min = 0; 4098 max = INT_MAX; 4099 ecode++; 4100 goto REPEATTYPE; 4101 4102 case OP_TYPEPOSPLUS: 4103 possessive = TRUE; 4104 min = 1; 4105 max = INT_MAX; 4106 ecode++; 4107 goto REPEATTYPE; 4108 4109 case OP_TYPEPOSQUERY: 4110 possessive = TRUE; 4111 min = 0; 4112 max = 1; 4113 ecode++; 4114 goto REPEATTYPE; 4115 4116 case OP_TYPEPOSUPTO: 4117 possessive = TRUE; 4118 min = 0; 4119 max = GET2(ecode, 1); 4120 ecode += 1 + IMM2_SIZE; 4121 goto REPEATTYPE; 4122 4123 case OP_TYPESTAR: 4124 case OP_TYPEMINSTAR: 4125 case OP_TYPEPLUS: 4126 case OP_TYPEMINPLUS: 4127 case OP_TYPEQUERY: 4128 case OP_TYPEMINQUERY: 4129 c = *ecode++ - OP_TYPESTAR; 4130 minimize = (c & 1) != 0; 4131 min = rep_min[c]; /* Pick up values from tables; */ 4132 max = rep_max[c]; /* zero for max => infinity */ 4133 if (max == 0) max = INT_MAX; 4134 4135 /* Common code for all repeated single character type matches. Note that 4136 in UTF-8 mode, '.' matches a character of any length, but for the other 4137 character types, the valid characters are all one-byte long. 
*/ 4138 4139 REPEATTYPE: 4140 ctype = *ecode++; /* Code for the character type */ 4141 4142 #ifdef SUPPORT_UCP 4143 if (ctype == OP_PROP || ctype == OP_NOTPROP) 4144 { 4145 prop_fail_result = ctype == OP_NOTPROP; 4146 prop_type = *ecode++; 4147 prop_value = *ecode++; 4148 } 4149 else prop_type = -1; 4150 #endif 4151 4152 /* First, ensure the minimum number of matches are present. Use inline 4153 code for maximizing the speed, and do the type test once at the start 4154 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that 4155 is tidier. Also separate the UCP code, which can be the same for both UTF-8 4156 and single-bytes. */ 4157 4158 if (min > 0) 4159 { 4160 #ifdef SUPPORT_UCP 4161 if (prop_type >= 0) 4162 { 4163 switch(prop_type) 4164 { 4165 case PT_ANY: 4166 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4167 for (i = 1; i <= min; i++) 4168 { 4169 if (eptr >= md->end_subject) 4170 { 4171 SCHECK_PARTIAL(); 4172 RRETURN(MATCH_NOMATCH); 4173 } 4174 GETCHARINCTEST(c, eptr); 4175 } 4176 break; 4177 4178 case PT_LAMP: 4179 for (i = 1; i <= min; i++) 4180 { 4181 int chartype; 4182 if (eptr >= md->end_subject) 4183 { 4184 SCHECK_PARTIAL(); 4185 RRETURN(MATCH_NOMATCH); 4186 } 4187 GETCHARINCTEST(c, eptr); 4188 chartype = UCD_CHARTYPE(c); 4189 if ((chartype == ucp_Lu || 4190 chartype == ucp_Ll || 4191 chartype == ucp_Lt) == prop_fail_result) 4192 RRETURN(MATCH_NOMATCH); 4193 } 4194 break; 4195 4196 case PT_GC: 4197 for (i = 1; i <= min; i++) 4198 { 4199 if (eptr >= md->end_subject) 4200 { 4201 SCHECK_PARTIAL(); 4202 RRETURN(MATCH_NOMATCH); 4203 } 4204 GETCHARINCTEST(c, eptr); 4205 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4206 RRETURN(MATCH_NOMATCH); 4207 } 4208 break; 4209 4210 case PT_PC: 4211 for (i = 1; i <= min; i++) 4212 { 4213 if (eptr >= md->end_subject) 4214 { 4215 SCHECK_PARTIAL(); 4216 RRETURN(MATCH_NOMATCH); 4217 } 4218 GETCHARINCTEST(c, eptr); 4219 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4220 
RRETURN(MATCH_NOMATCH); 4221 } 4222 break; 4223 4224 case PT_SC: 4225 for (i = 1; i <= min; i++) 4226 { 4227 if (eptr >= md->end_subject) 4228 { 4229 SCHECK_PARTIAL(); 4230 RRETURN(MATCH_NOMATCH); 4231 } 4232 GETCHARINCTEST(c, eptr); 4233 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4234 RRETURN(MATCH_NOMATCH); 4235 } 4236 break; 4237 4238 case PT_ALNUM: 4239 for (i = 1; i <= min; i++) 4240 { 4241 int category; 4242 if (eptr >= md->end_subject) 4243 { 4244 SCHECK_PARTIAL(); 4245 RRETURN(MATCH_NOMATCH); 4246 } 4247 GETCHARINCTEST(c, eptr); 4248 category = UCD_CATEGORY(c); 4249 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4250 RRETURN(MATCH_NOMATCH); 4251 } 4252 break; 4253 4254 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 4255 which means that Perl space and POSIX space are now identical. PCRE 4256 was changed at release 8.34. */ 4257 4258 case PT_SPACE: /* Perl space */ 4259 case PT_PXSPACE: /* POSIX space */ 4260 for (i = 1; i <= min; i++) 4261 { 4262 if (eptr >= md->end_subject) 4263 { 4264 SCHECK_PARTIAL(); 4265 RRETURN(MATCH_NOMATCH); 4266 } 4267 GETCHARINCTEST(c, eptr); 4268 switch(c) 4269 { 4270 HSPACE_CASES: 4271 VSPACE_CASES: 4272 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4273 break; 4274 4275 default: 4276 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 4277 RRETURN(MATCH_NOMATCH); 4278 break; 4279 } 4280 } 4281 break; 4282 4283 case PT_WORD: 4284 for (i = 1; i <= min; i++) 4285 { 4286 int category; 4287 if (eptr >= md->end_subject) 4288 { 4289 SCHECK_PARTIAL(); 4290 RRETURN(MATCH_NOMATCH); 4291 } 4292 GETCHARINCTEST(c, eptr); 4293 category = UCD_CATEGORY(c); 4294 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) 4295 == prop_fail_result) 4296 RRETURN(MATCH_NOMATCH); 4297 } 4298 break; 4299 4300 case PT_CLIST: 4301 for (i = 1; i <= min; i++) 4302 { 4303 const pcre_uint32 *cp; 4304 if (eptr >= md->end_subject) 4305 { 4306 SCHECK_PARTIAL(); 4307 RRETURN(MATCH_NOMATCH); 4308 } 
4309 GETCHARINCTEST(c, eptr); 4310 cp = PRIV(ucd_caseless_sets) + prop_value; 4311 for (;;) 4312 { 4313 if (c < *cp) 4314 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 4315 if (c == *cp++) 4316 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 4317 } 4318 } 4319 break; 4320 4321 case PT_UCNC: 4322 for (i = 1; i <= min; i++) 4323 { 4324 if (eptr >= md->end_subject) 4325 { 4326 SCHECK_PARTIAL(); 4327 RRETURN(MATCH_NOMATCH); 4328 } 4329 GETCHARINCTEST(c, eptr); 4330 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 4331 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 4332 c >= 0xe000) == prop_fail_result) 4333 RRETURN(MATCH_NOMATCH); 4334 } 4335 break; 4336 4337 /* This should not occur */ 4338 4339 default: 4340 RRETURN(PCRE_ERROR_INTERNAL); 4341 } 4342 } 4343 4344 /* Match extended Unicode sequences. We will get here only if the 4345 support is in the binary; otherwise a compile-time error occurs. */ 4346 4347 else if (ctype == OP_EXTUNI) 4348 { 4349 for (i = 1; i <= min; i++) 4350 { 4351 if (eptr >= md->end_subject) 4352 { 4353 SCHECK_PARTIAL(); 4354 RRETURN(MATCH_NOMATCH); 4355 } 4356 else 4357 { 4358 int lgb, rgb; 4359 GETCHARINCTEST(c, eptr); 4360 lgb = UCD_GRAPHBREAK(c); 4361 while (eptr < md->end_subject) 4362 { 4363 int len = 1; 4364 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 4365 rgb = UCD_GRAPHBREAK(c); 4366 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 4367 lgb = rgb; 4368 eptr += len; 4369 } 4370 } 4371 CHECK_PARTIAL(); 4372 } 4373 } 4374 4375 else 4376 #endif /* SUPPORT_UCP */ 4377 4378 /* Handle all other cases when the coding is UTF-8 */ 4379 4380 #ifdef SUPPORT_UTF 4381 if (utf) switch(ctype) 4382 { 4383 case OP_ANY: 4384 for (i = 1; i <= min; i++) 4385 { 4386 if (eptr >= md->end_subject) 4387 { 4388 SCHECK_PARTIAL(); 4389 RRETURN(MATCH_NOMATCH); 4390 } 4391 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4392 if (md->partial != 0 && 4393 eptr + 1 >= md->end_subject && 4394 
NLBLOCK->nltype == NLTYPE_FIXED && 4395 NLBLOCK->nllen == 2 && 4396 UCHAR21(eptr) == NLBLOCK->nl[0]) 4397 { 4398 md->hitend = TRUE; 4399 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4400 } 4401 eptr++; 4402 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4403 } 4404 break; 4405 4406 case OP_ALLANY: 4407 for (i = 1; i <= min; i++) 4408 { 4409 if (eptr >= md->end_subject) 4410 { 4411 SCHECK_PARTIAL(); 4412 RRETURN(MATCH_NOMATCH); 4413 } 4414 eptr++; 4415 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4416 } 4417 break; 4418 4419 case OP_ANYBYTE: 4420 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); 4421 eptr += min; 4422 break; 4423 4424 case OP_ANYNL: 4425 for (i = 1; i <= min; i++) 4426 { 4427 if (eptr >= md->end_subject) 4428 { 4429 SCHECK_PARTIAL(); 4430 RRETURN(MATCH_NOMATCH); 4431 } 4432 GETCHARINC(c, eptr); 4433 switch(c) 4434 { 4435 default: RRETURN(MATCH_NOMATCH); 4436 4437 case CHAR_CR: 4438 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; 4439 break; 4440 4441 case CHAR_LF: 4442 break; 4443 4444 case CHAR_VT: 4445 case CHAR_FF: 4446 case CHAR_NEL: 4447 #ifndef EBCDIC 4448 case 0x2028: 4449 case 0x2029: 4450 #endif /* Not EBCDIC */ 4451 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4452 break; 4453 } 4454 } 4455 break; 4456 4457 case OP_NOT_HSPACE: 4458 for (i = 1; i <= min; i++) 4459 { 4460 if (eptr >= md->end_subject) 4461 { 4462 SCHECK_PARTIAL(); 4463 RRETURN(MATCH_NOMATCH); 4464 } 4465 GETCHARINC(c, eptr); 4466 switch(c) 4467 { 4468 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 4469 default: break; 4470 } 4471 } 4472 break; 4473 4474 case OP_HSPACE: 4475 for (i = 1; i <= min; i++) 4476 { 4477 if (eptr >= md->end_subject) 4478 { 4479 SCHECK_PARTIAL(); 4480 RRETURN(MATCH_NOMATCH); 4481 } 4482 GETCHARINC(c, eptr); 4483 switch(c) 4484 { 4485 HSPACE_CASES: break; /* Byte and multibyte cases */ 4486 default: RRETURN(MATCH_NOMATCH); 4487 } 4488 } 4489 break; 4490 4491 case OP_NOT_VSPACE: 4492 for (i = 
1; i <= min; i++) 4493 { 4494 if (eptr >= md->end_subject) 4495 { 4496 SCHECK_PARTIAL(); 4497 RRETURN(MATCH_NOMATCH); 4498 } 4499 GETCHARINC(c, eptr); 4500 switch(c) 4501 { 4502 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 4503 default: break; 4504 } 4505 } 4506 break; 4507 4508 case OP_VSPACE: 4509 for (i = 1; i <= min; i++) 4510 { 4511 if (eptr >= md->end_subject) 4512 { 4513 SCHECK_PARTIAL(); 4514 RRETURN(MATCH_NOMATCH); 4515 } 4516 GETCHARINC(c, eptr); 4517 switch(c) 4518 { 4519 VSPACE_CASES: break; 4520 default: RRETURN(MATCH_NOMATCH); 4521 } 4522 } 4523 break; 4524 4525 case OP_NOT_DIGIT: 4526 for (i = 1; i <= min; i++) 4527 { 4528 if (eptr >= md->end_subject) 4529 { 4530 SCHECK_PARTIAL(); 4531 RRETURN(MATCH_NOMATCH); 4532 } 4533 GETCHARINC(c, eptr); 4534 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) 4535 RRETURN(MATCH_NOMATCH); 4536 } 4537 break; 4538 4539 case OP_DIGIT: 4540 for (i = 1; i <= min; i++) 4541 { 4542 pcre_uint32 cc; 4543 if (eptr >= md->end_subject) 4544 { 4545 SCHECK_PARTIAL(); 4546 RRETURN(MATCH_NOMATCH); 4547 } 4548 cc = UCHAR21(eptr); 4549 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0) 4550 RRETURN(MATCH_NOMATCH); 4551 eptr++; 4552 /* No need to skip more bytes - we know it's a 1-byte character */ 4553 } 4554 break; 4555 4556 case OP_NOT_WHITESPACE: 4557 for (i = 1; i <= min; i++) 4558 { 4559 pcre_uint32 cc; 4560 if (eptr >= md->end_subject) 4561 { 4562 SCHECK_PARTIAL(); 4563 RRETURN(MATCH_NOMATCH); 4564 } 4565 cc = UCHAR21(eptr); 4566 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0) 4567 RRETURN(MATCH_NOMATCH); 4568 eptr++; 4569 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4570 } 4571 break; 4572 4573 case OP_WHITESPACE: 4574 for (i = 1; i <= min; i++) 4575 { 4576 pcre_uint32 cc; 4577 if (eptr >= md->end_subject) 4578 { 4579 SCHECK_PARTIAL(); 4580 RRETURN(MATCH_NOMATCH); 4581 } 4582 cc = UCHAR21(eptr); 4583 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0) 4584 RRETURN(MATCH_NOMATCH); 4585 eptr++; 4586 /* No need to skip 
more bytes - we know it's a 1-byte character */ 4587 } 4588 break; 4589 4590 case OP_NOT_WORDCHAR: 4591 for (i = 1; i <= min; i++) 4592 { 4593 pcre_uint32 cc; 4594 if (eptr >= md->end_subject) 4595 { 4596 SCHECK_PARTIAL(); 4597 RRETURN(MATCH_NOMATCH); 4598 } 4599 cc = UCHAR21(eptr); 4600 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0) 4601 RRETURN(MATCH_NOMATCH); 4602 eptr++; 4603 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4604 } 4605 break; 4606 4607 case OP_WORDCHAR: 4608 for (i = 1; i <= min; i++) 4609 { 4610 pcre_uint32 cc; 4611 if (eptr >= md->end_subject) 4612 { 4613 SCHECK_PARTIAL(); 4614 RRETURN(MATCH_NOMATCH); 4615 } 4616 cc = UCHAR21(eptr); 4617 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0) 4618 RRETURN(MATCH_NOMATCH); 4619 eptr++; 4620 /* No need to skip more bytes - we know it's a 1-byte character */ 4621 } 4622 break; 4623 4624 default: 4625 RRETURN(PCRE_ERROR_INTERNAL); 4626 } /* End switch(ctype) */ 4627 4628 else 4629 #endif /* SUPPORT_UTF */ 4630 4631 /* Code for the non-UTF-8 case for minimum matching of operators other 4632 than OP_PROP and OP_NOTPROP. 
*/ 4633 4634 switch(ctype) 4635 { 4636 case OP_ANY: 4637 for (i = 1; i <= min; i++) 4638 { 4639 if (eptr >= md->end_subject) 4640 { 4641 SCHECK_PARTIAL(); 4642 RRETURN(MATCH_NOMATCH); 4643 } 4644 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4645 if (md->partial != 0 && 4646 eptr + 1 >= md->end_subject && 4647 NLBLOCK->nltype == NLTYPE_FIXED && 4648 NLBLOCK->nllen == 2 && 4649 *eptr == NLBLOCK->nl[0]) 4650 { 4651 md->hitend = TRUE; 4652 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4653 } 4654 eptr++; 4655 } 4656 break; 4657 4658 case OP_ALLANY: 4659 if (eptr > md->end_subject - min) 4660 { 4661 SCHECK_PARTIAL(); 4662 RRETURN(MATCH_NOMATCH); 4663 } 4664 eptr += min; 4665 break; 4666 4667 case OP_ANYBYTE: 4668 if (eptr > md->end_subject - min) 4669 { 4670 SCHECK_PARTIAL(); 4671 RRETURN(MATCH_NOMATCH); 4672 } 4673 eptr += min; 4674 break; 4675 4676 case OP_ANYNL: 4677 for (i = 1; i <= min; i++) 4678 { 4679 if (eptr >= md->end_subject) 4680 { 4681 SCHECK_PARTIAL(); 4682 RRETURN(MATCH_NOMATCH); 4683 } 4684 switch(*eptr++) 4685 { 4686 default: RRETURN(MATCH_NOMATCH); 4687 4688 case CHAR_CR: 4689 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; 4690 break; 4691 4692 case CHAR_LF: 4693 break; 4694 4695 case CHAR_VT: 4696 case CHAR_FF: 4697 case CHAR_NEL: 4698 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4699 case 0x2028: 4700 case 0x2029: 4701 #endif 4702 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4703 break; 4704 } 4705 } 4706 break; 4707 4708 case OP_NOT_HSPACE: 4709 for (i = 1; i <= min; i++) 4710 { 4711 if (eptr >= md->end_subject) 4712 { 4713 SCHECK_PARTIAL(); 4714 RRETURN(MATCH_NOMATCH); 4715 } 4716 switch(*eptr++) 4717 { 4718 default: break; 4719 HSPACE_BYTE_CASES: 4720 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4721 HSPACE_MULTIBYTE_CASES: 4722 #endif 4723 RRETURN(MATCH_NOMATCH); 4724 } 4725 } 4726 break; 4727 4728 case OP_HSPACE: 4729 for (i = 1; i <= min; i++) 4730 { 4731 if (eptr >= md->end_subject) 4732 { 4733 SCHECK_PARTIAL(); 
4734 RRETURN(MATCH_NOMATCH); 4735 } 4736 switch(*eptr++) 4737 { 4738 default: RRETURN(MATCH_NOMATCH); 4739 HSPACE_BYTE_CASES: 4740 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4741 HSPACE_MULTIBYTE_CASES: 4742 #endif 4743 break; 4744 } 4745 } 4746 break; 4747 4748 case OP_NOT_VSPACE: 4749 for (i = 1; i <= min; i++) 4750 { 4751 if (eptr >= md->end_subject) 4752 { 4753 SCHECK_PARTIAL(); 4754 RRETURN(MATCH_NOMATCH); 4755 } 4756 switch(*eptr++) 4757 { 4758 VSPACE_BYTE_CASES: 4759 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4760 VSPACE_MULTIBYTE_CASES: 4761 #endif 4762 RRETURN(MATCH_NOMATCH); 4763 default: break; 4764 } 4765 } 4766 break; 4767 4768 case OP_VSPACE: 4769 for (i = 1; i <= min; i++) 4770 { 4771 if (eptr >= md->end_subject) 4772 { 4773 SCHECK_PARTIAL(); 4774 RRETURN(MATCH_NOMATCH); 4775 } 4776 switch(*eptr++) 4777 { 4778 default: RRETURN(MATCH_NOMATCH); 4779 VSPACE_BYTE_CASES: 4780 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4781 VSPACE_MULTIBYTE_CASES: 4782 #endif 4783 break; 4784 } 4785 } 4786 break; 4787 4788 case OP_NOT_DIGIT: 4789 for (i = 1; i <= min; i++) 4790 { 4791 if (eptr >= md->end_subject) 4792 { 4793 SCHECK_PARTIAL(); 4794 RRETURN(MATCH_NOMATCH); 4795 } 4796 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) 4797 RRETURN(MATCH_NOMATCH); 4798 eptr++; 4799 } 4800 break; 4801 4802 case OP_DIGIT: 4803 for (i = 1; i <= min; i++) 4804 { 4805 if (eptr >= md->end_subject) 4806 { 4807 SCHECK_PARTIAL(); 4808 RRETURN(MATCH_NOMATCH); 4809 } 4810 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) 4811 RRETURN(MATCH_NOMATCH); 4812 eptr++; 4813 } 4814 break; 4815 4816 case OP_NOT_WHITESPACE: 4817 for (i = 1; i <= min; i++) 4818 { 4819 if (eptr >= md->end_subject) 4820 { 4821 SCHECK_PARTIAL(); 4822 RRETURN(MATCH_NOMATCH); 4823 } 4824 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) 4825 RRETURN(MATCH_NOMATCH); 4826 eptr++; 4827 } 4828 break; 4829 4830 case OP_WHITESPACE: 4831 for (i = 1; i <= min; 
i++) 4832 { 4833 if (eptr >= md->end_subject) 4834 { 4835 SCHECK_PARTIAL(); 4836 RRETURN(MATCH_NOMATCH); 4837 } 4838 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) 4839 RRETURN(MATCH_NOMATCH); 4840 eptr++; 4841 } 4842 break; 4843 4844 case OP_NOT_WORDCHAR: 4845 for (i = 1; i <= min; i++) 4846 { 4847 if (eptr >= md->end_subject) 4848 { 4849 SCHECK_PARTIAL(); 4850 RRETURN(MATCH_NOMATCH); 4851 } 4852 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) 4853 RRETURN(MATCH_NOMATCH); 4854 eptr++; 4855 } 4856 break; 4857 4858 case OP_WORDCHAR: 4859 for (i = 1; i <= min; i++) 4860 { 4861 if (eptr >= md->end_subject) 4862 { 4863 SCHECK_PARTIAL(); 4864 RRETURN(MATCH_NOMATCH); 4865 } 4866 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) 4867 RRETURN(MATCH_NOMATCH); 4868 eptr++; 4869 } 4870 break; 4871 4872 default: 4873 RRETURN(PCRE_ERROR_INTERNAL); 4874 } 4875 } 4876 4877 /* If min = max, continue at the same level without recursing */ 4878 4879 if (min == max) continue; 4880 4881 /* If minimizing, we have to test the rest of the pattern before each 4882 subsequent match. Again, separate the UTF-8 case for speed, and also 4883 separate the UCP cases. 
*/ 4884 4885 if (minimize) 4886 { 4887 #ifdef SUPPORT_UCP 4888 if (prop_type >= 0) 4889 { 4890 switch(prop_type) 4891 { 4892 case PT_ANY: 4893 for (fi = min;; fi++) 4894 { 4895 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36); 4896 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4897 if (fi >= max) RRETURN(MATCH_NOMATCH); 4898 if (eptr >= md->end_subject) 4899 { 4900 SCHECK_PARTIAL(); 4901 RRETURN(MATCH_NOMATCH); 4902 } 4903 GETCHARINCTEST(c, eptr); 4904 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4905 } 4906 /* Control never gets here */ 4907 4908 case PT_LAMP: 4909 for (fi = min;; fi++) 4910 { 4911 int chartype; 4912 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37); 4913 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4914 if (fi >= max) RRETURN(MATCH_NOMATCH); 4915 if (eptr >= md->end_subject) 4916 { 4917 SCHECK_PARTIAL(); 4918 RRETURN(MATCH_NOMATCH); 4919 } 4920 GETCHARINCTEST(c, eptr); 4921 chartype = UCD_CHARTYPE(c); 4922 if ((chartype == ucp_Lu || 4923 chartype == ucp_Ll || 4924 chartype == ucp_Lt) == prop_fail_result) 4925 RRETURN(MATCH_NOMATCH); 4926 } 4927 /* Control never gets here */ 4928 4929 case PT_GC: 4930 for (fi = min;; fi++) 4931 { 4932 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38); 4933 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4934 if (fi >= max) RRETURN(MATCH_NOMATCH); 4935 if (eptr >= md->end_subject) 4936 { 4937 SCHECK_PARTIAL(); 4938 RRETURN(MATCH_NOMATCH); 4939 } 4940 GETCHARINCTEST(c, eptr); 4941 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4942 RRETURN(MATCH_NOMATCH); 4943 } 4944 /* Control never gets here */ 4945 4946 case PT_PC: 4947 for (fi = min;; fi++) 4948 { 4949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39); 4950 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4951 if (fi >= max) RRETURN(MATCH_NOMATCH); 4952 if (eptr >= md->end_subject) 4953 { 4954 SCHECK_PARTIAL(); 4955 RRETURN(MATCH_NOMATCH); 4956 } 4957 GETCHARINCTEST(c, eptr); 4958 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4959 RRETURN(MATCH_NOMATCH); 4960 } 4961 /* 
Control never gets here */ 4962 4963 case PT_SC: 4964 for (fi = min;; fi++) 4965 { 4966 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40); 4967 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4968 if (fi >= max) RRETURN(MATCH_NOMATCH); 4969 if (eptr >= md->end_subject) 4970 { 4971 SCHECK_PARTIAL(); 4972 RRETURN(MATCH_NOMATCH); 4973 } 4974 GETCHARINCTEST(c, eptr); 4975 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4976 RRETURN(MATCH_NOMATCH); 4977 } 4978 /* Control never gets here */ 4979 4980 case PT_ALNUM: 4981 for (fi = min;; fi++) 4982 { 4983 int category; 4984 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59); 4985 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4986 if (fi >= max) RRETURN(MATCH_NOMATCH); 4987 if (eptr >= md->end_subject) 4988 { 4989 SCHECK_PARTIAL(); 4990 RRETURN(MATCH_NOMATCH); 4991 } 4992 GETCHARINCTEST(c, eptr); 4993 category = UCD_CATEGORY(c); 4994 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4995 RRETURN(MATCH_NOMATCH); 4996 } 4997 /* Control never gets here */ 4998 4999 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 5000 which means that Perl space and POSIX space are now identical. PCRE 5001 was changed at release 8.34. 
*/ 5002 5003 case PT_SPACE: /* Perl space */ 5004 case PT_PXSPACE: /* POSIX space */ 5005 for (fi = min;; fi++) 5006 { 5007 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61); 5008 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5009 if (fi >= max) RRETURN(MATCH_NOMATCH); 5010 if (eptr >= md->end_subject) 5011 { 5012 SCHECK_PARTIAL(); 5013 RRETURN(MATCH_NOMATCH); 5014 } 5015 GETCHARINCTEST(c, eptr); 5016 switch(c) 5017 { 5018 HSPACE_CASES: 5019 VSPACE_CASES: 5020 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 5021 break; 5022 5023 default: 5024 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5025 RRETURN(MATCH_NOMATCH); 5026 break; 5027 } 5028 } 5029 /* Control never gets here */ 5030 5031 case PT_WORD: 5032 for (fi = min;; fi++) 5033 { 5034 int category; 5035 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); 5036 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5037 if (fi >= max) RRETURN(MATCH_NOMATCH); 5038 if (eptr >= md->end_subject) 5039 { 5040 SCHECK_PARTIAL(); 5041 RRETURN(MATCH_NOMATCH); 5042 } 5043 GETCHARINCTEST(c, eptr); 5044 category = UCD_CATEGORY(c); 5045 if ((category == ucp_L || 5046 category == ucp_N || 5047 c == CHAR_UNDERSCORE) 5048 == prop_fail_result) 5049 RRETURN(MATCH_NOMATCH); 5050 } 5051 /* Control never gets here */ 5052 5053 case PT_CLIST: 5054 for (fi = min;; fi++) 5055 { 5056 const pcre_uint32 *cp; 5057 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67); 5058 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5059 if (fi >= max) RRETURN(MATCH_NOMATCH); 5060 if (eptr >= md->end_subject) 5061 { 5062 SCHECK_PARTIAL(); 5063 RRETURN(MATCH_NOMATCH); 5064 } 5065 GETCHARINCTEST(c, eptr); 5066 cp = PRIV(ucd_caseless_sets) + prop_value; 5067 for (;;) 5068 { 5069 if (c < *cp) 5070 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 5071 if (c == *cp++) 5072 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 5073 } 5074 } 5075 /* Control never gets here */ 5076 5077 case PT_UCNC: 5078 for (fi = min;; fi++) 5079 { 5080 RMATCH(eptr, ecode, 
offset_top, md, eptrb, RM60); 5081 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5082 if (fi >= max) RRETURN(MATCH_NOMATCH); 5083 if (eptr >= md->end_subject) 5084 { 5085 SCHECK_PARTIAL(); 5086 RRETURN(MATCH_NOMATCH); 5087 } 5088 GETCHARINCTEST(c, eptr); 5089 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5090 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5091 c >= 0xe000) == prop_fail_result) 5092 RRETURN(MATCH_NOMATCH); 5093 } 5094 /* Control never gets here */ 5095 5096 /* This should never occur */ 5097 default: 5098 RRETURN(PCRE_ERROR_INTERNAL); 5099 } 5100 } 5101 5102 /* Match extended Unicode sequences. We will get here only if the 5103 support is in the binary; otherwise a compile-time error occurs. */ 5104 5105 else if (ctype == OP_EXTUNI) 5106 { 5107 for (fi = min;; fi++) 5108 { 5109 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41); 5110 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5111 if (fi >= max) RRETURN(MATCH_NOMATCH); 5112 if (eptr >= md->end_subject) 5113 { 5114 SCHECK_PARTIAL(); 5115 RRETURN(MATCH_NOMATCH); 5116 } 5117 else 5118 { 5119 int lgb, rgb; 5120 GETCHARINCTEST(c, eptr); 5121 lgb = UCD_GRAPHBREAK(c); 5122 while (eptr < md->end_subject) 5123 { 5124 int len = 1; 5125 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5126 rgb = UCD_GRAPHBREAK(c); 5127 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5128 lgb = rgb; 5129 eptr += len; 5130 } 5131 } 5132 CHECK_PARTIAL(); 5133 } 5134 } 5135 else 5136 #endif /* SUPPORT_UCP */ 5137 5138 #ifdef SUPPORT_UTF 5139 if (utf) 5140 { 5141 for (fi = min;; fi++) 5142 { 5143 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42); 5144 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5145 if (fi >= max) RRETURN(MATCH_NOMATCH); 5146 if (eptr >= md->end_subject) 5147 { 5148 SCHECK_PARTIAL(); 5149 RRETURN(MATCH_NOMATCH); 5150 } 5151 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5152 RRETURN(MATCH_NOMATCH); 5153 GETCHARINC(c, eptr); 5154 switch(ctype) 5155 { 5156 case OP_ANY: /* This is the non-NL case */ 5157 
if (md->partial != 0 && /* Take care with CRLF partial */ 5158 eptr >= md->end_subject && 5159 NLBLOCK->nltype == NLTYPE_FIXED && 5160 NLBLOCK->nllen == 2 && 5161 c == NLBLOCK->nl[0]) 5162 { 5163 md->hitend = TRUE; 5164 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5165 } 5166 break; 5167 5168 case OP_ALLANY: 5169 case OP_ANYBYTE: 5170 break; 5171 5172 case OP_ANYNL: 5173 switch(c) 5174 { 5175 default: RRETURN(MATCH_NOMATCH); 5176 case CHAR_CR: 5177 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; 5178 break; 5179 5180 case CHAR_LF: 5181 break; 5182 5183 case CHAR_VT: 5184 case CHAR_FF: 5185 case CHAR_NEL: 5186 #ifndef EBCDIC 5187 case 0x2028: 5188 case 0x2029: 5189 #endif /* Not EBCDIC */ 5190 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5191 break; 5192 } 5193 break; 5194 5195 case OP_NOT_HSPACE: 5196 switch(c) 5197 { 5198 HSPACE_CASES: RRETURN(MATCH_NOMATCH); 5199 default: break; 5200 } 5201 break; 5202 5203 case OP_HSPACE: 5204 switch(c) 5205 { 5206 HSPACE_CASES: break; 5207 default: RRETURN(MATCH_NOMATCH); 5208 } 5209 break; 5210 5211 case OP_NOT_VSPACE: 5212 switch(c) 5213 { 5214 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 5215 default: break; 5216 } 5217 break; 5218 5219 case OP_VSPACE: 5220 switch(c) 5221 { 5222 VSPACE_CASES: break; 5223 default: RRETURN(MATCH_NOMATCH); 5224 } 5225 break; 5226 5227 case OP_NOT_DIGIT: 5228 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) 5229 RRETURN(MATCH_NOMATCH); 5230 break; 5231 5232 case OP_DIGIT: 5233 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) 5234 RRETURN(MATCH_NOMATCH); 5235 break; 5236 5237 case OP_NOT_WHITESPACE: 5238 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 5239 RRETURN(MATCH_NOMATCH); 5240 break; 5241 5242 case OP_WHITESPACE: 5243 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) 5244 RRETURN(MATCH_NOMATCH); 5245 break; 5246 5247 case OP_NOT_WORDCHAR: 5248 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) 5249 RRETURN(MATCH_NOMATCH); 5250 break; 5251 5252 case OP_WORDCHAR: 5253 
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) 5254 RRETURN(MATCH_NOMATCH); 5255 break; 5256 5257 default: 5258 RRETURN(PCRE_ERROR_INTERNAL); 5259 } 5260 } 5261 } 5262 else 5263 #endif 5264 /* Not UTF mode */ 5265 { 5266 for (fi = min;; fi++) 5267 { 5268 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43); 5269 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5270 if (fi >= max) RRETURN(MATCH_NOMATCH); 5271 if (eptr >= md->end_subject) 5272 { 5273 SCHECK_PARTIAL(); 5274 RRETURN(MATCH_NOMATCH); 5275 } 5276 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5277 RRETURN(MATCH_NOMATCH); 5278 c = *eptr++; 5279 switch(ctype) 5280 { 5281 case OP_ANY: /* This is the non-NL case */ 5282 if (md->partial != 0 && /* Take care with CRLF partial */ 5283 eptr >= md->end_subject && 5284 NLBLOCK->nltype == NLTYPE_FIXED && 5285 NLBLOCK->nllen == 2 && 5286 c == NLBLOCK->nl[0]) 5287 { 5288 md->hitend = TRUE; 5289 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5290 } 5291 break; 5292 5293 case OP_ALLANY: 5294 case OP_ANYBYTE: 5295 break; 5296 5297 case OP_ANYNL: 5298 switch(c) 5299 { 5300 default: RRETURN(MATCH_NOMATCH); 5301 case CHAR_CR: 5302 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; 5303 break; 5304 5305 case CHAR_LF: 5306 break; 5307 5308 case CHAR_VT: 5309 case CHAR_FF: 5310 case CHAR_NEL: 5311 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5312 case 0x2028: 5313 case 0x2029: 5314 #endif 5315 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5316 break; 5317 } 5318 break; 5319 5320 case OP_NOT_HSPACE: 5321 switch(c) 5322 { 5323 default: break; 5324 HSPACE_BYTE_CASES: 5325 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5326 HSPACE_MULTIBYTE_CASES: 5327 #endif 5328 RRETURN(MATCH_NOMATCH); 5329 } 5330 break; 5331 5332 case OP_HSPACE: 5333 switch(c) 5334 { 5335 default: RRETURN(MATCH_NOMATCH); 5336 HSPACE_BYTE_CASES: 5337 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5338 HSPACE_MULTIBYTE_CASES: 5339 #endif 5340 break; 5341 } 5342 break; 5343 5344 case OP_NOT_VSPACE: 
5345 switch(c) 5346 { 5347 default: break; 5348 VSPACE_BYTE_CASES: 5349 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5350 VSPACE_MULTIBYTE_CASES: 5351 #endif 5352 RRETURN(MATCH_NOMATCH); 5353 } 5354 break; 5355 5356 case OP_VSPACE: 5357 switch(c) 5358 { 5359 default: RRETURN(MATCH_NOMATCH); 5360 VSPACE_BYTE_CASES: 5361 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5362 VSPACE_MULTIBYTE_CASES: 5363 #endif 5364 break; 5365 } 5366 break; 5367 5368 case OP_NOT_DIGIT: 5369 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 5370 break; 5371 5372 case OP_DIGIT: 5373 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 5374 break; 5375 5376 case OP_NOT_WHITESPACE: 5377 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 5378 break; 5379 5380 case OP_WHITESPACE: 5381 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 5382 break; 5383 5384 case OP_NOT_WORDCHAR: 5385 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 5386 break; 5387 5388 case OP_WORDCHAR: 5389 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 5390 break; 5391 5392 default: 5393 RRETURN(PCRE_ERROR_INTERNAL); 5394 } 5395 } 5396 } 5397 /* Control never gets here */ 5398 } 5399 5400 /* If maximizing, it is worth using inline code for speed, doing the type 5401 test once at the start (i.e. keep it out of the loop). Again, keep the 5402 UTF-8 and UCP stuff separate. 
*/ 5403 5404 else 5405 { 5406 pp = eptr; /* Remember where we started */ 5407 5408 #ifdef SUPPORT_UCP 5409 if (prop_type >= 0) 5410 { 5411 switch(prop_type) 5412 { 5413 case PT_ANY: 5414 for (i = min; i < max; i++) 5415 { 5416 int len = 1; 5417 if (eptr >= md->end_subject) 5418 { 5419 SCHECK_PARTIAL(); 5420 break; 5421 } 5422 GETCHARLENTEST(c, eptr, len); 5423 if (prop_fail_result) break; 5424 eptr+= len; 5425 } 5426 break; 5427 5428 case PT_LAMP: 5429 for (i = min; i < max; i++) 5430 { 5431 int chartype; 5432 int len = 1; 5433 if (eptr >= md->end_subject) 5434 { 5435 SCHECK_PARTIAL(); 5436 break; 5437 } 5438 GETCHARLENTEST(c, eptr, len); 5439 chartype = UCD_CHARTYPE(c); 5440 if ((chartype == ucp_Lu || 5441 chartype == ucp_Ll || 5442 chartype == ucp_Lt) == prop_fail_result) 5443 break; 5444 eptr+= len; 5445 } 5446 break; 5447 5448 case PT_GC: 5449 for (i = min; i < max; i++) 5450 { 5451 int len = 1; 5452 if (eptr >= md->end_subject) 5453 { 5454 SCHECK_PARTIAL(); 5455 break; 5456 } 5457 GETCHARLENTEST(c, eptr, len); 5458 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break; 5459 eptr+= len; 5460 } 5461 break; 5462 5463 case PT_PC: 5464 for (i = min; i < max; i++) 5465 { 5466 int len = 1; 5467 if (eptr >= md->end_subject) 5468 { 5469 SCHECK_PARTIAL(); 5470 break; 5471 } 5472 GETCHARLENTEST(c, eptr, len); 5473 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break; 5474 eptr+= len; 5475 } 5476 break; 5477 5478 case PT_SC: 5479 for (i = min; i < max; i++) 5480 { 5481 int len = 1; 5482 if (eptr >= md->end_subject) 5483 { 5484 SCHECK_PARTIAL(); 5485 break; 5486 } 5487 GETCHARLENTEST(c, eptr, len); 5488 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break; 5489 eptr+= len; 5490 } 5491 break; 5492 5493 case PT_ALNUM: 5494 for (i = min; i < max; i++) 5495 { 5496 int category; 5497 int len = 1; 5498 if (eptr >= md->end_subject) 5499 { 5500 SCHECK_PARTIAL(); 5501 break; 5502 } 5503 GETCHARLENTEST(c, eptr, len); 5504 category = UCD_CATEGORY(c); 5505 
if ((category == ucp_L || category == ucp_N) == prop_fail_result) 5506 break; 5507 eptr+= len; 5508 } 5509 break; 5510 5511 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 5512 which means that Perl space and POSIX space are now identical. PCRE 5513 was changed at release 8.34. */ 5514 5515 case PT_SPACE: /* Perl space */ 5516 case PT_PXSPACE: /* POSIX space */ 5517 for (i = min; i < max; i++) 5518 { 5519 int len = 1; 5520 if (eptr >= md->end_subject) 5521 { 5522 SCHECK_PARTIAL(); 5523 break; 5524 } 5525 GETCHARLENTEST(c, eptr, len); 5526 switch(c) 5527 { 5528 HSPACE_CASES: 5529 VSPACE_CASES: 5530 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ 5531 break; 5532 5533 default: 5534 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5535 goto ENDLOOP99; /* Break the loop */ 5536 break; 5537 } 5538 eptr+= len; 5539 } 5540 ENDLOOP99: 5541 break; 5542 5543 case PT_WORD: 5544 for (i = min; i < max; i++) 5545 { 5546 int category; 5547 int len = 1; 5548 if (eptr >= md->end_subject) 5549 { 5550 SCHECK_PARTIAL(); 5551 break; 5552 } 5553 GETCHARLENTEST(c, eptr, len); 5554 category = UCD_CATEGORY(c); 5555 if ((category == ucp_L || category == ucp_N || 5556 c == CHAR_UNDERSCORE) == prop_fail_result) 5557 break; 5558 eptr+= len; 5559 } 5560 break; 5561 5562 case PT_CLIST: 5563 for (i = min; i < max; i++) 5564 { 5565 const pcre_uint32 *cp; 5566 int len = 1; 5567 if (eptr >= md->end_subject) 5568 { 5569 SCHECK_PARTIAL(); 5570 break; 5571 } 5572 GETCHARLENTEST(c, eptr, len); 5573 cp = PRIV(ucd_caseless_sets) + prop_value; 5574 for (;;) 5575 { 5576 if (c < *cp) 5577 { if (prop_fail_result) break; else goto GOT_MAX; } 5578 if (c == *cp++) 5579 { if (prop_fail_result) goto GOT_MAX; else break; } 5580 } 5581 eptr += len; 5582 } 5583 GOT_MAX: 5584 break; 5585 5586 case PT_UCNC: 5587 for (i = min; i < max; i++) 5588 { 5589 int len = 1; 5590 if (eptr >= md->end_subject) 5591 { 5592 SCHECK_PARTIAL(); 5593 break; 5594 } 5595 GETCHARLENTEST(c, eptr, len); 5596 
if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5597 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5598 c >= 0xe000) == prop_fail_result) 5599 break; 5600 eptr += len; 5601 } 5602 break; 5603 5604 default: 5605 RRETURN(PCRE_ERROR_INTERNAL); 5606 } 5607 5608 /* eptr is now past the end of the maximum run */ 5609 5610 if (possessive) continue; /* No backtracking */ 5611 for(;;) 5612 { 5613 if (eptr <= pp) goto TAIL_RECURSE; 5614 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); 5615 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5616 eptr--; 5617 if (utf) BACKCHAR(eptr); 5618 } 5619 } 5620 5621 /* Match extended Unicode grapheme clusters. We will get here only if the 5622 support is in the binary; otherwise a compile-time error occurs. */ 5623 5624 else if (ctype == OP_EXTUNI) 5625 { 5626 for (i = min; i < max; i++) 5627 { 5628 if (eptr >= md->end_subject) 5629 { 5630 SCHECK_PARTIAL(); 5631 break; 5632 } 5633 else 5634 { 5635 int lgb, rgb; 5636 GETCHARINCTEST(c, eptr); 5637 lgb = UCD_GRAPHBREAK(c); 5638 while (eptr < md->end_subject) 5639 { 5640 int len = 1; 5641 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5642 rgb = UCD_GRAPHBREAK(c); 5643 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5644 lgb = rgb; 5645 eptr += len; 5646 } 5647 } 5648 CHECK_PARTIAL(); 5649 } 5650 5651 /* eptr is now past the end of the maximum run */ 5652 5653 if (possessive) continue; /* No backtracking */ 5654 5655 /* We use <= pp rather than == pp to detect the start of the run while 5656 backtracking because the use of \C in UTF mode can cause BACKCHAR to 5657 move back past pp. This is just palliative; the use of \C in UTF mode 5658 is fraught with danger. 
*/ 5659 5660 for(;;) 5661 { 5662 int lgb, rgb; 5663 PCRE_PUCHAR fptr; 5664 5665 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ 5666 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); 5667 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5668 5669 /* Backtracking over an extended grapheme cluster involves inspecting 5670 the previous two characters (if present) to see if a break is 5671 permitted between them. */ 5672 5673 eptr--; 5674 if (!utf) c = *eptr; else 5675 { 5676 BACKCHAR(eptr); 5677 GETCHAR(c, eptr); 5678 } 5679 rgb = UCD_GRAPHBREAK(c); 5680 5681 for (;;) 5682 { 5683 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ 5684 fptr = eptr - 1; 5685 if (!utf) c = *fptr; else 5686 { 5687 BACKCHAR(fptr); 5688 GETCHAR(c, fptr); 5689 } 5690 lgb = UCD_GRAPHBREAK(c); 5691 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5692 eptr = fptr; 5693 rgb = lgb; 5694 } 5695 } 5696 } 5697 5698 else 5699 #endif /* SUPPORT_UCP */ 5700 5701 #ifdef SUPPORT_UTF 5702 if (utf) 5703 { 5704 switch(ctype) 5705 { 5706 case OP_ANY: 5707 for (i = min; i < max; i++) 5708 { 5709 if (eptr >= md->end_subject) 5710 { 5711 SCHECK_PARTIAL(); 5712 break; 5713 } 5714 if (IS_NEWLINE(eptr)) break; 5715 if (md->partial != 0 && /* Take care with CRLF partial */ 5716 eptr + 1 >= md->end_subject && 5717 NLBLOCK->nltype == NLTYPE_FIXED && 5718 NLBLOCK->nllen == 2 && 5719 UCHAR21(eptr) == NLBLOCK->nl[0]) 5720 { 5721 md->hitend = TRUE; 5722 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5723 } 5724 eptr++; 5725 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5726 } 5727 break; 5728 5729 case OP_ALLANY: 5730 if (max < INT_MAX) 5731 { 5732 for (i = min; i < max; i++) 5733 { 5734 if (eptr >= md->end_subject) 5735 { 5736 SCHECK_PARTIAL(); 5737 break; 5738 } 5739 eptr++; 5740 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5741 } 5742 } 5743 else 5744 { 5745 eptr = md->end_subject; /* Unlimited UTF-8 repeat */ 5746 SCHECK_PARTIAL(); 5747 } 5748 break; 5749 5750 /* The byte case is 
the same as non-UTF8 */ 5751 5752 case OP_ANYBYTE: 5753 c = max - min; 5754 if (c > (unsigned int)(md->end_subject - eptr)) 5755 { 5756 eptr = md->end_subject; 5757 SCHECK_PARTIAL(); 5758 } 5759 else eptr += c; 5760 break; 5761 5762 case OP_ANYNL: 5763 for (i = min; i < max; i++) 5764 { 5765 int len = 1; 5766 if (eptr >= md->end_subject) 5767 { 5768 SCHECK_PARTIAL(); 5769 break; 5770 } 5771 GETCHARLEN(c, eptr, len); 5772 if (c == CHAR_CR) 5773 { 5774 if (++eptr >= md->end_subject) break; 5775 if (UCHAR21(eptr) == CHAR_LF) eptr++; 5776 } 5777 else 5778 { 5779 if (c != CHAR_LF && 5780 (md->bsr_anycrlf || 5781 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 5782 #ifndef EBCDIC 5783 && c != 0x2028 && c != 0x2029 5784 #endif /* Not EBCDIC */ 5785 ))) 5786 break; 5787 eptr += len; 5788 } 5789 } 5790 break; 5791 5792 case OP_NOT_HSPACE: 5793 case OP_HSPACE: 5794 for (i = min; i < max; i++) 5795 { 5796 BOOL gotspace; 5797 int len = 1; 5798 if (eptr >= md->end_subject) 5799 { 5800 SCHECK_PARTIAL(); 5801 break; 5802 } 5803 GETCHARLEN(c, eptr, len); 5804 switch(c) 5805 { 5806 HSPACE_CASES: gotspace = TRUE; break; 5807 default: gotspace = FALSE; break; 5808 } 5809 if (gotspace == (ctype == OP_NOT_HSPACE)) break; 5810 eptr += len; 5811 } 5812 break; 5813 5814 case OP_NOT_VSPACE: 5815 case OP_VSPACE: 5816 for (i = min; i < max; i++) 5817 { 5818 BOOL gotspace; 5819 int len = 1; 5820 if (eptr >= md->end_subject) 5821 { 5822 SCHECK_PARTIAL(); 5823 break; 5824 } 5825 GETCHARLEN(c, eptr, len); 5826 switch(c) 5827 { 5828 VSPACE_CASES: gotspace = TRUE; break; 5829 default: gotspace = FALSE; break; 5830 } 5831 if (gotspace == (ctype == OP_NOT_VSPACE)) break; 5832 eptr += len; 5833 } 5834 break; 5835 5836 case OP_NOT_DIGIT: 5837 for (i = min; i < max; i++) 5838 { 5839 int len = 1; 5840 if (eptr >= md->end_subject) 5841 { 5842 SCHECK_PARTIAL(); 5843 break; 5844 } 5845 GETCHARLEN(c, eptr, len); 5846 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; 5847 eptr+= len; 5848 } 5849 
break; 5850 5851 case OP_DIGIT: 5852 for (i = min; i < max; i++) 5853 { 5854 int len = 1; 5855 if (eptr >= md->end_subject) 5856 { 5857 SCHECK_PARTIAL(); 5858 break; 5859 } 5860 GETCHARLEN(c, eptr, len); 5861 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; 5862 eptr+= len; 5863 } 5864 break; 5865 5866 case OP_NOT_WHITESPACE: 5867 for (i = min; i < max; i++) 5868 { 5869 int len = 1; 5870 if (eptr >= md->end_subject) 5871 { 5872 SCHECK_PARTIAL(); 5873 break; 5874 } 5875 GETCHARLEN(c, eptr, len); 5876 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; 5877 eptr+= len; 5878 } 5879 break; 5880 5881 case OP_WHITESPACE: 5882 for (i = min; i < max; i++) 5883 { 5884 int len = 1; 5885 if (eptr >= md->end_subject) 5886 { 5887 SCHECK_PARTIAL(); 5888 break; 5889 } 5890 GETCHARLEN(c, eptr, len); 5891 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; 5892 eptr+= len; 5893 } 5894 break; 5895 5896 case OP_NOT_WORDCHAR: 5897 for (i = min; i < max; i++) 5898 { 5899 int len = 1; 5900 if (eptr >= md->end_subject) 5901 { 5902 SCHECK_PARTIAL(); 5903 break; 5904 } 5905 GETCHARLEN(c, eptr, len); 5906 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; 5907 eptr+= len; 5908 } 5909 break; 5910 5911 case OP_WORDCHAR: 5912 for (i = min; i < max; i++) 5913 { 5914 int len = 1; 5915 if (eptr >= md->end_subject) 5916 { 5917 SCHECK_PARTIAL(); 5918 break; 5919 } 5920 GETCHARLEN(c, eptr, len); 5921 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; 5922 eptr+= len; 5923 } 5924 break; 5925 5926 default: 5927 RRETURN(PCRE_ERROR_INTERNAL); 5928 } 5929 5930 if (possessive) continue; /* No backtracking */ 5931 for(;;) 5932 { 5933 if (eptr <= pp) goto TAIL_RECURSE; 5934 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); 5935 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5936 eptr--; 5937 BACKCHAR(eptr); 5938 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL && 5939 UCHAR21(eptr - 1) == CHAR_CR) eptr--; 5940 } 5941 } 5942 else 5943 #endif /* SUPPORT_UTF */ 5944 /* 
Not UTF mode */ 5945 { 5946 switch(ctype) 5947 { 5948 case OP_ANY: 5949 for (i = min; i < max; i++) 5950 { 5951 if (eptr >= md->end_subject) 5952 { 5953 SCHECK_PARTIAL(); 5954 break; 5955 } 5956 if (IS_NEWLINE(eptr)) break; 5957 if (md->partial != 0 && /* Take care with CRLF partial */ 5958 eptr + 1 >= md->end_subject && 5959 NLBLOCK->nltype == NLTYPE_FIXED && 5960 NLBLOCK->nllen == 2 && 5961 *eptr == NLBLOCK->nl[0]) 5962 { 5963 md->hitend = TRUE; 5964 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5965 } 5966 eptr++; 5967 } 5968 break; 5969 5970 case OP_ALLANY: 5971 case OP_ANYBYTE: 5972 c = max - min; 5973 if (c > (unsigned int)(md->end_subject - eptr)) 5974 { 5975 eptr = md->end_subject; 5976 SCHECK_PARTIAL(); 5977 } 5978 else eptr += c; 5979 break; 5980 5981 case OP_ANYNL: 5982 for (i = min; i < max; i++) 5983 { 5984 if (eptr >= md->end_subject) 5985 { 5986 SCHECK_PARTIAL(); 5987 break; 5988 } 5989 c = *eptr; 5990 if (c == CHAR_CR) 5991 { 5992 if (++eptr >= md->end_subject) break; 5993 if (*eptr == CHAR_LF) eptr++; 5994 } 5995 else 5996 { 5997 if (c != CHAR_LF && (md->bsr_anycrlf || 5998 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 5999 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6000 && c != 0x2028 && c != 0x2029 6001 #endif 6002 ))) break; 6003 eptr++; 6004 } 6005 } 6006 break; 6007 6008 case OP_NOT_HSPACE: 6009 for (i = min; i < max; i++) 6010 { 6011 if (eptr >= md->end_subject) 6012 { 6013 SCHECK_PARTIAL(); 6014 break; 6015 } 6016 switch(*eptr) 6017 { 6018 default: eptr++; break; 6019 HSPACE_BYTE_CASES: 6020 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6021 HSPACE_MULTIBYTE_CASES: 6022 #endif 6023 goto ENDLOOP00; 6024 } 6025 } 6026 ENDLOOP00: 6027 break; 6028 6029 case OP_HSPACE: 6030 for (i = min; i < max; i++) 6031 { 6032 if (eptr >= md->end_subject) 6033 { 6034 SCHECK_PARTIAL(); 6035 break; 6036 } 6037 switch(*eptr) 6038 { 6039 default: goto ENDLOOP01; 6040 HSPACE_BYTE_CASES: 6041 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 
6042 HSPACE_MULTIBYTE_CASES: 6043 #endif 6044 eptr++; break; 6045 } 6046 } 6047 ENDLOOP01: 6048 break; 6049 6050 case OP_NOT_VSPACE: 6051 for (i = min; i < max; i++) 6052 { 6053 if (eptr >= md->end_subject) 6054 { 6055 SCHECK_PARTIAL(); 6056 break; 6057 } 6058 switch(*eptr) 6059 { 6060 default: eptr++; break; 6061 VSPACE_BYTE_CASES: 6062 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6063 VSPACE_MULTIBYTE_CASES: 6064 #endif 6065 goto ENDLOOP02; 6066 } 6067 } 6068 ENDLOOP02: 6069 break; 6070 6071 case OP_VSPACE: 6072 for (i = min; i < max; i++) 6073 { 6074 if (eptr >= md->end_subject) 6075 { 6076 SCHECK_PARTIAL(); 6077 break; 6078 } 6079 switch(*eptr) 6080 { 6081 default: goto ENDLOOP03; 6082 VSPACE_BYTE_CASES: 6083 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6084 VSPACE_MULTIBYTE_CASES: 6085 #endif 6086 eptr++; break; 6087 } 6088 } 6089 ENDLOOP03: 6090 break; 6091 6092 case OP_NOT_DIGIT: 6093 for (i = min; i < max; i++) 6094 { 6095 if (eptr >= md->end_subject) 6096 { 6097 SCHECK_PARTIAL(); 6098 break; 6099 } 6100 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break; 6101 eptr++; 6102 } 6103 break; 6104 6105 case OP_DIGIT: 6106 for (i = min; i < max; i++) 6107 { 6108 if (eptr >= md->end_subject) 6109 { 6110 SCHECK_PARTIAL(); 6111 break; 6112 } 6113 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break; 6114 eptr++; 6115 } 6116 break; 6117 6118 case OP_NOT_WHITESPACE: 6119 for (i = min; i < max; i++) 6120 { 6121 if (eptr >= md->end_subject) 6122 { 6123 SCHECK_PARTIAL(); 6124 break; 6125 } 6126 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break; 6127 eptr++; 6128 } 6129 break; 6130 6131 case OP_WHITESPACE: 6132 for (i = min; i < max; i++) 6133 { 6134 if (eptr >= md->end_subject) 6135 { 6136 SCHECK_PARTIAL(); 6137 break; 6138 } 6139 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break; 6140 eptr++; 6141 } 6142 break; 6143 6144 case OP_NOT_WORDCHAR: 6145 for (i = min; i < max; i++) 6146 { 6147 
if (eptr >= md->end_subject) 6148 { 6149 SCHECK_PARTIAL(); 6150 break; 6151 } 6152 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break; 6153 eptr++; 6154 } 6155 break; 6156 6157 case OP_WORDCHAR: 6158 for (i = min; i < max; i++) 6159 { 6160 if (eptr >= md->end_subject) 6161 { 6162 SCHECK_PARTIAL(); 6163 break; 6164 } 6165 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break; 6166 eptr++; 6167 } 6168 break; 6169 6170 default: 6171 RRETURN(PCRE_ERROR_INTERNAL); 6172 } 6173 6174 if (possessive) continue; /* No backtracking */ 6175 for (;;) 6176 { 6177 if (eptr == pp) goto TAIL_RECURSE; 6178 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); 6179 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6180 eptr--; 6181 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && 6182 eptr[-1] == CHAR_CR) eptr--; 6183 } 6184 } 6185 6186 /* Control never gets here */ 6187 } 6188 6189 /* There's been some horrible disaster. Arrival here can only mean there is 6190 something seriously wrong in the code above or the OP_xxx definitions. */ 6191 6192 default: 6193 DPRINTF(("Unknown opcode %d\n", *ecode)); 6194 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); 6195 } 6196 6197 /* Do not stick any code in here without much thought; it is assumed 6198 that "continue" in the code above comes out to here to repeat the main 6199 loop. */ 6200 6201 } /* End of main loop */ 6202 /* Control never reaches here */ 6203 6204 6205 /* When compiling to use the heap rather than the stack for recursive calls to 6206 match(), the RRETURN() macro jumps here. The number that is saved in 6207 frame->Xwhere indicates which label we actually want to return to. 
*/ 6208 6209 #ifdef NO_RECURSE 6210 #define LBL(val) case val: goto L_RM##val; 6211 HEAP_RETURN: 6212 switch (frame->Xwhere) 6213 { 6214 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 6215 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) 6216 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) 6217 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) 6218 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) 6219 LBL(65) LBL(66) 6220 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 6221 LBL(20) LBL(21) 6222 #endif 6223 #ifdef SUPPORT_UTF 6224 LBL(16) LBL(18) 6225 LBL(22) LBL(23) LBL(28) LBL(30) 6226 LBL(32) LBL(34) LBL(42) LBL(46) 6227 #ifdef SUPPORT_UCP 6228 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) 6229 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) 6230 #endif /* SUPPORT_UCP */ 6231 #endif /* SUPPORT_UTF */ 6232 default: 6233 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); 6234 return PCRE_ERROR_INTERNAL; 6235 } 6236 #undef LBL 6237 #endif /* NO_RECURSE */ 6238 } 6239 6240 6241 /*************************************************************************** 6242 **************************************************************************** 6243 RECURSION IN THE match() FUNCTION 6244 6245 Undefine all the macros that were defined above to handle this. 
*/ 6246 6247 #ifdef NO_RECURSE 6248 #undef eptr 6249 #undef ecode 6250 #undef mstart 6251 #undef offset_top 6252 #undef eptrb 6253 #undef flags 6254 6255 #undef callpat 6256 #undef charptr 6257 #undef data 6258 #undef next 6259 #undef pp 6260 #undef prev 6261 #undef saved_eptr 6262 6263 #undef new_recursive 6264 6265 #undef cur_is_word 6266 #undef condition 6267 #undef prev_is_word 6268 6269 #undef ctype 6270 #undef length 6271 #undef max 6272 #undef min 6273 #undef number 6274 #undef offset 6275 #undef op 6276 #undef save_capture_last 6277 #undef save_offset1 6278 #undef save_offset2 6279 #undef save_offset3 6280 #undef stacksave 6281 6282 #undef newptrb 6283 6284 #endif 6285 6286 /* These two are defined as macros in both cases */ 6287 6288 #undef fc 6289 #undef fi 6290 6291 /*************************************************************************** 6292 ***************************************************************************/ 6293 6294 6295 #ifdef NO_RECURSE 6296 /************************************************* 6297 * Release allocated heap frames * 6298 *************************************************/ 6299 6300 /* This function releases all the allocated frames. The base frame is on the 6301 machine stack, and so must not be freed. 6302 6303 Argument: the address of the base frame 6304 Returns: nothing 6305 */ 6306 6307 static void 6308 release_match_heapframes (heapframe *frame_base) 6309 { 6310 heapframe *nextframe = frame_base->Xnextframe; 6311 while (nextframe != NULL) 6312 { 6313 heapframe *oldframe = nextframe; 6314 nextframe = nextframe->Xnextframe; 6315 (PUBL(stack_free))(oldframe); 6316 } 6317 } 6318 #endif 6319 6320 6321 /************************************************* 6322 * Execute a Regular Expression * 6323 *************************************************/ 6324 6325 /* This function applies a compiled re to a subject string and picks out 6326 portions of the string if it matches. 
Two elements in the vector are set for 6327 each substring: the offsets to the start and end of the substring. 6328 6329 Arguments: 6330 argument_re points to the compiled expression 6331 extra_data points to extra data or is NULL 6332 subject points to the subject string 6333 length length of subject string (may contain binary zeros) 6334 start_offset where to start in the subject string 6335 options option bits 6336 offsets points to a vector of ints to be filled in with offsets 6337 offsetcount the number of elements in the vector 6338 6339 Returns: > 0 => success; value is the number of elements filled in 6340 = 0 => success, but offsets is not big enough 6341 -1 => failed to match 6342 < -1 => some kind of unexpected problem 6343 */ 6344 6345 #if defined COMPILE_PCRE8 6346 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6347 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, 6348 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, 6349 int offsetcount) 6350 #elif defined COMPILE_PCRE16 6351 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6352 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, 6353 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, 6354 int offsetcount) 6355 #elif defined COMPILE_PCRE32 6356 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6357 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, 6358 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, 6359 int offsetcount) 6360 #endif 6361 { 6362 int rc, ocount, arg_offset_max; 6363 int newline; 6364 BOOL using_temporary_offsets = FALSE; 6365 BOOL anchored; 6366 BOOL startline; 6367 BOOL firstline; 6368 BOOL utf; 6369 BOOL has_first_char = FALSE; 6370 BOOL has_req_char = FALSE; 6371 pcre_uchar first_char = 0; 6372 pcre_uchar first_char2 = 0; 6373 pcre_uchar req_char = 0; 6374 pcre_uchar req_char2 = 0; 6375 match_data match_block; 6376 match_data *md = &match_block; 6377 const pcre_uint8 
*tables; 6378 const pcre_uint8 *start_bits = NULL; 6379 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; 6380 PCRE_PUCHAR end_subject; 6381 PCRE_PUCHAR start_partial = NULL; 6382 PCRE_PUCHAR match_partial = NULL; 6383 PCRE_PUCHAR req_char_ptr = start_match - 1; 6384 6385 const pcre_study_data *study; 6386 const REAL_PCRE *re = (const REAL_PCRE *)argument_re; 6387 6388 #ifdef NO_RECURSE 6389 heapframe frame_zero; 6390 frame_zero.Xprevframe = NULL; /* Marks the top level */ 6391 frame_zero.Xnextframe = NULL; /* None are allocated yet */ 6392 md->match_frames_base = &frame_zero; 6393 #endif 6394 6395 /* Check for the special magic call that measures the size of the stack used 6396 per recursive call of match(). Without the funny casting for sizeof, a Windows 6397 compiler gave this error: "unary minus operator applied to unsigned type, 6398 result still unsigned". Hopefully the cast fixes that. */ 6399 6400 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && 6401 start_offset == -999) 6402 #ifdef NO_RECURSE 6403 return -((int)sizeof(heapframe)); 6404 #else 6405 return match(NULL, NULL, NULL, 0, NULL, NULL, 0); 6406 #endif 6407 6408 /* Plausibility checks */ 6409 6410 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 6411 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) 6412 return PCRE_ERROR_NULL; 6413 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 6414 if (length < 0) return PCRE_ERROR_BADLENGTH; 6415 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; 6416 6417 /* Check that the first field in the block is the magic number. If it is not, 6418 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to 6419 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which 6420 means that the pattern is likely compiled with different endianness. 
*/ 6421 6422 if (re->magic_number != MAGIC_NUMBER) 6423 return re->magic_number == REVERSED_MAGIC_NUMBER? 6424 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; 6425 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; 6426 6427 /* These two settings are used in the code for checking a UTF-8 string that 6428 follows immediately afterwards. Other values in the md block are used only 6429 during "normal" pcre_exec() processing, not when the JIT support is in use, 6430 so they are set up later. */ 6431 6432 /* PCRE_UTF16 has the same value as PCRE_UTF8. */ 6433 utf = md->utf = (re->options & PCRE_UTF8) != 0; 6434 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : 6435 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; 6436 6437 /* Check a UTF-8 string if required. Pass back the character offset and error 6438 code for an invalid string if a results vector is available. */ 6439 6440 #ifdef SUPPORT_UTF 6441 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) 6442 { 6443 int erroroffset; 6444 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); 6445 if (errorcode != 0) 6446 { 6447 if (offsetcount >= 2) 6448 { 6449 offsets[0] = erroroffset; 6450 offsets[1] = errorcode; 6451 } 6452 #if defined COMPILE_PCRE8 6453 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? 6454 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; 6455 #elif defined COMPILE_PCRE16 6456 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? 6457 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; 6458 #elif defined COMPILE_PCRE32 6459 return PCRE_ERROR_BADUTF32; 6460 #endif 6461 } 6462 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 6463 /* Check that a start_offset points to the start of a UTF character. 
*/ 6464 if (start_offset > 0 && start_offset < length && 6465 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) 6466 return PCRE_ERROR_BADUTF8_OFFSET; 6467 #endif 6468 } 6469 #endif 6470 6471 /* If the pattern was successfully studied with JIT support, run the JIT 6472 executable instead of the rest of this function. Most options must be set at 6473 compile time for the JIT code to be usable. Fallback to the normal code path if 6474 an unsupported flag is set. */ 6475 6476 #ifdef SUPPORT_JIT 6477 if (extra_data != NULL 6478 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT | 6479 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT 6480 && extra_data->executable_jit != NULL 6481 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0) 6482 { 6483 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length, 6484 start_offset, options, offsets, offsetcount); 6485 6486 /* PCRE_ERROR_NULL means that the selected normal or partial matching 6487 mode is not compiled. In this case we simply fallback to interpreter. */ 6488 6489 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc; 6490 } 6491 #endif 6492 6493 /* Carry on with non-JIT matching. This information is for finding all the 6494 numbers associated with a given name, for condition testing. */ 6495 6496 md->name_table = (pcre_uchar *)re + re->name_table_offset; 6497 md->name_count = re->name_count; 6498 md->name_entry_size = re->name_entry_size; 6499 6500 /* Fish out the optional data from the extra_data structure, first setting 6501 the default values. */ 6502 6503 study = NULL; 6504 md->match_limit = MATCH_LIMIT; 6505 md->match_limit_recursion = MATCH_LIMIT_RECURSION; 6506 md->callout_data = NULL; 6507 6508 /* The table pointer is always in native byte order. */ 6509 6510 tables = re->tables; 6511 6512 /* The two limit values override the defaults, whatever their value. 
*/ 6513 6514 if (extra_data != NULL) 6515 { 6516 unsigned long int flags = extra_data->flags; 6517 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 6518 study = (const pcre_study_data *)extra_data->study_data; 6519 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 6520 md->match_limit = extra_data->match_limit; 6521 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 6522 md->match_limit_recursion = extra_data->match_limit_recursion; 6523 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 6524 md->callout_data = extra_data->callout_data; 6525 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; 6526 } 6527 6528 /* Limits in the regex override only if they are smaller. */ 6529 6530 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit) 6531 md->match_limit = re->limit_match; 6532 6533 if ((re->flags & PCRE_RLSET) != 0 && 6534 re->limit_recursion < md->match_limit_recursion) 6535 md->match_limit_recursion = re->limit_recursion; 6536 6537 /* If the exec call supplied NULL for tables, use the inbuilt ones. This 6538 is a feature that makes it possible to save compiled regex and re-use them 6539 in other programs later. */ 6540 6541 if (tables == NULL) tables = PRIV(default_tables); 6542 6543 /* Set up other data */ 6544 6545 anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 6546 startline = (re->flags & PCRE_STARTLINE) != 0; 6547 firstline = (re->options & PCRE_FIRSTLINE) != 0; 6548 6549 /* The code starts after the real_pcre block and the capture name table. 
*/ 6550 6551 md->start_code = (const pcre_uchar *)re + re->name_table_offset + 6552 re->name_count * re->name_entry_size; 6553 6554 md->start_subject = (PCRE_PUCHAR)subject; 6555 md->start_offset = start_offset; 6556 md->end_subject = md->start_subject + length; 6557 end_subject = md->end_subject; 6558 6559 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 6560 md->use_ucp = (re->options & PCRE_UCP) != 0; 6561 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; 6562 md->ignore_skip_arg = 0; 6563 6564 /* Some options are unpacked into BOOL variables in the hope that testing 6565 them will be faster than individual option bits. */ 6566 6567 md->notbol = (options & PCRE_NOTBOL) != 0; 6568 md->noteol = (options & PCRE_NOTEOL) != 0; 6569 md->notempty = (options & PCRE_NOTEMPTY) != 0; 6570 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; 6571 6572 md->hitend = FALSE; 6573 md->mark = md->nomatch_mark = NULL; /* In case never set */ 6574 6575 md->recursive = NULL; /* No recursion at top level */ 6576 md->hasthen = (re->flags & PCRE_HASTHEN) != 0; 6577 6578 md->lcc = tables + lcc_offset; 6579 md->fcc = tables + fcc_offset; 6580 md->ctypes = tables + ctypes_offset; 6581 6582 /* Handle different \R options. */ 6583 6584 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 6585 { 6586 case 0: 6587 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 6588 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; 6589 else 6590 #ifdef BSR_ANYCRLF 6591 md->bsr_anycrlf = TRUE; 6592 #else 6593 md->bsr_anycrlf = FALSE; 6594 #endif 6595 break; 6596 6597 case PCRE_BSR_ANYCRLF: 6598 md->bsr_anycrlf = TRUE; 6599 break; 6600 6601 case PCRE_BSR_UNICODE: 6602 md->bsr_anycrlf = FALSE; 6603 break; 6604 6605 default: return PCRE_ERROR_BADNEWLINE; 6606 } 6607 6608 /* Handle different types of newline. The three bits give eight cases. If 6609 nothing is set at run time, whatever was used at compile time applies. 
*/ 6610 6611 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : 6612 (pcre_uint32)options) & PCRE_NEWLINE_BITS) 6613 { 6614 case 0: newline = NEWLINE; break; /* Compile-time default */ 6615 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 6616 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 6617 case PCRE_NEWLINE_CR+ 6618 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 6619 case PCRE_NEWLINE_ANY: newline = -1; break; 6620 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 6621 default: return PCRE_ERROR_BADNEWLINE; 6622 } 6623 6624 if (newline == -2) 6625 { 6626 md->nltype = NLTYPE_ANYCRLF; 6627 } 6628 else if (newline < 0) 6629 { 6630 md->nltype = NLTYPE_ANY; 6631 } 6632 else 6633 { 6634 md->nltype = NLTYPE_FIXED; 6635 if (newline > 255) 6636 { 6637 md->nllen = 2; 6638 md->nl[0] = (newline >> 8) & 255; 6639 md->nl[1] = newline & 255; 6640 } 6641 else 6642 { 6643 md->nllen = 1; 6644 md->nl[0] = newline; 6645 } 6646 } 6647 6648 /* Partial matching was originally supported only for a restricted set of 6649 regexes; from release 8.00 there are no restrictions, but the bits are still 6650 defined (though never set). So there's no harm in leaving this code. */ 6651 6652 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) 6653 return PCRE_ERROR_BADPARTIAL; 6654 6655 /* If the expression has got more back references than the offsets supplied can 6656 hold, we get a temporary chunk of working store to use during the matching. 6657 Otherwise, we can use the vector supplied, rounding down its size to a multiple 6658 of 3. 
*/ 6659 6660 ocount = offsetcount - (offsetcount % 3); 6661 arg_offset_max = (2*ocount)/3; 6662 6663 if (re->top_backref > 0 && re->top_backref >= ocount/3) 6664 { 6665 ocount = re->top_backref * 3 + 3; 6666 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int)); 6667 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 6668 using_temporary_offsets = TRUE; 6669 DPRINTF(("Got memory to hold back references\n")); 6670 } 6671 else md->offset_vector = offsets; 6672 md->offset_end = ocount; 6673 md->offset_max = (2*ocount)/3; 6674 md->capture_last = 0; 6675 6676 /* Reset the working variable associated with each extraction. These should 6677 never be used unless previously set, but they get saved and restored, and so we 6678 initialize them to avoid reading uninitialized locations. Also, unset the 6679 offsets for the matched string. This is really just for tidiness with callouts, 6680 in case they inspect these fields. */ 6681 6682 if (md->offset_vector != NULL) 6683 { 6684 register int *iptr = md->offset_vector + ocount; 6685 register int *iend = iptr - re->top_bracket; 6686 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2; 6687 while (--iptr >= iend) *iptr = -1; 6688 if (offsetcount > 0) md->offset_vector[0] = -1; 6689 if (offsetcount > 1) md->offset_vector[1] = -1; 6690 } 6691 6692 /* Set up the first character to match, if available. The first_char value is 6693 never set for an anchored regular expression, but the anchoring may be forced 6694 at run time, so we have to test for anchoring. The first char may be unset for 6695 an unanchored pattern, of course. If there's no first char and the pattern was 6696 studied, there may be a bitmap of possible first characters. 
*/ 6697 6698 if (!anchored) 6699 { 6700 if ((re->flags & PCRE_FIRSTSET) != 0) 6701 { 6702 has_first_char = TRUE; 6703 first_char = first_char2 = (pcre_uchar)(re->first_char); 6704 if ((re->flags & PCRE_FCH_CASELESS) != 0) 6705 { 6706 first_char2 = TABLE_GET(first_char, md->fcc, first_char); 6707 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6708 if (utf && first_char > 127) 6709 first_char2 = UCD_OTHERCASE(first_char); 6710 #endif 6711 } 6712 } 6713 else 6714 if (!startline && study != NULL && 6715 (study->flags & PCRE_STUDY_MAPPED) != 0) 6716 start_bits = study->start_bits; 6717 } 6718 6719 /* For anchored or unanchored matches, there may be a "last known required 6720 character" set. */ 6721 6722 if ((re->flags & PCRE_REQCHSET) != 0) 6723 { 6724 has_req_char = TRUE; 6725 req_char = req_char2 = (pcre_uchar)(re->req_char); 6726 if ((re->flags & PCRE_RCH_CASELESS) != 0) 6727 { 6728 req_char2 = TABLE_GET(req_char, md->fcc, req_char); 6729 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6730 if (utf && req_char > 127) 6731 req_char2 = UCD_OTHERCASE(req_char); 6732 #endif 6733 } 6734 } 6735 6736 6737 /* ==========================================================================*/ 6738 6739 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 6740 the loop runs just once. */ 6741 6742 for(;;) 6743 { 6744 PCRE_PUCHAR save_end_subject = end_subject; 6745 PCRE_PUCHAR new_start_match; 6746 6747 /* If firstline is TRUE, the start of the match is constrained to the first 6748 line of a multiline string. That is, the match must be before or at the first 6749 newline. Implement this by temporarily adjusting end_subject so that we stop 6750 scanning at a newline. If the match fails at the newline, later code breaks 6751 this loop. 
*/ 6752 6753 if (firstline) 6754 { 6755 PCRE_PUCHAR t = start_match; 6756 #ifdef SUPPORT_UTF 6757 if (utf) 6758 { 6759 while (t < md->end_subject && !IS_NEWLINE(t)) 6760 { 6761 t++; 6762 ACROSSCHAR(t < end_subject, *t, t++); 6763 } 6764 } 6765 else 6766 #endif 6767 while (t < md->end_subject && !IS_NEWLINE(t)) t++; 6768 end_subject = t; 6769 } 6770 6771 /* There are some optimizations that avoid running the match if a known 6772 starting point is not found, or if a known later character is not present. 6773 However, there is an option that disables these, for testing and for ensuring 6774 that all callouts do actually occur. The option can be set in the regex by 6775 (*NO_START_OPT) or passed in match-time options. */ 6776 6777 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) 6778 { 6779 /* Advance to a unique first char if there is one. */ 6780 6781 if (has_first_char) 6782 { 6783 pcre_uchar smc; 6784 6785 if (first_char != first_char2) 6786 while (start_match < end_subject && 6787 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2) 6788 start_match++; 6789 else 6790 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char) 6791 start_match++; 6792 } 6793 6794 /* Or to just after a linebreak for a multiline match */ 6795 6796 else if (startline) 6797 { 6798 if (start_match > md->start_subject + start_offset) 6799 { 6800 #ifdef SUPPORT_UTF 6801 if (utf) 6802 { 6803 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6804 { 6805 start_match++; 6806 ACROSSCHAR(start_match < end_subject, *start_match, 6807 start_match++); 6808 } 6809 } 6810 else 6811 #endif 6812 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6813 start_match++; 6814 6815 /* If we have just passed a CR and the newline option is ANY or ANYCRLF, 6816 and we are now at a LF, advance the match position by one more character. 
6817 */ 6818 6819 if (start_match[-1] == CHAR_CR && 6820 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 6821 start_match < end_subject && 6822 UCHAR21TEST(start_match) == CHAR_NL) 6823 start_match++; 6824 } 6825 } 6826 6827 /* Or to a non-unique first byte after study */ 6828 6829 else if (start_bits != NULL) 6830 { 6831 while (start_match < end_subject) 6832 { 6833 register pcre_uint32 c = UCHAR21TEST(start_match); 6834 #ifndef COMPILE_PCRE8 6835 if (c > 255) c = 255; 6836 #endif 6837 if ((start_bits[c/8] & (1 << (c&7))) != 0) break; 6838 start_match++; 6839 } 6840 } 6841 } /* Starting optimizations */ 6842 6843 /* Restore fudged end_subject */ 6844 6845 end_subject = save_end_subject; 6846 6847 /* The following two optimizations are disabled for partial matching or if 6848 disabling is explicitly requested. */ 6849 6850 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) 6851 { 6852 /* If the pattern was studied, a minimum subject length may be set. This is 6853 a lower bound; no actual string of that length may actually match the 6854 pattern. Although the value is, strictly, in characters, we treat it as 6855 bytes to avoid spending too much time in this optimization. */ 6856 6857 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 6858 (pcre_uint32)(end_subject - start_match) < study->minlength) 6859 { 6860 rc = MATCH_NOMATCH; 6861 break; 6862 } 6863 6864 /* If req_char is set, we know that that character must appear in the 6865 subject for the match to succeed. If the first character is set, req_char 6866 must be later in the subject; otherwise the test starts at the match point. 6867 This optimization can save a huge amount of backtracking in patterns with 6868 nested unlimited repeats that aren't going to match. Writing separate code 6869 for cased/caseless versions makes it go faster, as does using an 6870 autoincrement and backing off on a match. 
6871 6872 HOWEVER: when the subject string is very, very long, searching to its end 6873 can take a long time, and give bad performance on quite ordinary patterns. 6874 This showed up when somebody was matching something like /^\d+C/ on a 6875 32-megabyte string... so we don't do this when the string is sufficiently 6876 long. */ 6877 6878 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX) 6879 { 6880 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0); 6881 6882 /* We don't need to repeat the search if we haven't yet reached the 6883 place we found it at last time. */ 6884 6885 if (p > req_char_ptr) 6886 { 6887 if (req_char != req_char2) 6888 { 6889 while (p < end_subject) 6890 { 6891 register pcre_uint32 pp = UCHAR21INCTEST(p); 6892 if (pp == req_char || pp == req_char2) { p--; break; } 6893 } 6894 } 6895 else 6896 { 6897 while (p < end_subject) 6898 { 6899 if (UCHAR21INCTEST(p) == req_char) { p--; break; } 6900 } 6901 } 6902 6903 /* If we can't find the required character, break the matching loop, 6904 forcing a match failure. */ 6905 6906 if (p >= end_subject) 6907 { 6908 rc = MATCH_NOMATCH; 6909 break; 6910 } 6911 6912 /* If we have found the required character, save the point where we 6913 found it, so that we don't search again next time round the loop if 6914 the start hasn't passed this character yet. */ 6915 6916 req_char_ptr = p; 6917 } 6918 } 6919 } 6920 6921 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */ 6922 printf(">>>> Match against: "); 6923 pchars(start_match, end_subject - start_match, TRUE, md); 6924 printf("\n"); 6925 #endif 6926 6927 /* OK, we can now run the match. If "hitend" is set afterwards, remember the 6928 first starting point for which a partial match was found. 
*/ 6929 6930 md->start_match_ptr = start_match; 6931 md->start_used_ptr = start_match; 6932 md->match_call_count = 0; 6933 md->match_function_type = 0; 6934 md->end_offset_top = 0; 6935 md->skip_arg_count = 0; 6936 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); 6937 if (md->hitend && start_partial == NULL) 6938 { 6939 start_partial = md->start_used_ptr; 6940 match_partial = start_match; 6941 } 6942 6943 switch(rc) 6944 { 6945 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched 6946 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP 6947 entirely. The only way we can do that is to re-do the match at the same 6948 point, with a flag to force SKIP with an argument to be ignored. Just 6949 treating this case as NOMATCH does not work because it does not check other 6950 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ 6951 6952 case MATCH_SKIP_ARG: 6953 new_start_match = start_match; 6954 md->ignore_skip_arg = md->skip_arg_count; 6955 break; 6956 6957 /* SKIP passes back the next starting point explicitly, but if it is no 6958 greater than the match we have just done, treat it as NOMATCH. */ 6959 6960 case MATCH_SKIP: 6961 if (md->start_match_ptr > start_match) 6962 { 6963 new_start_match = md->start_match_ptr; 6964 break; 6965 } 6966 /* Fall through */ 6967 6968 /* NOMATCH and PRUNE advance by one character. THEN at this level acts 6969 exactly like PRUNE. Unset ignore SKIP-with-argument. */ 6970 6971 case MATCH_NOMATCH: 6972 case MATCH_PRUNE: 6973 case MATCH_THEN: 6974 md->ignore_skip_arg = 0; 6975 new_start_match = start_match + 1; 6976 #ifdef SUPPORT_UTF 6977 if (utf) 6978 ACROSSCHAR(new_start_match < end_subject, *new_start_match, 6979 new_start_match++); 6980 #endif 6981 break; 6982 6983 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. 
*/ 6984 6985 case MATCH_COMMIT: 6986 rc = MATCH_NOMATCH; 6987 goto ENDLOOP; 6988 6989 /* Any other return is either a match, or some kind of error. */ 6990 6991 default: 6992 goto ENDLOOP; 6993 } 6994 6995 /* Control reaches here for the various types of "no match at this point" 6996 result. Reset the code to MATCH_NOMATCH for subsequent checking. */ 6997 6998 rc = MATCH_NOMATCH; 6999 7000 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first 7001 newline in the subject (though it may continue over the newline). Therefore, 7002 if we have just failed to match, starting at a newline, do not continue. */ 7003 7004 if (firstline && IS_NEWLINE(start_match)) break; 7005 7006 /* Advance to new matching position */ 7007 7008 start_match = new_start_match; 7009 7010 /* Break the loop if the pattern is anchored or if we have passed the end of 7011 the subject. */ 7012 7013 if (anchored || start_match > end_subject) break; 7014 7015 /* If we have just passed a CR and we are now at a LF, and the pattern does 7016 not contain any explicit matches for \r or \n, and the newline option is CRLF 7017 or ANY or ANYCRLF, advance the match position by one more character. In 7018 normal matching start_match will aways be greater than the first position at 7019 this stage, but a failed *SKIP can cause a return at the same point, which is 7020 why the first test exists. 
*/ 7021 7022 if (start_match > (PCRE_PUCHAR)subject + start_offset && 7023 start_match[-1] == CHAR_CR && 7024 start_match < end_subject && 7025 *start_match == CHAR_NL && 7026 (re->flags & PCRE_HASCRORLF) == 0 && 7027 (md->nltype == NLTYPE_ANY || 7028 md->nltype == NLTYPE_ANYCRLF || 7029 md->nllen == 2)) 7030 start_match++; 7031 7032 md->mark = NULL; /* Reset for start of next match attempt */ 7033 } /* End of for(;;) "bumpalong" loop */ 7034 7035 /* ==========================================================================*/ 7036 7037 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping 7038 conditions is true: 7039 7040 (1) The pattern is anchored or the match was failed by (*COMMIT); 7041 7042 (2) We are past the end of the subject; 7043 7044 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because 7045 this option requests that a match occur at or before the first newline in 7046 the subject. 7047 7048 When we have a match and the offset vector is big enough to deal with any 7049 backreferences, captured substring offsets will already be set up. In the case 7050 where we had to get some local store to hold offsets for backreference 7051 processing, copy those that we can. In this case there need not be overflow if 7052 certain parts of the pattern were not used, even though there are more 7053 capturing parentheses than vector slots. 
*/ 7054 7055 ENDLOOP: 7056 7057 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) 7058 { 7059 if (using_temporary_offsets) 7060 { 7061 if (arg_offset_max >= 4) 7062 { 7063 memcpy(offsets + 2, md->offset_vector + 2, 7064 (arg_offset_max - 2) * sizeof(int)); 7065 DPRINTF(("Copied offsets from temporary memory\n")); 7066 } 7067 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT; 7068 DPRINTF(("Freeing temporary memory\n")); 7069 (PUBL(free))(md->offset_vector); 7070 } 7071 7072 /* Set the return code to the number of captured strings, or 0 if there were 7073 too many to fit into the vector. */ 7074 7075 rc = ((md->capture_last & OVFLBIT) != 0 && 7076 md->end_offset_top >= arg_offset_max)? 7077 0 : md->end_offset_top/2; 7078 7079 /* If there is space in the offset vector, set any unused pairs at the end of 7080 the pattern to -1 for backwards compatibility. It is documented that this 7081 happens. In earlier versions, the whole set of potential capturing offsets 7082 was set to -1 each time round the loop, but this is handled differently now. 7083 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only 7084 those at the end that need unsetting here. We can't just unset them all at 7085 the start of the whole thing because they may get set in one branch that is 7086 not the final matching branch. */ 7087 7088 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL) 7089 { 7090 register int *iptr, *iend; 7091 int resetcount = 2 + re->top_bracket * 2; 7092 if (resetcount > offsetcount) resetcount = offsetcount; 7093 iptr = offsets + md->end_offset_top; 7094 iend = offsets + resetcount; 7095 while (iptr < iend) *iptr++ = -1; 7096 } 7097 7098 /* If there is space, set up the whole thing as substring 0. The value of 7099 md->start_match_ptr might be modified if \K was encountered on the success 7100 matching path. 
*/ 7101 7102 if (offsetcount < 2) rc = 0; else 7103 { 7104 offsets[0] = (int)(md->start_match_ptr - md->start_subject); 7105 offsets[1] = (int)(md->end_match_ptr - md->start_subject); 7106 } 7107 7108 /* Return MARK data if requested */ 7109 7110 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7111 *(extra_data->mark) = (pcre_uchar *)md->mark; 7112 DPRINTF((">>>> returning %d\n", rc)); 7113 #ifdef NO_RECURSE 7114 release_match_heapframes(&frame_zero); 7115 #endif 7116 return rc; 7117 } 7118 7119 /* Control gets here if there has been an error, or if the overall match 7120 attempt has failed at all permitted starting positions. */ 7121 7122 if (using_temporary_offsets) 7123 { 7124 DPRINTF(("Freeing temporary memory\n")); 7125 (PUBL(free))(md->offset_vector); 7126 } 7127 7128 /* For anything other than nomatch or partial match, just return the code. */ 7129 7130 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) 7131 { 7132 DPRINTF((">>>> error: returning %d\n", rc)); 7133 #ifdef NO_RECURSE 7134 release_match_heapframes(&frame_zero); 7135 #endif 7136 return rc; 7137 } 7138 7139 /* Handle partial matches - disable any mark data */ 7140 7141 if (match_partial != NULL) 7142 { 7143 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); 7144 md->mark = NULL; 7145 if (offsetcount > 1) 7146 { 7147 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); 7148 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); 7149 if (offsetcount > 2) 7150 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject); 7151 } 7152 rc = PCRE_ERROR_PARTIAL; 7153 } 7154 7155 /* This is the classic nomatch case */ 7156 7157 else 7158 { 7159 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 7160 rc = PCRE_ERROR_NOMATCH; 7161 } 7162 7163 /* Return the MARK data if it has been requested. 
*/ 7164 7165 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7166 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark; 7167 #ifdef NO_RECURSE 7168 release_match_heapframes(&frame_zero); 7169 #endif 7170 return rc; 7171 } 7172 7173 /* End of pcre_exec.c */ 7174