1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Copyright (c) 1997-2014 University of Cambridge 10 11 ----------------------------------------------------------------------------- 12 Redistribution and use in source and binary forms, with or without 13 modification, are permitted provided that the following conditions are met: 14 15 * Redistributions of source code must retain the above copyright notice, 16 this list of conditions and the following disclaimer. 17 18 * Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in the 20 documentation and/or other materials provided with the distribution. 21 22 * Neither the name of the University of Cambridge nor the names of its 23 contributors may be used to endorse or promote products derived from 24 this software without specific prior written permission. 25 26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 POSSIBILITY OF SUCH DAMAGE. 
-----------------------------------------------------------------------------
*/

/* This module contains pcre_exec(), the externally visible function that does
pattern matching using an NFA algorithm, trying to mimic Perl as closely as
possible. There are also some static supporting functions. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* NOTE(review): these three names are defined before pcre_internal.h is
included, presumably because macros in that header expand them - confirm
against the header before moving them. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */

#include "pcre_internal.h"

/* Undefine some potentially clashing cpp symbols */

#undef min
#undef max

/* The md->capture_last field uses the lower 16 bits for the last captured
substring (which can never be greater than 65535) and a bit in the top half
to mean "capture vector overflowed". This odd way of doing things was
implemented when it was realized that preserving and restoring the overflow bit
whenever the last capture number was saved/restored made for a neater
interface, and doing it this way saved on (a) another variable, which would
have increased the stack frame size (a big NO-NO in PCRE) and (b) another
separate set of save/restore instructions. The following defines are used in
implementing this. Note that OVFLMASK selects the whole upper half of the
field, whereas OVFLBIT is the single bit within it that is actually set. */

#define CAPLMASK    0x0000ffff    /* The bits used for last_capture */
#define OVFLMASK    0xffff0000    /* The bits used for the overflow flag */
#define OVFLBIT     0x00010000    /* The bit that is set for overflow */

/* Values for setting in md->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
#define MATCH_ONCE         (-997)
/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. MATCH_BACKTRACK_MIN and
MATCH_BACKTRACK_MAX name the two ends of that range. */
#define MATCH_COMMIT       (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. The value sizes the
stacksave array that match() keeps in each of its frames. */

#define REC_STACK_SAVE_MAX 30

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Both tables have the same number of entries and are meant to be indexed in
parallel. NOTE(review): the index is presumably the repeat opcode's offset
from the first opcode of its group - confirm at the points of use. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };

#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.
120 121 Arguments: 122 p points to characters 123 length number to print 124 is_subject TRUE if printing from within md->start_subject 125 md pointer to matching data block, if is_subject is TRUE 126 127 Returns: nothing 128 */ 129 130 static void 131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) 132 { 133 pcre_uint32 c; 134 BOOL utf = md->utf; 135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; 136 while (length-- > 0) 137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); 138 } 139 #endif 140 141 142 143 /************************************************* 144 * Match a back-reference * 145 *************************************************/ 146 147 /* Normally, if a back reference hasn't been set, the length that is passed is 148 negative, so the match always fails. However, in JavaScript compatibility mode, 149 the length passed is zero. Note that in caseless UTF-8 mode, the number of 150 subject bytes matched may be different to the number of reference bytes. 
151 152 Arguments: 153 offset index into the offset vector 154 eptr pointer into the subject 155 length length of reference to be matched (number of bytes) 156 md points to match data block 157 caseless TRUE if caseless 158 159 Returns: >= 0 the number of subject bytes matched 160 -1 no match 161 -2 partial match; always given if at end subject 162 */ 163 164 static int 165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, 166 BOOL caseless) 167 { 168 PCRE_PUCHAR eptr_start = eptr; 169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; 170 #if defined SUPPORT_UTF && defined SUPPORT_UCP 171 BOOL utf = md->utf; 172 #endif 173 174 #ifdef PCRE_DEBUG 175 if (eptr >= md->end_subject) 176 printf("matching subject <null>"); 177 else 178 { 179 printf("matching subject "); 180 pchars(eptr, length, TRUE, md); 181 } 182 printf(" against backref "); 183 pchars(p, length, FALSE, md); 184 printf("\n"); 185 #endif 186 187 /* Always fail if reference not set (and not JavaScript compatible - in that 188 case the length is passed as zero). */ 189 190 if (length < 0) return -1; 191 192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this 193 properly if Unicode properties are supported. Otherwise, we can check only 194 ASCII characters. */ 195 196 if (caseless) 197 { 198 #if defined SUPPORT_UTF && defined SUPPORT_UCP 199 if (utf) 200 { 201 /* Match characters up to the end of the reference. NOTE: the number of 202 data units matched may differ, because in UTF-8 there are some characters 203 whose upper and lower case versions code have different numbers of bytes. 204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a 206 sequence of two of the latter. It is important, therefore, to check the 207 length along the reference, not along the subject (earlier code did this 208 wrong). 
*/ 209 210 PCRE_PUCHAR endptr = p + length; 211 while (p < endptr) 212 { 213 pcre_uint32 c, d; 214 const ucd_record *ur; 215 if (eptr >= md->end_subject) return -2; /* Partial match */ 216 GETCHARINC(c, eptr); 217 GETCHARINC(d, p); 218 ur = GET_UCD(d); 219 if (c != d && c != d + ur->other_case) 220 { 221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; 222 for (;;) 223 { 224 if (c < *pp) return -1; 225 if (c == *pp++) break; 226 } 227 } 228 } 229 } 230 else 231 #endif 232 233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there 234 is no UCP support. */ 235 { 236 while (length-- > 0) 237 { 238 pcre_uint32 cc, cp; 239 if (eptr >= md->end_subject) return -2; /* Partial match */ 240 cc = UCHAR21TEST(eptr); 241 cp = UCHAR21TEST(p); 242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; 243 p++; 244 eptr++; 245 } 246 } 247 } 248 249 /* In the caseful case, we can just compare the bytes, whether or not we 250 are in UTF-8 mode. */ 251 252 else 253 { 254 while (length-- > 0) 255 { 256 if (eptr >= md->end_subject) return -2; /* Partial match */ 257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; 258 } 259 } 260 261 return (int)(eptr - eptr_start); 262 } 263 264 265 266 /*************************************************************************** 267 **************************************************************************** 268 RECURSION IN THE match() FUNCTION 269 270 The match() function is highly recursive, though not every recursive call 271 increases the recursive depth. Nevertheless, some regular expressions can cause 272 it to recurse to a great depth. I was writing for Unix, so I just let it call 273 itself recursively. This uses the stack for saving everything that has to be 274 saved for a recursive call. On Unix, the stack can be large, and this works 275 fine. 276 277 It turns out that on some non-Unix-like systems there are problems with 278 programs that use a lot of stack. 
(This despite the fact that every last chip
has oodles of memory these days, and techniques for extending the stack have
been known for decades.) So....

There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to
achieve this so that the actual code doesn't look very different to what it
always used to.

The original heap-recursive code used longjmp(). However, it seems that this
can be very slow on some operating systems. Following a suggestion from Stan
Switzer, the use of longjmp() has been abolished, at the cost of having to
provide a unique number for each call to RMATCH. There is no way of generating
a sequence of numbers at compile time in C. I have given them names, to make
them stand out more clearly.

Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
tests. Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/

/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync.
*/

/* Each name below identifies one RMATCH call site. It is passed as the last
("rw") argument of RMATCH; the heap-frame version of the macro stores it in
the frame and pastes it into a label name so that control can come back to the
right place. The numbering starts at 1. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66, RM67 };

/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rw) calls match() with ra, rb, rc, rd and re in the
eptr, ecode, offset_top, md and eptrb positions respectively. The match start
(mstart) and the recursion depth (rdepth, plus one) are taken from the
variables of those names in the calling scope, and the result is left in the
caller's rrc variable. RRETURN(ra) returns ra from the enclosing function.

The debugging versions expand to a brace-enclosed block, whereas the
production versions expand to a single statement without a trailing
semicolon. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.
*/

/* REGISTER is empty in this configuration, so declarations written with it
carry no storage-class specifier. */

#define REGISTER

/* RMATCH(ra,rb,rc,rd,re,rw) simulates a recursive call of match():

  . The frame that follows the current one is reused if it already exists;
    otherwise a new one is obtained through PUBL(stack_malloc) and linked
    after the current frame. If that allocation fails, RRETURN is invoked
    with PCRE_ERROR_NOMEMORY.
  . The call-site number rw is recorded in the current frame's Xwhere field.
  . The new frame is given ra, rb, rc and re as its eptr, ecode, offset_top
    and eptrb values, the current mstart, and a recursion depth one greater
    than the current frame's.
  . The new frame becomes the current one and control jumps to HEAP_RECURSE.

The label L_<rw> that the macro defines is where execution continues once the
simulated call has finished. Because a label is generated from rw, each rw
value can be used at only one call site within a function. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN(ra) simulates a return from match(). It steps back to the previous
frame; if there is one, ra is placed in rrc and control jumps to HEAP_RETURN.
If there is no previous frame, this is the outermost level and ra is returned
from the real function. The frame that is being left is not freed here; it
stays linked through its predecessor's Xnextframe field, which is what allows
RMATCH to reuse it. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. Each
field is named after the match() argument or local variable that it stands in
for, with an X prefix; match() accesses the fields through macros that map the
plain names onto the current frame. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* The frame to go back to on RRETURN */
  struct heapframe *Xnextframe;   /* Next frame, if one was ever allocated */

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  unsigned int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  unsigned int Xnumber;
  int Xoffset;
  unsigned int Xop;
  pcre_int32 Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to: the RMn value stored by RMATCH */

  int Xwhere;

} heapframe;

#endif


/***************************************************************************
***************************************************************************/



/*************************************************
*         Match from current position            *
*************************************************/

/* This function is called recursively in many circumstances. Whenever it
returns a negative (error) response, the outer incarnation must also return the
same response. */

/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching, we then return
immediately. The second one is used when we already know we are past the end of
the subject.

In the code, "past the start" is the test eptr > md->start_used_ptr, and "hard
partial matching" is md->partial > 1. Both macros expand to a bare if
statement that refers to eptr and md by name and may execute RRETURN, so they
can be used only inside match(), and care is needed if one is ever placed
directly before an else. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }


/* Performance note: It might be tempting to extract commonly used fields from
the md structure (e.g. utf, end_subject) into individual variables to improve
performance. Tests using gcc on a SPARC disproved this; in the first case, it
made performance worse.
493 494 Arguments: 495 eptr pointer to current character in subject 496 ecode pointer to current position in compiled code 497 mstart pointer to the current match start position (can be modified 498 by encountering \K) 499 offset_top current top pointer 500 md pointer to "static" info for the match 501 eptrb pointer to chain of blocks containing eptr at start of 502 brackets - for testing for empty matches 503 rdepth the recursion depth 504 505 Returns: MATCH_MATCH if matched ) these values are >= 0 506 MATCH_NOMATCH if failed to match ) 507 a negative MATCH_xxx value for PRUNE, SKIP, etc 508 a negative PCRE_ERROR_xxx value if aborted by an error condition 509 (e.g. stopped by repeated call or recursion limit) 510 */ 511 512 static int 513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode, 514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb, 515 unsigned int rdepth) 516 { 517 /* These variables do not need to be preserved over recursion in this function, 518 so they can be ordinary variables in all cases. Mark some of them with 519 "register" because they are used a lot in loops. */ 520 521 register int rrc; /* Returns from recursive calls */ 522 register int i; /* Used for loops not involving calls to RMATCH() */ 523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */ 524 register BOOL utf; /* Local copy of UTF flag for speed */ 525 526 BOOL minimize, possessive; /* Quantifier options */ 527 BOOL caseless; 528 int condcode; 529 530 /* When recursion is not being used, all "local" variables that have to be 531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level 532 frame on the stack here; subsequent instantiations are obtained from the heap 533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting 534 the top-level on the stack rather than malloc-ing them all gives a performance 535 boost in many cases where there is not much "recursion". 
*/ 536 537 #ifdef NO_RECURSE 538 heapframe *frame = (heapframe *)md->match_frames_base; 539 540 /* Copy in the original argument variables */ 541 542 frame->Xeptr = eptr; 543 frame->Xecode = ecode; 544 frame->Xmstart = mstart; 545 frame->Xoffset_top = offset_top; 546 frame->Xeptrb = eptrb; 547 frame->Xrdepth = rdepth; 548 549 /* This is where control jumps back to to effect "recursion" */ 550 551 HEAP_RECURSE: 552 553 /* Macros make the argument variables come from the current frame */ 554 555 #define eptr frame->Xeptr 556 #define ecode frame->Xecode 557 #define mstart frame->Xmstart 558 #define offset_top frame->Xoffset_top 559 #define eptrb frame->Xeptrb 560 #define rdepth frame->Xrdepth 561 562 /* Ditto for the local variables */ 563 564 #ifdef SUPPORT_UTF 565 #define charptr frame->Xcharptr 566 #endif 567 #define callpat frame->Xcallpat 568 #define codelink frame->Xcodelink 569 #define data frame->Xdata 570 #define next frame->Xnext 571 #define pp frame->Xpp 572 #define prev frame->Xprev 573 #define saved_eptr frame->Xsaved_eptr 574 575 #define new_recursive frame->Xnew_recursive 576 577 #define cur_is_word frame->Xcur_is_word 578 #define condition frame->Xcondition 579 #define prev_is_word frame->Xprev_is_word 580 581 #ifdef SUPPORT_UCP 582 #define prop_type frame->Xprop_type 583 #define prop_value frame->Xprop_value 584 #define prop_fail_result frame->Xprop_fail_result 585 #define oclength frame->Xoclength 586 #define occhars frame->Xocchars 587 #endif 588 589 #define ctype frame->Xctype 590 #define fc frame->Xfc 591 #define fi frame->Xfi 592 #define length frame->Xlength 593 #define max frame->Xmax 594 #define min frame->Xmin 595 #define number frame->Xnumber 596 #define offset frame->Xoffset 597 #define op frame->Xop 598 #define save_capture_last frame->Xsave_capture_last 599 #define save_offset1 frame->Xsave_offset1 600 #define save_offset2 frame->Xsave_offset2 601 #define save_offset3 frame->Xsave_offset3 602 #define stacksave frame->Xstacksave 603 604 
#define newptrb frame->Xnewptrb 605 606 /* When recursion is being used, local variables are allocated on the stack and 607 get preserved during recursion in the normal way. In this environment, fi and 608 i, and fc and c, can be the same variables. */ 609 610 #else /* NO_RECURSE not defined */ 611 #define fi i 612 #define fc c 613 614 /* Many of the following variables are used only in small blocks of the code. 615 My normal style of coding would have declared them within each of those blocks. 616 However, in order to accommodate the version of this code that uses an external 617 "stack" implemented on the heap, it is easier to declare them all here, so the 618 declarations can be cut out in a block. The only declarations within blocks 619 below are for variables that do not have to be preserved over a recursive call 620 to RMATCH(). */ 621 622 #ifdef SUPPORT_UTF 623 const pcre_uchar *charptr; 624 #endif 625 const pcre_uchar *callpat; 626 const pcre_uchar *data; 627 const pcre_uchar *next; 628 PCRE_PUCHAR pp; 629 const pcre_uchar *prev; 630 PCRE_PUCHAR saved_eptr; 631 632 recursion_info new_recursive; 633 634 BOOL cur_is_word; 635 BOOL condition; 636 BOOL prev_is_word; 637 638 #ifdef SUPPORT_UCP 639 int prop_type; 640 unsigned int prop_value; 641 int prop_fail_result; 642 int oclength; 643 pcre_uchar occhars[6]; 644 #endif 645 646 int codelink; 647 int ctype; 648 int length; 649 int max; 650 int min; 651 unsigned int number; 652 int offset; 653 unsigned int op; 654 pcre_int32 save_capture_last; 655 int save_offset1, save_offset2, save_offset3; 656 int stacksave[REC_STACK_SAVE_MAX]; 657 658 eptrblock newptrb; 659 660 /* There is a special fudge for calling match() in a way that causes it to 661 measure the size of its basic stack frame when the stack is being used for 662 recursion. The second argument (ecode) being NULL triggers this behaviour. It 663 cannot normally ever be NULL. The return is the negated value of the frame 664 size. 
*/ 665 666 if (ecode == NULL) 667 { 668 if (rdepth == 0) 669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1); 670 else 671 { 672 int len = (char *)&rdepth - (char *)eptr; 673 return (len > 0)? -len : len; 674 } 675 } 676 #endif /* NO_RECURSE */ 677 678 /* To save space on the stack and in the heap frame, I have doubled up on some 679 of the local variables that are used only in localised parts of the code, but 680 still need to be preserved over recursive calls of match(). These macros define 681 the alternative names that are used. */ 682 683 #define allow_zero cur_is_word 684 #define cbegroup condition 685 #define code_offset codelink 686 #define condassert condition 687 #define matched_once prev_is_word 688 #define foc number 689 #define save_mark data 690 691 /* These statements are here to stop the compiler complaining about unitialized 692 variables. */ 693 694 #ifdef SUPPORT_UCP 695 prop_value = 0; 696 prop_fail_result = 0; 697 #endif 698 699 700 /* This label is used for tail recursion, which is used in a few cases even 701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is 702 used. Thanks to Ian Taylor for noticing this possibility and sending the 703 original patch. */ 704 705 TAIL_RECURSE: 706 707 /* OK, now we can get on with the real code of the function. Recursive calls 708 are specified by the macro RMATCH and RRETURN is used to return. When 709 NO_RECURSE is *not* defined, these just turn into a recursive call to match() 710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is 711 defined). However, RMATCH isn't like a function call because it's quite a 712 complicated macro. It has to be used in one particular way. This shouldn't, 713 however, impact performance when true recursion is being used. 
*/ 714 715 #ifdef SUPPORT_UTF 716 utf = md->utf; /* Local copy of the flag */ 717 #else 718 utf = FALSE; 719 #endif 720 721 /* First check that we haven't called match() too many times, or that we 722 haven't exceeded the recursive call limit. */ 723 724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); 726 727 /* At the start of a group with an unlimited repeat that may match an empty 728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is 729 done this way to save having to use another function argument, which would take 730 up space on the stack. See also MATCH_CONDASSERT below. 731 732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of 733 such remembered pointers, to be checked when we hit the closing ket, in order 734 to break infinite loops that match no characters. When match() is called in 735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must 736 NOT be used with tail recursion, because the memory block that is used is on 737 the stack, so a new one may be required for each match(). */ 738 739 if (md->match_function_type == MATCH_CBEGROUP) 740 { 741 newptrb.epb_saved_eptr = eptr; 742 newptrb.epb_prev = eptrb; 743 eptrb = &newptrb; 744 md->match_function_type = 0; 745 } 746 747 /* Now start processing the opcodes. 
*/ 748 749 for (;;) 750 { 751 minimize = possessive = FALSE; 752 op = *ecode; 753 754 switch(op) 755 { 756 case OP_MARK: 757 md->nomatch_mark = ecode + 2; 758 md->mark = NULL; /* In case previously set by assertion */ 759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 760 eptrb, RM55); 761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 762 md->mark == NULL) md->mark = ecode + 2; 763 764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an 765 argument, and we must check whether that argument matches this MARK's 766 argument. It is passed back in md->start_match_ptr (an overloading of that 767 variable). If it does match, we reset that variable to the current subject 768 position and return MATCH_SKIP. Otherwise, pass back the return code 769 unaltered. */ 770 771 else if (rrc == MATCH_SKIP_ARG && 772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0) 773 { 774 md->start_match_ptr = eptr; 775 RRETURN(MATCH_SKIP); 776 } 777 RRETURN(rrc); 778 779 case OP_FAIL: 780 RRETURN(MATCH_NOMATCH); 781 782 case OP_COMMIT: 783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 784 eptrb, RM52); 785 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 786 RRETURN(MATCH_COMMIT); 787 788 case OP_PRUNE: 789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 790 eptrb, RM51); 791 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 792 RRETURN(MATCH_PRUNE); 793 794 case OP_PRUNE_ARG: 795 md->nomatch_mark = ecode + 2; 796 md->mark = NULL; /* In case previously set by assertion */ 797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 798 eptrb, RM56); 799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 800 md->mark == NULL) md->mark = ecode + 2; 801 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 802 RRETURN(MATCH_PRUNE); 803 804 case OP_SKIP: 805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 806 eptrb, RM53); 807 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 808 md->start_match_ptr = eptr; /* Pass back 
current position */ 809 RRETURN(MATCH_SKIP); 810 811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set 812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was 813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG 814 that failed and any that precede it (either they also failed, or were not 815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a 816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg 817 set to the count of the one that failed. */ 818 819 case OP_SKIP_ARG: 820 md->skip_arg_count++; 821 if (md->skip_arg_count <= md->ignore_skip_arg) 822 { 823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; 824 break; 825 } 826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 827 eptrb, RM57); 828 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 829 830 /* Pass back the current skip name by overloading md->start_match_ptr and 831 returning the special MATCH_SKIP_ARG return code. This will either be 832 caught by a matching MARK, or get to the top, where it causes a rematch 833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */ 834 835 md->start_match_ptr = ecode + 2; 836 RRETURN(MATCH_SKIP_ARG); 837 838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that 839 the branch in which it occurs can be determined. Overload the start of 840 match pointer to do this. 
*/ 841 842 case OP_THEN: 843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 844 eptrb, RM54); 845 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 846 md->start_match_ptr = ecode; 847 RRETURN(MATCH_THEN); 848 849 case OP_THEN_ARG: 850 md->nomatch_mark = ecode + 2; 851 md->mark = NULL; /* In case previously set by assertion */ 852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, 853 md, eptrb, RM58); 854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 855 md->mark == NULL) md->mark = ecode + 2; 856 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 857 md->start_match_ptr = ecode; 858 RRETURN(MATCH_THEN); 859 860 /* Handle an atomic group that does not contain any capturing parentheses. 861 This can be handled like an assertion. Prior to 8.13, all atomic groups 862 were handled this way. In 8.13, the code was changed as below for ONCE, so 863 that backups pass through the group and thereby reset captured values. 864 However, this uses a lot more stack, so in 8.20, atomic groups that do not 865 contain any captures generate OP_ONCE_NC, which can be handled in the old, 866 less stack intensive way. 867 868 Check the alternative branches in turn - the matching won't pass the KET 869 for this kind of subpattern. If any one branch matches, we carry on as at 870 the end of a normal bracket, leaving the subject pointer, but resetting 871 the start-of-match value in case it was changed by \K. 
*/ 872 873 case OP_ONCE_NC: 874 prev = ecode; 875 saved_eptr = eptr; 876 save_mark = md->mark; 877 do 878 { 879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64); 880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ 881 { 882 mstart = md->start_match_ptr; 883 break; 884 } 885 if (rrc == MATCH_THEN) 886 { 887 next = ecode + GET(ecode,1); 888 if (md->start_match_ptr < next && 889 (*ecode == OP_ALT || *next == OP_ALT)) 890 rrc = MATCH_NOMATCH; 891 } 892 893 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 894 ecode += GET(ecode,1); 895 md->mark = save_mark; 896 } 897 while (*ecode == OP_ALT); 898 899 /* If hit the end of the group (which could be repeated), fail */ 900 901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 902 903 /* Continue as from after the group, updating the offsets high water 904 mark, since extracts may have been taken. */ 905 906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 907 908 offset_top = md->end_offset_top; 909 eptr = md->end_match_ptr; 910 911 /* For a non-repeating ket, just continue at this level. This also 912 happens for a repeating ket if no characters were matched in the group. 913 This is the forcible breaking of infinite loops as implemented in Perl 914 5.005. */ 915 916 if (*ecode == OP_KET || eptr == saved_eptr) 917 { 918 ecode += 1+LINK_SIZE; 919 break; 920 } 921 922 /* The repeating kets try the rest of the pattern or restart from the 923 preceding bracket, in the appropriate order. The second "call" of match() 924 uses tail recursion, to avoid using another stack frame. 
*/ 925 926 if (*ecode == OP_KETRMIN) 927 { 928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65); 929 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 930 ecode = prev; 931 goto TAIL_RECURSE; 932 } 933 else /* OP_KETRMAX */ 934 { 935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66); 936 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 937 ecode += 1 + LINK_SIZE; 938 goto TAIL_RECURSE; 939 } 940 /* Control never gets here */ 941 942 /* Handle a capturing bracket, other than those that are possessive with an 943 unlimited repeat. If there is space in the offset vector, save the current 944 subject position in the working slot at the top of the vector. We mustn't 945 change the current values of the data slot, because they may be set from a 946 previous iteration of this group, and be referred to by a reference inside 947 the group. A failure to match might occur after the group has succeeded, 948 if something later on doesn't match. For this reason, we need to restore 949 the working value and also the values of the final offsets, in case they 950 were set by a previous iteration of the same bracket. 951 952 If there isn't enough space in the offset vector, treat this as if it were 953 a non-capturing bracket. Don't worry about setting the flag for the error 954 case here; that is handled in the code for KET. 
*/ 955 956 case OP_CBRA: 957 case OP_SCBRA: 958 number = GET2(ecode, 1+LINK_SIZE); 959 offset = number << 1; 960 961 #ifdef PCRE_DEBUG 962 printf("start bracket %d\n", number); 963 printf("subject="); 964 pchars(eptr, 16, TRUE, md); 965 printf("\n"); 966 #endif 967 968 if (offset < md->offset_max) 969 { 970 save_offset1 = md->offset_vector[offset]; 971 save_offset2 = md->offset_vector[offset+1]; 972 save_offset3 = md->offset_vector[md->offset_end - number]; 973 save_capture_last = md->capture_last; 974 save_mark = md->mark; 975 976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 977 md->offset_vector[md->offset_end - number] = 978 (int)(eptr - md->start_subject); 979 980 for (;;) 981 { 982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 984 eptrb, RM1); 985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ 986 987 /* If we backed up to a THEN, check whether it is within the current 988 branch by comparing the address of the THEN that is passed back with 989 the end of the branch. If it is within the current branch, and the 990 branch is one of two or more alternatives (it either starts or ends 991 with OP_ALT), we have reached the limit of THEN's action, so convert 992 the return code to NOMATCH, which will cause normal backtracking to 993 happen from now on. Otherwise, THEN is passed back to an outer 994 alternative. This implements Perl's treatment of parenthesized groups, 995 where a group not containing | does not affect the current alternative, 996 that is, (X) is NOT the same as (X|(*F)). */ 997 998 if (rrc == MATCH_THEN) 999 { 1000 next = ecode + GET(ecode,1); 1001 if (md->start_match_ptr < next && 1002 (*ecode == OP_ALT || *next == OP_ALT)) 1003 rrc = MATCH_NOMATCH; 1004 } 1005 1006 /* Anything other than NOMATCH is passed back. 
*/ 1007 1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1009 md->capture_last = save_capture_last; 1010 ecode += GET(ecode, 1); 1011 md->mark = save_mark; 1012 if (*ecode != OP_ALT) break; 1013 } 1014 1015 DPRINTF(("bracket %d failed\n", number)); 1016 md->offset_vector[offset] = save_offset1; 1017 md->offset_vector[offset+1] = save_offset2; 1018 md->offset_vector[md->offset_end - number] = save_offset3; 1019 1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */ 1021 1022 RRETURN(rrc); 1023 } 1024 1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1026 as a non-capturing bracket. */ 1027 1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1030 1031 DPRINTF(("insufficient capture room: treat as non-capturing\n")); 1032 1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1035 1036 /* Non-capturing or atomic group, except for possessive with unlimited 1037 repeat and ONCE group with no captures. Loop for all the alternatives. 1038 1039 When we get to the final alternative within the brackets, we used to return 1040 the result of a recursive call to match() whatever happened so it was 1041 possible to reduce stack usage by turning this into a tail recursion, 1042 except in the case of a possibly empty group. However, now that there is 1043 the possibility of (*THEN) occurring in the final alternative, this 1044 optimization is no longer always possible. 1045 1046 We can optimize if we know there are no (*THEN)s in the pattern; at present 1047 this is the best that can be done. 1048 1049 MATCH_ONCE is returned when the end of an atomic group is successfully 1050 reached, but subsequent matching fails. It passes back up the tree (causing 1051 captured values to be reset) until the original atomic group level is 1052 reached. This is tested by comparing md->once_target with the start of the 1053 group.
At this point, the return is converted into MATCH_NOMATCH so that 1054 previous backup points can be taken. */ 1055 1056 case OP_ONCE: 1057 case OP_BRA: 1058 case OP_SBRA: 1059 DPRINTF(("start non-capturing bracket\n")); 1060 1061 for (;;) 1062 { 1063 if (op >= OP_SBRA || op == OP_ONCE) 1064 md->match_function_type = MATCH_CBEGROUP; 1065 1066 /* If this is not a possibly empty group, and there are no (*THEN)s in 1067 the pattern, and this is the final alternative, optimize as described 1068 above. */ 1069 1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT) 1071 { 1072 ecode += PRIV(OP_lengths)[*ecode]; 1073 goto TAIL_RECURSE; 1074 } 1075 1076 /* In all other cases, we have to make another call to match(). */ 1077 1078 save_mark = md->mark; 1079 save_capture_last = md->capture_last; 1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, 1081 RM2); 1082 1083 /* See comment in the code for capturing groups above about handling 1084 THEN. */ 1085 1086 if (rrc == MATCH_THEN) 1087 { 1088 next = ecode + GET(ecode,1); 1089 if (md->start_match_ptr < next && 1090 (*ecode == OP_ALT || *next == OP_ALT)) 1091 rrc = MATCH_NOMATCH; 1092 } 1093 1094 if (rrc != MATCH_NOMATCH) 1095 { 1096 if (rrc == MATCH_ONCE) 1097 { 1098 const pcre_uchar *scode = ecode; 1099 if (*scode != OP_ONCE) /* If not at start, find it */ 1100 { 1101 while (*scode == OP_ALT) scode += GET(scode, 1); 1102 scode -= GET(scode, 1); 1103 } 1104 if (md->once_target == scode) rrc = MATCH_NOMATCH; 1105 } 1106 RRETURN(rrc); 1107 } 1108 ecode += GET(ecode, 1); 1109 md->mark = save_mark; 1110 if (*ecode != OP_ALT) break; 1111 md->capture_last = save_capture_last; 1112 } 1113 1114 RRETURN(MATCH_NOMATCH); 1115 1116 /* Handle possessive capturing brackets with an unlimited repeat. We come 1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are 1118 handled similarly to the normal case above. However, the matching is 1119 different. 
The end of these brackets will always be OP_KETRPOS, which 1120 returns MATCH_KETRPOS without going further in the pattern. By this means 1121 we can handle the group by iteration rather than recursion, thereby 1122 reducing the amount of stack needed. */ 1123 1124 case OP_CBRAPOS: 1125 case OP_SCBRAPOS: 1126 allow_zero = FALSE; 1127 1128 POSSESSIVE_CAPTURE: 1129 number = GET2(ecode, 1+LINK_SIZE); 1130 offset = number << 1; 1131 1132 #ifdef PCRE_DEBUG 1133 printf("start possessive bracket %d\n", number); 1134 printf("subject="); 1135 pchars(eptr, 16, TRUE, md); 1136 printf("\n"); 1137 #endif 1138 1139 if (offset < md->offset_max) 1140 { 1141 matched_once = FALSE; 1142 code_offset = (int)(ecode - md->start_code); 1143 1144 save_offset1 = md->offset_vector[offset]; 1145 save_offset2 = md->offset_vector[offset+1]; 1146 save_offset3 = md->offset_vector[md->offset_end - number]; 1147 save_capture_last = md->capture_last; 1148 1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 1150 1151 /* Each time round the loop, save the current subject position for use 1152 when the group matches. For MATCH_MATCH, the group has matched, so we 1153 restart it with a new subject starting position, remembering that we had 1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as 1155 usual. If we haven't matched any alternatives in any iteration, check to 1156 see if a previous iteration matched. If so, the group has matched; 1157 continue from afterwards. Otherwise it has failed; restore the previous 1158 capture values before returning NOMATCH. 
*/ 1159 1160 for (;;) 1161 { 1162 md->offset_vector[md->offset_end - number] = 1163 (int)(eptr - md->start_subject); 1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1166 eptrb, RM63); 1167 if (rrc == MATCH_KETRPOS) 1168 { 1169 offset_top = md->end_offset_top; 1170 ecode = md->start_code + code_offset; 1171 save_capture_last = md->capture_last; 1172 matched_once = TRUE; 1173 mstart = md->start_match_ptr; /* In case \K changed it */ 1174 if (eptr == md->end_match_ptr) /* Matched an empty string */ 1175 { 1176 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1177 break; 1178 } 1179 eptr = md->end_match_ptr; 1180 continue; 1181 } 1182 1183 /* See comment in the code for capturing groups above about handling 1184 THEN. */ 1185 1186 if (rrc == MATCH_THEN) 1187 { 1188 next = ecode + GET(ecode,1); 1189 if (md->start_match_ptr < next && 1190 (*ecode == OP_ALT || *next == OP_ALT)) 1191 rrc = MATCH_NOMATCH; 1192 } 1193 1194 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1195 md->capture_last = save_capture_last; 1196 ecode += GET(ecode, 1); 1197 if (*ecode != OP_ALT) break; 1198 } 1199 1200 if (!matched_once) 1201 { 1202 md->offset_vector[offset] = save_offset1; 1203 md->offset_vector[offset+1] = save_offset2; 1204 md->offset_vector[md->offset_end - number] = save_offset3; 1205 } 1206 1207 if (allow_zero || matched_once) 1208 { 1209 ecode += 1 + LINK_SIZE; 1210 break; 1211 } 1212 1213 RRETURN(MATCH_NOMATCH); 1214 } 1215 1216 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1217 as a non-capturing bracket. */ 1218 1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1221 1222 DPRINTF(("insufficient capture room: treat as non-capturing\n")); 1223 1224 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1225 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1226 1227 /* Non-capturing possessive bracket with unlimited repeat. We come here 1228 from BRAZERO with allow_zero = TRUE. 
The code is similar to the above, 1229 without the capturing complication. It is written out separately for speed 1230 and cleanliness. */ 1231 1232 case OP_BRAPOS: 1233 case OP_SBRAPOS: 1234 allow_zero = FALSE; 1235 1236 POSSESSIVE_NON_CAPTURE: 1237 matched_once = FALSE; 1238 code_offset = (int)(ecode - md->start_code); 1239 save_capture_last = md->capture_last; 1240 1241 for (;;) 1242 { 1243 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1244 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1245 eptrb, RM48); 1246 if (rrc == MATCH_KETRPOS) 1247 { 1248 offset_top = md->end_offset_top; 1249 ecode = md->start_code + code_offset; 1250 matched_once = TRUE; 1251 mstart = md->start_match_ptr; /* In case \K reset it */ 1252 if (eptr == md->end_match_ptr) /* Matched an empty string */ 1253 { 1254 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1255 break; 1256 } 1257 eptr = md->end_match_ptr; 1258 continue; 1259 } 1260 1261 /* See comment in the code for capturing groups above about handling 1262 THEN. */ 1263 1264 if (rrc == MATCH_THEN) 1265 { 1266 next = ecode + GET(ecode,1); 1267 if (md->start_match_ptr < next && 1268 (*ecode == OP_ALT || *next == OP_ALT)) 1269 rrc = MATCH_NOMATCH; 1270 } 1271 1272 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1273 ecode += GET(ecode, 1); 1274 if (*ecode != OP_ALT) break; 1275 md->capture_last = save_capture_last; 1276 } 1277 1278 if (matched_once || allow_zero) 1279 { 1280 ecode += 1 + LINK_SIZE; 1281 break; 1282 } 1283 RRETURN(MATCH_NOMATCH); 1284 1285 /* Control never reaches here. */ 1286 1287 /* Conditional group: compilation checked that there are no more than two 1288 branches. If the condition is false, skipping the first branch takes us 1289 past the end of the item if there is only one branch, but that's exactly 1290 what we want. */ 1291 1292 case OP_COND: 1293 case OP_SCOND: 1294 1295 /* The variable codelink will be added to ecode when the condition is 1296 false, to get to the second branch. 
Setting it to the offset to the ALT 1297 or KET, then incrementing ecode achieves this effect. We now have ecode 1298 pointing to the condition or callout. */ 1299 1300 codelink = GET(ecode, 1); /* Offset to the second branch */ 1301 ecode += 1 + LINK_SIZE; /* From this opcode */ 1302 1303 /* Because of the way auto-callout works during compile, a callout item is 1304 inserted between OP_COND and an assertion condition. */ 1305 1306 if (*ecode == OP_CALLOUT) 1307 { 1308 if (PUBL(callout) != NULL) 1309 { 1310 PUBL(callout_block) cb; 1311 cb.version = 2; /* Version 2 of the callout block */ 1312 cb.callout_number = ecode[1]; 1313 cb.offset_vector = md->offset_vector; 1314 #if defined COMPILE_PCRE8 1315 cb.subject = (PCRE_SPTR)md->start_subject; 1316 #elif defined COMPILE_PCRE16 1317 cb.subject = (PCRE_SPTR16)md->start_subject; 1318 #elif defined COMPILE_PCRE32 1319 cb.subject = (PCRE_SPTR32)md->start_subject; 1320 #endif 1321 cb.subject_length = (int)(md->end_subject - md->start_subject); 1322 cb.start_match = (int)(mstart - md->start_subject); 1323 cb.current_position = (int)(eptr - md->start_subject); 1324 cb.pattern_position = GET(ecode, 2); 1325 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1326 cb.capture_top = offset_top/2; 1327 cb.capture_last = md->capture_last & CAPLMASK; 1328 /* Internal change requires this for API compatibility. */ 1329 if (cb.capture_last == 0) cb.capture_last = -1; 1330 cb.callout_data = md->callout_data; 1331 cb.mark = md->nomatch_mark; 1332 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1333 if (rrc < 0) RRETURN(rrc); 1334 } 1335 1336 /* Advance ecode past the callout, so it now points to the condition. We 1337 must adjust codelink so that the value of ecode+codelink is unchanged.
*/ 1338 1339 ecode += PRIV(OP_lengths)[OP_CALLOUT]; 1340 codelink -= PRIV(OP_lengths)[OP_CALLOUT]; 1341 } 1342 1343 /* Test the various possible conditions */ 1344 1345 condition = FALSE; 1346 switch(condcode = *ecode) 1347 { 1348 case OP_RREF: /* Numbered group recursion test */ 1349 if (md->recursive != NULL) /* Not recursing => FALSE */ 1350 { 1351 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/ 1352 condition = (recno == RREF_ANY || recno == md->recursive->group_num); 1353 } 1354 break; 1355 1356 case OP_DNRREF: /* Duplicate named group recursion test */ 1357 if (md->recursive != NULL) 1358 { 1359 int count = GET2(ecode, 1 + IMM2_SIZE); 1360 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 1361 while (count-- > 0) 1362 { 1363 unsigned int recno = GET2(slot, 0); 1364 condition = recno == md->recursive->group_num; 1365 if (condition) break; 1366 slot += md->name_entry_size; 1367 } 1368 } 1369 break; 1370 1371 case OP_CREF: /* Numbered group used test */ 1372 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 1373 condition = offset < offset_top && md->offset_vector[offset] >= 0; 1374 break; 1375 1376 case OP_DNCREF: /* Duplicate named group used test */ 1377 { 1378 int count = GET2(ecode, 1 + IMM2_SIZE); 1379 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 1380 while (count-- > 0) 1381 { 1382 offset = GET2(slot, 0) << 1; 1383 condition = offset < offset_top && md->offset_vector[offset] >= 0; 1384 if (condition) break; 1385 slot += md->name_entry_size; 1386 } 1387 } 1388 break; 1389 1390 case OP_DEF: /* DEFINE - always false */ 1391 break; 1392 1393 /* The condition is an assertion. Call match() to evaluate it - setting 1394 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end 1395 of an assertion. 
*/ 1396 1397 default: 1398 md->match_function_type = MATCH_CONDASSERT; 1399 RMATCH(eptr, ecode, offset_top, md, NULL, RM3); 1400 if (rrc == MATCH_MATCH) 1401 { 1402 if (md->end_offset_top > offset_top) 1403 offset_top = md->end_offset_top; /* Captures may have happened */ 1404 condition = TRUE; 1405 1406 /* Advance ecode past the assertion to the start of the first branch, 1407 but adjust it so that the general choosing code below works. */ 1408 1409 ecode += GET(ecode, 1); 1410 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 1411 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; 1412 } 1413 1414 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an 1415 assertion; it is therefore treated as NOMATCH. Any other return is an 1416 error. */ 1417 1418 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) 1419 { 1420 RRETURN(rrc); /* Need braces because of following else */ 1421 } 1422 break; 1423 } 1424 1425 /* Choose branch according to the condition */ 1426 1427 ecode += condition? PRIV(OP_lengths)[condcode] : codelink; 1428 1429 /* We are now at the branch that is to be obeyed. As there is only one, we 1430 can use tail recursion to avoid using another stack frame, except when 1431 there is unlimited repeat of a possibly empty group. In the latter case, a 1432 recursive call to match() is always required, unless the second alternative 1433 doesn't exist, in which case we can just plough on. Note that, for 1434 compatibility with Perl, the | in a conditional group is NOT treated as 1435 creating two alternatives. If a THEN is encountered in the branch, it 1436 propagates out to the enclosing alternative (unless nested in a deeper set 1437 of alternatives, of course). 
*/ 1438 1439 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) 1440 { 1441 if (op != OP_SCOND) 1442 { 1443 goto TAIL_RECURSE; 1444 } 1445 1446 md->match_function_type = MATCH_CBEGROUP; 1447 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49); 1448 RRETURN(rrc); 1449 } 1450 1451 /* Condition false & no alternative; continue after the group. */ 1452 1453 else 1454 { 1455 } 1456 break; 1457 1458 1459 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, 1460 to close any currently open capturing brackets. */ 1461 1462 case OP_CLOSE: 1463 number = GET2(ecode, 1); /* Must be less than 65536 */ 1464 offset = number << 1; 1465 1466 #ifdef PCRE_DEBUG 1467 printf("end bracket %d at *ACCEPT", number); 1468 printf("\n"); 1469 #endif 1470 1471 md->capture_last = (md->capture_last & OVFLMASK) | number; 1472 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else 1473 { 1474 md->offset_vector[offset] = 1475 md->offset_vector[md->offset_end - number]; 1476 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1477 if (offset_top <= offset) offset_top = offset + 2; 1478 } 1479 ecode += 1 + IMM2_SIZE; 1480 break; 1481 1482 1483 /* End of the pattern, either real or forced. */ 1484 1485 case OP_END: 1486 case OP_ACCEPT: 1487 case OP_ASSERT_ACCEPT: 1488 1489 /* If we have matched an empty string, fail if not in an assertion and not 1490 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART 1491 is set and we have matched at the start of the subject. In both cases, 1492 backtracking will then try other alternatives, if any. */ 1493 1494 if (eptr == mstart && op != OP_ASSERT_ACCEPT && 1495 md->recursive == NULL && 1496 (md->notempty || 1497 (md->notempty_atstart && 1498 mstart == md->start_subject + md->start_offset))) 1499 RRETURN(MATCH_NOMATCH); 1500 1501 /* Otherwise, we have a match. 
*/ 1502 1503 md->end_match_ptr = eptr; /* Record where we ended */ 1504 md->end_offset_top = offset_top; /* and how many extracts were taken */ 1505 md->start_match_ptr = mstart; /* and the start (\K can modify) */ 1506 1507 /* For some reason, the macros don't work properly if an expression is 1508 given as the argument to RRETURN when the heap is in use. */ 1509 1510 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; 1511 RRETURN(rrc); 1512 1513 /* Assertion brackets. Check the alternative branches in turn - the 1514 matching won't pass the KET for an assertion. If any one branch matches, 1515 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 1516 start of each branch to move the current point backwards, so the code at 1517 this level is identical to the lookahead case. When the assertion is part 1518 of a condition, we want to return immediately afterwards. The caller of 1519 this incarnation of the match() function will have set MATCH_CONDASSERT in 1520 md->match_function type, and one of these opcodes will be the first opcode 1521 that is processed. We use a local variable that is preserved over calls to 1522 match() to remember this case. */ 1523 1524 case OP_ASSERT: 1525 case OP_ASSERTBACK: 1526 save_mark = md->mark; 1527 if (md->match_function_type == MATCH_CONDASSERT) 1528 { 1529 condassert = TRUE; 1530 md->match_function_type = 0; 1531 } 1532 else condassert = FALSE; 1533 1534 /* Loop for each branch */ 1535 1536 do 1537 { 1538 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); 1539 1540 /* A match means that the assertion is true; break out of the loop 1541 that matches its alternatives. */ 1542 1543 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1544 { 1545 mstart = md->start_match_ptr; /* In case \K reset it */ 1546 break; 1547 } 1548 1549 /* If not matched, restore the previous mark setting. */ 1550 1551 md->mark = save_mark; 1552 1553 /* See comment in the code for capturing groups above about handling 1554 THEN. 
*/ 1555 1556 if (rrc == MATCH_THEN) 1557 { 1558 next = ecode + GET(ecode,1); 1559 if (md->start_match_ptr < next && 1560 (*ecode == OP_ALT || *next == OP_ALT)) 1561 rrc = MATCH_NOMATCH; 1562 } 1563 1564 /* Anything other than NOMATCH causes the entire assertion to fail, 1565 passing back the return code. This includes COMMIT, SKIP, PRUNE and an 1566 uncaptured THEN, which means they take their normal effect. This 1567 consistent approach does not always have exactly the same effect as in 1568 Perl. */ 1569 1570 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1571 ecode += GET(ecode, 1); 1572 } 1573 while (*ecode == OP_ALT); /* Continue for next alternative */ 1574 1575 /* If we have tried all the alternative branches, the assertion has 1576 failed. If not, we broke out after a match. */ 1577 1578 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 1579 1580 /* If checking an assertion for a condition, return MATCH_MATCH. */ 1581 1582 if (condassert) RRETURN(MATCH_MATCH); 1583 1584 /* Continue from after a successful assertion, updating the offsets high 1585 water mark, since extracts may have been taken during the assertion. */ 1586 1587 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1588 ecode += 1 + LINK_SIZE; 1589 offset_top = md->end_offset_top; 1590 continue; 1591 1592 /* Negative assertion: all branches must fail to match for the assertion to 1593 succeed. */ 1594 1595 case OP_ASSERT_NOT: 1596 case OP_ASSERTBACK_NOT: 1597 save_mark = md->mark; 1598 if (md->match_function_type == MATCH_CONDASSERT) 1599 { 1600 condassert = TRUE; 1601 md->match_function_type = 0; 1602 } 1603 else condassert = FALSE; 1604 1605 /* Loop for each alternative branch. */ 1606 1607 do 1608 { 1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); 1610 md->mark = save_mark; /* Always restore the mark setting */ 1611 1612 switch(rrc) 1613 { 1614 case MATCH_MATCH: /* A successful match means */ 1615 case MATCH_ACCEPT: /* the assertion has failed. 
*/ 1616 RRETURN(MATCH_NOMATCH); 1617 1618 case MATCH_NOMATCH: /* Carry on with next branch */ 1619 break; 1620 1621 /* See comment in the code for capturing groups above about handling 1622 THEN. */ 1623 1624 case MATCH_THEN: 1625 next = ecode + GET(ecode,1); 1626 if (md->start_match_ptr < next && 1627 (*ecode == OP_ALT || *next == OP_ALT)) 1628 { 1629 rrc = MATCH_NOMATCH; 1630 break; 1631 } 1632 /* Otherwise fall through. */ 1633 1634 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole 1635 assertion to fail to match, without considering any more alternatives. 1636 Failing to match means the assertion is true. This is a consistent 1637 approach, but does not always have the same effect as in Perl. */ 1638 1639 case MATCH_COMMIT: 1640 case MATCH_SKIP: 1641 case MATCH_SKIP_ARG: 1642 case MATCH_PRUNE: 1643 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1644 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ 1645 1646 /* Anything else is an error */ 1647 1648 default: 1649 RRETURN(rrc); 1650 } 1651 1652 /* Continue with next branch */ 1653 1654 ecode += GET(ecode,1); 1655 } 1656 while (*ecode == OP_ALT); 1657 1658 /* All branches in the assertion failed to match. */ 1659 1660 NEG_ASSERT_TRUE: 1661 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ 1662 ecode += 1 + LINK_SIZE; /* Continue with current branch */ 1663 continue; 1664 1665 /* Move the subject pointer back. This occurs only at the start of 1666 each branch of a lookbehind assertion. If we are too close to the start to 1667 move back, this match function fails. When working with UTF-8 we move 1668 back a number of characters, not bytes. 
*/ 1669 1670 case OP_REVERSE: 1671 #ifdef SUPPORT_UTF 1672 if (utf) 1673 { 1674 i = GET(ecode, 1); 1675 while (i-- > 0) 1676 { 1677 eptr--; 1678 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1679 BACKCHAR(eptr); 1680 } 1681 } 1682 else 1683 #endif 1684 1685 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 1686 1687 { 1688 eptr -= GET(ecode, 1); 1689 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1690 } 1691 1692 /* Save the earliest consulted character, then skip to next op code */ 1693 1694 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; 1695 ecode += 1 + LINK_SIZE; 1696 break; 1697 1698 /* The callout item calls an external function, if one is provided, passing 1699 details of the match so far. This is mainly for debugging, though the 1700 function is able to force a failure. */ 1701 1702 case OP_CALLOUT: 1703 if (PUBL(callout) != NULL) 1704 { 1705 PUBL(callout_block) cb; 1706 cb.version = 2; /* Version 2 of the callout block */ 1707 cb.callout_number = ecode[1]; 1708 cb.offset_vector = md->offset_vector; 1709 #if defined COMPILE_PCRE8 1710 cb.subject = (PCRE_SPTR)md->start_subject; 1711 #elif defined COMPILE_PCRE16 1712 cb.subject = (PCRE_SPTR16)md->start_subject; 1713 #elif defined COMPILE_PCRE32 1714 cb.subject = (PCRE_SPTR32)md->start_subject; 1715 #endif 1716 cb.subject_length = (int)(md->end_subject - md->start_subject); 1717 cb.start_match = (int)(mstart - md->start_subject); 1718 cb.current_position = (int)(eptr - md->start_subject); 1719 cb.pattern_position = GET(ecode, 2); 1720 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1721 cb.capture_top = offset_top/2; 1722 cb.capture_last = md->capture_last & CAPLMASK; 1723 /* Internal change requires this for API compatibility.
*/ 1724 if (cb.capture_last == 0) cb.capture_last = -1; 1725 cb.callout_data = md->callout_data; 1726 cb.mark = md->nomatch_mark; 1727 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1728 if (rrc < 0) RRETURN(rrc); 1729 } 1730 ecode += 2 + 2*LINK_SIZE; 1731 break; 1732 1733 /* Recursion either matches the current regex, or some subexpression. The 1734 offset data is the offset to the starting bracket from the start of the 1735 whole pattern. (This is so that it works from duplicated subpatterns.) 1736 1737 The state of the capturing groups is preserved over recursion, and 1738 re-instated afterwards. We don't know how many are started and not yet 1739 finished (offset_top records the completed total) so we just have to save 1740 all the potential data. There may be up to 65535 such values, which is too 1741 large to put on the stack, but using malloc for small numbers seems 1742 expensive. As a compromise, the stack is used when there are no more than 1743 REC_STACK_SAVE_MAX values to store; otherwise malloc is used. 1744 1745 There are also other values that have to be saved. We use a chained 1746 sequence of blocks that actually live on the stack. Thanks to Robin Houston 1747 for the original version of this logic. It has, however, been hacked around 1748 a lot, so he is not to blame for the current way it works. */ 1749 1750 case OP_RECURSE: 1751 { 1752 recursion_info *ri; 1753 unsigned int recno; 1754 1755 callpat = md->start_code + GET(ecode, 1); 1756 recno = (callpat == md->start_code)? 0 : 1757 GET2(callpat, 1 + LINK_SIZE); 1758 1759 /* Check for repeating a recursion without advancing the subject pointer. 1760 This should catch convoluted mutual recursions. (Some simple cases are 1761 caught at compile time.) 
*/ 1762 1763 for (ri = md->recursive; ri != NULL; ri = ri->prevrec) 1764 if (recno == ri->group_num && eptr == ri->subject_position) 1765 RRETURN(PCRE_ERROR_RECURSELOOP); 1766 1767 /* Add to "recursing stack" */ 1768 1769 new_recursive.group_num = recno; 1770 new_recursive.saved_capture_last = md->capture_last; 1771 new_recursive.subject_position = eptr; 1772 new_recursive.prevrec = md->recursive; 1773 md->recursive = &new_recursive; 1774 1775 /* Where to continue from afterwards */ 1776 1777 ecode += 1 + LINK_SIZE; 1778 1779 /* Now save the offset data */ 1780 1781 new_recursive.saved_max = md->offset_end; 1782 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 1783 new_recursive.offset_save = stacksave; 1784 else 1785 { 1786 new_recursive.offset_save = 1787 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int)); 1788 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 1789 } 1790 memcpy(new_recursive.offset_save, md->offset_vector, 1791 new_recursive.saved_max * sizeof(int)); 1792 1793 /* OK, now we can do the recursion. After processing each alternative, 1794 restore the offset data and the last captured value. If there were nested 1795 recursions, md->recursive might be changed, so reset it before looping. 
1796 */ 1797 1798 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 1799 cbegroup = (*callpat >= OP_SBRA); 1800 do 1801 { 1802 if (cbegroup) md->match_function_type = MATCH_CBEGROUP; 1803 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, 1804 md, eptrb, RM6); 1805 memcpy(md->offset_vector, new_recursive.offset_save, 1806 new_recursive.saved_max * sizeof(int)); 1807 md->capture_last = new_recursive.saved_capture_last; 1808 md->recursive = new_recursive.prevrec; 1809 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1810 { 1811 DPRINTF(("Recursion matched\n")); 1812 if (new_recursive.offset_save != stacksave) 1813 (PUBL(free))(new_recursive.offset_save); 1814 1815 /* Set where we got to in the subject, and reset the start in case 1816 it was changed by \K. This *is* propagated back out of a recursion, 1817 for Perl compatibility. */ 1818 1819 eptr = md->end_match_ptr; 1820 mstart = md->start_match_ptr; 1821 goto RECURSION_MATCHED; /* Exit loop; end processing */ 1822 } 1823 1824 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a 1825 recursion; they cause a NOMATCH for the entire recursion. These codes 1826 are defined in a range that can be tested for. */ 1827 1828 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) 1829 RRETURN(MATCH_NOMATCH); 1830 1831 /* Any return code other than NOMATCH is an error. 
*/ 1832 1833 if (rrc != MATCH_NOMATCH) 1834 { 1835 DPRINTF(("Recursion gave error %d\n", rrc)); 1836 if (new_recursive.offset_save != stacksave) 1837 (PUBL(free))(new_recursive.offset_save); 1838 RRETURN(rrc); 1839 } 1840 1841 md->recursive = &new_recursive; 1842 callpat += GET(callpat, 1); 1843 } 1844 while (*callpat == OP_ALT); 1845 1846 DPRINTF(("Recursion didn't match\n")); 1847 md->recursive = new_recursive.prevrec; 1848 if (new_recursive.offset_save != stacksave) 1849 (PUBL(free))(new_recursive.offset_save); 1850 RRETURN(MATCH_NOMATCH); 1851 } 1852 1853 RECURSION_MATCHED: 1854 break; 1855 1856 /* An alternation is the end of a branch; scan along to find the end of the 1857 bracketed group and go to there. */ 1858 1859 case OP_ALT: 1860 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1861 break; 1862 1863 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, 1864 indicating that it may occur zero times. It may repeat infinitely, or not 1865 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets 1866 with fixed upper repeat limits are compiled as a number of copies, with the 1867 optional ones preceded by BRAZERO or BRAMINZERO. */ 1868 1869 case OP_BRAZERO: 1870 next = ecode + 1; 1871 RMATCH(eptr, next, offset_top, md, eptrb, RM10); 1872 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1873 do next += GET(next, 1); while (*next == OP_ALT); 1874 ecode = next + 1 + LINK_SIZE; 1875 break; 1876 1877 case OP_BRAMINZERO: 1878 next = ecode + 1; 1879 do next += GET(next, 1); while (*next == OP_ALT); 1880 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11); 1881 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1882 ecode++; 1883 break; 1884 1885 case OP_SKIPZERO: 1886 next = ecode+1; 1887 do next += GET(next,1); while (*next == OP_ALT); 1888 ecode = next + 1 + LINK_SIZE; 1889 break; 1890 1891 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything 1892 here; just jump to the group, with allow_zero set TRUE. 
*/ 1893 1894 case OP_BRAPOSZERO: 1895 op = *(++ecode); 1896 allow_zero = TRUE; 1897 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE; 1898 goto POSSESSIVE_NON_CAPTURE; 1899 1900 /* End of a group, repeated or non-repeating. */ 1901 1902 case OP_KET: 1903 case OP_KETRMIN: 1904 case OP_KETRMAX: 1905 case OP_KETRPOS: 1906 prev = ecode - GET(ecode, 1); 1907 1908 /* If this was a group that remembered the subject start, in order to break 1909 infinite repeats of empty string matches, retrieve the subject start from 1910 the chain. Otherwise, set it NULL. */ 1911 1912 if (*prev >= OP_SBRA || *prev == OP_ONCE) 1913 { 1914 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ 1915 eptrb = eptrb->epb_prev; /* Backup to previous group */ 1916 } 1917 else saved_eptr = NULL; 1918 1919 /* If we are at the end of an assertion group or a non-capturing atomic 1920 group, stop matching and return MATCH_MATCH, but record the current high 1921 water mark for use by positive assertions. We also need to record the match 1922 start in case it was changed by \K. */ 1923 1924 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) || 1925 *prev == OP_ONCE_NC) 1926 { 1927 md->end_match_ptr = eptr; /* For ONCE_NC */ 1928 md->end_offset_top = offset_top; 1929 md->start_match_ptr = mstart; 1930 RRETURN(MATCH_MATCH); /* Sets md->mark */ 1931 } 1932 1933 /* For capturing groups we have to check the group number back at the start 1934 and if necessary complete handling an extraction by setting the offsets and 1935 bumping the high water mark. Whole-pattern recursion is coded as a recurse 1936 into group 0, so it won't be picked up here. Instead, we catch it when the 1937 OP_END is reached. Other recursion is handled here. We just have to record 1938 the current subject position and start match pointer and give a MATCH 1939 return. 
*/ 1940 1941 if (*prev == OP_CBRA || *prev == OP_SCBRA || 1942 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS) 1943 { 1944 number = GET2(prev, 1+LINK_SIZE); 1945 offset = number << 1; 1946 1947 #ifdef PCRE_DEBUG 1948 printf("end bracket %d", number); 1949 printf("\n"); 1950 #endif 1951 1952 /* Handle a recursively called group. */ 1953 1954 if (md->recursive != NULL && md->recursive->group_num == number) 1955 { 1956 md->end_match_ptr = eptr; 1957 md->start_match_ptr = mstart; 1958 RRETURN(MATCH_MATCH); 1959 } 1960 1961 /* Deal with capturing */ 1962 1963 md->capture_last = (md->capture_last & OVFLMASK) | number; 1964 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else 1965 { 1966 /* If offset is greater than offset_top, it means that we are 1967 "skipping" a capturing group, and that group's offsets must be marked 1968 unset. In earlier versions of PCRE, all the offsets were unset at the 1969 start of matching, but this doesn't work because atomic groups and 1970 assertions can cause a value to be set that should later be unset. 1971 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as 1972 part of the atomic group, but this is not on the final matching path, 1973 so must be unset when 2 is set. (If there is no group 2, there is no 1974 problem, because offset_top will then be 2, indicating no capture.) */ 1975 1976 if (offset > offset_top) 1977 { 1978 register int *iptr = md->offset_vector + offset_top; 1979 register int *iend = md->offset_vector + offset; 1980 while (iptr < iend) *iptr++ = -1; 1981 } 1982 1983 /* Now make the extraction */ 1984 1985 md->offset_vector[offset] = 1986 md->offset_vector[md->offset_end - number]; 1987 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1988 if (offset_top <= offset) offset_top = offset + 2; 1989 } 1990 } 1991 1992 /* OP_KETRPOS is a possessive repeating ket. Remember the current position, 1993 and return the MATCH_KETRPOS. 
This makes it possible to do the repeats one 1994 at a time from the outer level, thus saving stack. This must precede the 1995 empty string test - in this case that test is done at the outer level. */ 1996 1997 if (*ecode == OP_KETRPOS) 1998 { 1999 md->start_match_ptr = mstart; /* In case \K reset it */ 2000 md->end_match_ptr = eptr; 2001 md->end_offset_top = offset_top; 2002 RRETURN(MATCH_KETRPOS); 2003 } 2004 2005 /* For an ordinary non-repeating ket, just continue at this level. This 2006 also happens for a repeating ket if no characters were matched in the 2007 group. This is the forcible breaking of infinite loops as implemented in 2008 Perl 5.005. For a non-repeating atomic group that includes captures, 2009 establish a backup point by processing the rest of the pattern at a lower 2010 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the 2011 original OP_ONCE level, thereby bypassing intermediate backup points, but 2012 resetting any captures that happened along the way. */ 2013 2014 if (*ecode == OP_KET || eptr == saved_eptr) 2015 { 2016 if (*prev == OP_ONCE) 2017 { 2018 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12); 2019 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2020 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2021 RRETURN(MATCH_ONCE); 2022 } 2023 ecode += 1 + LINK_SIZE; /* Carry on at this level */ 2024 break; 2025 } 2026 2027 /* The normal repeating kets try the rest of the pattern or restart from 2028 the preceding bracket, in the appropriate order. In the second case, we can 2029 use tail recursion to avoid using another stack frame, unless we have an 2030 an atomic group or an unlimited repeat of a group that can match an empty 2031 string. 
*/ 2032 2033 if (*ecode == OP_KETRMIN) 2034 { 2035 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7); 2036 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2037 if (*prev == OP_ONCE) 2038 { 2039 RMATCH(eptr, prev, offset_top, md, eptrb, RM8); 2040 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2041 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2042 RRETURN(MATCH_ONCE); 2043 } 2044 if (*prev >= OP_SBRA) /* Could match an empty string */ 2045 { 2046 RMATCH(eptr, prev, offset_top, md, eptrb, RM50); 2047 RRETURN(rrc); 2048 } 2049 ecode = prev; 2050 goto TAIL_RECURSE; 2051 } 2052 else /* OP_KETRMAX */ 2053 { 2054 RMATCH(eptr, prev, offset_top, md, eptrb, RM13); 2055 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH; 2056 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2057 if (*prev == OP_ONCE) 2058 { 2059 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9); 2060 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2061 md->once_target = prev; 2062 RRETURN(MATCH_ONCE); 2063 } 2064 ecode += 1 + LINK_SIZE; 2065 goto TAIL_RECURSE; 2066 } 2067 /* Control never gets here */ 2068 2069 /* Not multiline mode: start of subject assertion, unless notbol. */ 2070 2071 case OP_CIRC: 2072 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2073 2074 /* Start of subject assertion */ 2075 2076 case OP_SOD: 2077 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); 2078 ecode++; 2079 break; 2080 2081 /* Multiline mode: start of subject unless notbol, or after any newline. 
*/ 2082 2083 case OP_CIRCM: 2084 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2085 if (eptr != md->start_subject && 2086 (eptr == md->end_subject || !WAS_NEWLINE(eptr))) 2087 RRETURN(MATCH_NOMATCH); 2088 ecode++; 2089 break; 2090 2091 /* Start of match assertion */ 2092 2093 case OP_SOM: 2094 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); 2095 ecode++; 2096 break; 2097 2098 /* Reset the start of match point */ 2099 2100 case OP_SET_SOM: 2101 mstart = eptr; 2102 ecode++; 2103 break; 2104 2105 /* Multiline mode: assert before any newline, or before end of subject 2106 unless noteol is set. */ 2107 2108 case OP_DOLLM: 2109 if (eptr < md->end_subject) 2110 { 2111 if (!IS_NEWLINE(eptr)) 2112 { 2113 if (md->partial != 0 && 2114 eptr + 1 >= md->end_subject && 2115 NLBLOCK->nltype == NLTYPE_FIXED && 2116 NLBLOCK->nllen == 2 && 2117 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2118 { 2119 md->hitend = TRUE; 2120 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2121 } 2122 RRETURN(MATCH_NOMATCH); 2123 } 2124 } 2125 else 2126 { 2127 if (md->noteol) RRETURN(MATCH_NOMATCH); 2128 SCHECK_PARTIAL(); 2129 } 2130 ecode++; 2131 break; 2132 2133 /* Not multiline mode: assert before a terminating newline or before end of 2134 subject unless noteol is set. */ 2135 2136 case OP_DOLL: 2137 if (md->noteol) RRETURN(MATCH_NOMATCH); 2138 if (!md->endonly) goto ASSERT_NL_OR_EOS; 2139 2140 /* ... 
else fall through for endonly */ 2141 2142 /* End of subject assertion (\z) */ 2143 2144 case OP_EOD: 2145 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); 2146 SCHECK_PARTIAL(); 2147 ecode++; 2148 break; 2149 2150 /* End of subject or ending \n assertion (\Z) */ 2151 2152 case OP_EODN: 2153 ASSERT_NL_OR_EOS: 2154 if (eptr < md->end_subject && 2155 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) 2156 { 2157 if (md->partial != 0 && 2158 eptr + 1 >= md->end_subject && 2159 NLBLOCK->nltype == NLTYPE_FIXED && 2160 NLBLOCK->nllen == 2 && 2161 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2162 { 2163 md->hitend = TRUE; 2164 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2165 } 2166 RRETURN(MATCH_NOMATCH); 2167 } 2168 2169 /* Either at end of string or \n before end. */ 2170 2171 SCHECK_PARTIAL(); 2172 ecode++; 2173 break; 2174 2175 /* Word boundary assertions */ 2176 2177 case OP_NOT_WORD_BOUNDARY: 2178 case OP_WORD_BOUNDARY: 2179 { 2180 2181 /* Find out if the previous and current characters are "word" characters. 2182 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 2183 be "non-word" characters. Remember the earliest consulted character for 2184 partial matching. 
*/ 2185 2186 #ifdef SUPPORT_UTF 2187 if (utf) 2188 { 2189 /* Get status of previous character */ 2190 2191 if (eptr == md->start_subject) prev_is_word = FALSE; else 2192 { 2193 PCRE_PUCHAR lastptr = eptr - 1; 2194 BACKCHAR(lastptr); 2195 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; 2196 GETCHAR(c, lastptr); 2197 #ifdef SUPPORT_UCP 2198 if (md->use_ucp) 2199 { 2200 if (c == '_') prev_is_word = TRUE; else 2201 { 2202 int cat = UCD_CATEGORY(c); 2203 prev_is_word = (cat == ucp_L || cat == ucp_N); 2204 } 2205 } 2206 else 2207 #endif 2208 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2209 } 2210 2211 /* Get status of next character */ 2212 2213 if (eptr >= md->end_subject) 2214 { 2215 SCHECK_PARTIAL(); 2216 cur_is_word = FALSE; 2217 } 2218 else 2219 { 2220 GETCHAR(c, eptr); 2221 #ifdef SUPPORT_UCP 2222 if (md->use_ucp) 2223 { 2224 if (c == '_') cur_is_word = TRUE; else 2225 { 2226 int cat = UCD_CATEGORY(c); 2227 cur_is_word = (cat == ucp_L || cat == ucp_N); 2228 } 2229 } 2230 else 2231 #endif 2232 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2233 } 2234 } 2235 else 2236 #endif 2237 2238 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for 2239 consistency with the behaviour of \w we do use it in this case. 
*/ 2240 2241 { 2242 /* Get status of previous character */ 2243 2244 if (eptr == md->start_subject) prev_is_word = FALSE; else 2245 { 2246 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; 2247 #ifdef SUPPORT_UCP 2248 if (md->use_ucp) 2249 { 2250 c = eptr[-1]; 2251 if (c == '_') prev_is_word = TRUE; else 2252 { 2253 int cat = UCD_CATEGORY(c); 2254 prev_is_word = (cat == ucp_L || cat == ucp_N); 2255 } 2256 } 2257 else 2258 #endif 2259 prev_is_word = MAX_255(eptr[-1]) 2260 && ((md->ctypes[eptr[-1]] & ctype_word) != 0); 2261 } 2262 2263 /* Get status of next character */ 2264 2265 if (eptr >= md->end_subject) 2266 { 2267 SCHECK_PARTIAL(); 2268 cur_is_word = FALSE; 2269 } 2270 else 2271 #ifdef SUPPORT_UCP 2272 if (md->use_ucp) 2273 { 2274 c = *eptr; 2275 if (c == '_') cur_is_word = TRUE; else 2276 { 2277 int cat = UCD_CATEGORY(c); 2278 cur_is_word = (cat == ucp_L || cat == ucp_N); 2279 } 2280 } 2281 else 2282 #endif 2283 cur_is_word = MAX_255(*eptr) 2284 && ((md->ctypes[*eptr] & ctype_word) != 0); 2285 } 2286 2287 /* Now see if the situation is what we want */ 2288 2289 if ((*ecode++ == OP_WORD_BOUNDARY)? 2290 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 2291 RRETURN(MATCH_NOMATCH); 2292 } 2293 break; 2294 2295 /* Match any single character type except newline; have to take care with 2296 CRLF newlines and partial matching. */ 2297 2298 case OP_ANY: 2299 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 2300 if (md->partial != 0 && 2301 eptr + 1 >= md->end_subject && 2302 NLBLOCK->nltype == NLTYPE_FIXED && 2303 NLBLOCK->nllen == 2 && 2304 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2305 { 2306 md->hitend = TRUE; 2307 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2308 } 2309 2310 /* Fall through */ 2311 2312 /* Match any single character whatsoever. */ 2313 2314 case OP_ALLANY: 2315 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2316 { /* not be updated before SCHECK_PARTIAL. 
*/ 2317 SCHECK_PARTIAL(); 2318 RRETURN(MATCH_NOMATCH); 2319 } 2320 eptr++; 2321 #ifdef SUPPORT_UTF 2322 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 2323 #endif 2324 ecode++; 2325 break; 2326 2327 /* Match a single byte, even in UTF-8 mode. This opcode really does match 2328 any byte, even newline, independent of the setting of PCRE_DOTALL. */ 2329 2330 case OP_ANYBYTE: 2331 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2332 { /* not be updated before SCHECK_PARTIAL. */ 2333 SCHECK_PARTIAL(); 2334 RRETURN(MATCH_NOMATCH); 2335 } 2336 eptr++; 2337 ecode++; 2338 break; 2339 2340 case OP_NOT_DIGIT: 2341 if (eptr >= md->end_subject) 2342 { 2343 SCHECK_PARTIAL(); 2344 RRETURN(MATCH_NOMATCH); 2345 } 2346 GETCHARINCTEST(c, eptr); 2347 if ( 2348 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2349 c < 256 && 2350 #endif 2351 (md->ctypes[c] & ctype_digit) != 0 2352 ) 2353 RRETURN(MATCH_NOMATCH); 2354 ecode++; 2355 break; 2356 2357 case OP_DIGIT: 2358 if (eptr >= md->end_subject) 2359 { 2360 SCHECK_PARTIAL(); 2361 RRETURN(MATCH_NOMATCH); 2362 } 2363 GETCHARINCTEST(c, eptr); 2364 if ( 2365 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2366 c > 255 || 2367 #endif 2368 (md->ctypes[c] & ctype_digit) == 0 2369 ) 2370 RRETURN(MATCH_NOMATCH); 2371 ecode++; 2372 break; 2373 2374 case OP_NOT_WHITESPACE: 2375 if (eptr >= md->end_subject) 2376 { 2377 SCHECK_PARTIAL(); 2378 RRETURN(MATCH_NOMATCH); 2379 } 2380 GETCHARINCTEST(c, eptr); 2381 if ( 2382 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2383 c < 256 && 2384 #endif 2385 (md->ctypes[c] & ctype_space) != 0 2386 ) 2387 RRETURN(MATCH_NOMATCH); 2388 ecode++; 2389 break; 2390 2391 case OP_WHITESPACE: 2392 if (eptr >= md->end_subject) 2393 { 2394 SCHECK_PARTIAL(); 2395 RRETURN(MATCH_NOMATCH); 2396 } 2397 GETCHARINCTEST(c, eptr); 2398 if ( 2399 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2400 c > 255 || 2401 #endif 2402 (md->ctypes[c] & ctype_space) == 0 2403 ) 2404 
RRETURN(MATCH_NOMATCH); 2405 ecode++; 2406 break; 2407 2408 case OP_NOT_WORDCHAR: 2409 if (eptr >= md->end_subject) 2410 { 2411 SCHECK_PARTIAL(); 2412 RRETURN(MATCH_NOMATCH); 2413 } 2414 GETCHARINCTEST(c, eptr); 2415 if ( 2416 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2417 c < 256 && 2418 #endif 2419 (md->ctypes[c] & ctype_word) != 0 2420 ) 2421 RRETURN(MATCH_NOMATCH); 2422 ecode++; 2423 break; 2424 2425 case OP_WORDCHAR: 2426 if (eptr >= md->end_subject) 2427 { 2428 SCHECK_PARTIAL(); 2429 RRETURN(MATCH_NOMATCH); 2430 } 2431 GETCHARINCTEST(c, eptr); 2432 if ( 2433 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2434 c > 255 || 2435 #endif 2436 (md->ctypes[c] & ctype_word) == 0 2437 ) 2438 RRETURN(MATCH_NOMATCH); 2439 ecode++; 2440 break; 2441 2442 case OP_ANYNL: 2443 if (eptr >= md->end_subject) 2444 { 2445 SCHECK_PARTIAL(); 2446 RRETURN(MATCH_NOMATCH); 2447 } 2448 GETCHARINCTEST(c, eptr); 2449 switch(c) 2450 { 2451 default: RRETURN(MATCH_NOMATCH); 2452 2453 case CHAR_CR: 2454 if (eptr >= md->end_subject) 2455 { 2456 SCHECK_PARTIAL(); 2457 } 2458 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++; 2459 break; 2460 2461 case CHAR_LF: 2462 break; 2463 2464 case CHAR_VT: 2465 case CHAR_FF: 2466 case CHAR_NEL: 2467 #ifndef EBCDIC 2468 case 0x2028: 2469 case 0x2029: 2470 #endif /* Not EBCDIC */ 2471 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 2472 break; 2473 } 2474 ecode++; 2475 break; 2476 2477 case OP_NOT_HSPACE: 2478 if (eptr >= md->end_subject) 2479 { 2480 SCHECK_PARTIAL(); 2481 RRETURN(MATCH_NOMATCH); 2482 } 2483 GETCHARINCTEST(c, eptr); 2484 switch(c) 2485 { 2486 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 2487 default: break; 2488 } 2489 ecode++; 2490 break; 2491 2492 case OP_HSPACE: 2493 if (eptr >= md->end_subject) 2494 { 2495 SCHECK_PARTIAL(); 2496 RRETURN(MATCH_NOMATCH); 2497 } 2498 GETCHARINCTEST(c, eptr); 2499 switch(c) 2500 { 2501 HSPACE_CASES: break; /* Byte and multibyte cases */ 2502 default: RRETURN(MATCH_NOMATCH); 
2503 } 2504 ecode++; 2505 break; 2506 2507 case OP_NOT_VSPACE: 2508 if (eptr >= md->end_subject) 2509 { 2510 SCHECK_PARTIAL(); 2511 RRETURN(MATCH_NOMATCH); 2512 } 2513 GETCHARINCTEST(c, eptr); 2514 switch(c) 2515 { 2516 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 2517 default: break; 2518 } 2519 ecode++; 2520 break; 2521 2522 case OP_VSPACE: 2523 if (eptr >= md->end_subject) 2524 { 2525 SCHECK_PARTIAL(); 2526 RRETURN(MATCH_NOMATCH); 2527 } 2528 GETCHARINCTEST(c, eptr); 2529 switch(c) 2530 { 2531 VSPACE_CASES: break; 2532 default: RRETURN(MATCH_NOMATCH); 2533 } 2534 ecode++; 2535 break; 2536 2537 #ifdef SUPPORT_UCP 2538 /* Check the next character by Unicode property. We will get here only 2539 if the support is in the binary; otherwise a compile-time error occurs. */ 2540 2541 case OP_PROP: 2542 case OP_NOTPROP: 2543 if (eptr >= md->end_subject) 2544 { 2545 SCHECK_PARTIAL(); 2546 RRETURN(MATCH_NOMATCH); 2547 } 2548 GETCHARINCTEST(c, eptr); 2549 { 2550 const pcre_uint32 *cp; 2551 const ucd_record *prop = GET_UCD(c); 2552 2553 switch(ecode[1]) 2554 { 2555 case PT_ANY: 2556 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2557 break; 2558 2559 case PT_LAMP: 2560 if ((prop->chartype == ucp_Lu || 2561 prop->chartype == ucp_Ll || 2562 prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) 2563 RRETURN(MATCH_NOMATCH); 2564 break; 2565 2566 case PT_GC: 2567 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) 2568 RRETURN(MATCH_NOMATCH); 2569 break; 2570 2571 case PT_PC: 2572 if ((ecode[2] != prop->chartype) == (op == OP_PROP)) 2573 RRETURN(MATCH_NOMATCH); 2574 break; 2575 2576 case PT_SC: 2577 if ((ecode[2] != prop->script) == (op == OP_PROP)) 2578 RRETURN(MATCH_NOMATCH); 2579 break; 2580 2581 /* These are specials */ 2582 2583 case PT_ALNUM: 2584 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2585 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) 2586 RRETURN(MATCH_NOMATCH); 2587 break; 2588 2589 /* Perl space used to exclude VT, but from Perl 
5.18 it is included, 2590 which means that Perl space and POSIX space are now identical. PCRE 2591 was changed at release 8.34. */ 2592 2593 case PT_SPACE: /* Perl space */ 2594 case PT_PXSPACE: /* POSIX space */ 2595 switch(c) 2596 { 2597 HSPACE_CASES: 2598 VSPACE_CASES: 2599 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2600 break; 2601 2602 default: 2603 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == 2604 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); 2605 break; 2606 } 2607 break; 2608 2609 case PT_WORD: 2610 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2611 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 2612 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) 2613 RRETURN(MATCH_NOMATCH); 2614 break; 2615 2616 case PT_CLIST: 2617 cp = PRIV(ucd_caseless_sets) + ecode[2]; 2618 for (;;) 2619 { 2620 if (c < *cp) 2621 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } 2622 if (c == *cp++) 2623 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } 2624 } 2625 break; 2626 2627 case PT_UCNC: 2628 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 2629 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 2630 c >= 0xe000) == (op == OP_NOTPROP)) 2631 RRETURN(MATCH_NOMATCH); 2632 break; 2633 2634 /* This should never occur */ 2635 2636 default: 2637 RRETURN(PCRE_ERROR_INTERNAL); 2638 } 2639 2640 ecode += 3; 2641 } 2642 break; 2643 2644 /* Match an extended Unicode sequence. We will get here only if the support 2645 is in the binary; otherwise a compile-time error occurs. 
*/ 2646 2647 case OP_EXTUNI: 2648 if (eptr >= md->end_subject) 2649 { 2650 SCHECK_PARTIAL(); 2651 RRETURN(MATCH_NOMATCH); 2652 } 2653 else 2654 { 2655 int lgb, rgb; 2656 GETCHARINCTEST(c, eptr); 2657 lgb = UCD_GRAPHBREAK(c); 2658 while (eptr < md->end_subject) 2659 { 2660 int len = 1; 2661 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 2662 rgb = UCD_GRAPHBREAK(c); 2663 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 2664 lgb = rgb; 2665 eptr += len; 2666 } 2667 } 2668 CHECK_PARTIAL(); 2669 ecode++; 2670 break; 2671 #endif /* SUPPORT_UCP */ 2672 2673 2674 /* Match a back reference, possibly repeatedly. Look past the end of the 2675 item to see if there is repeat information following. The code is similar 2676 to that for character classes, but repeated for efficiency. Then obey 2677 similar code to character type repeats - written out again for speed. 2678 However, if the referenced string is the empty string, always treat 2679 it as matched, any number of times (otherwise there could be infinite 2680 loops). If the reference is unset, there are two possibilities: 2681 2682 (a) In the default, Perl-compatible state, set the length negative; 2683 this ensures that every attempt at a match fails. We can't just fail 2684 here, because of the possibility of quantifiers with zero minima. 2685 2686 (b) If the JavaScript compatibility flag is set, set the length to zero 2687 so that the back reference matches an empty string. 2688 2689 Otherwise, set the length to the length of what was matched by the 2690 referenced subpattern. 2691 2692 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group 2693 or to a non-duplicated named group. For a duplicated named group, OP_DNREF 2694 and OP_DNREFI are used. In this case we must scan the list of groups to 2695 which the name refers, and use the first one that is set. 
*/ 2696 2697 case OP_DNREF: 2698 case OP_DNREFI: 2699 caseless = op == OP_DNREFI; 2700 { 2701 int count = GET2(ecode, 1+IMM2_SIZE); 2702 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; 2703 ecode += 1 + 2*IMM2_SIZE; 2704 2705 /* Setting the default length first and initializing 'offset' avoids 2706 compiler warnings in the REF_REPEAT code. */ 2707 2708 length = (md->jscript_compat)? 0 : -1; 2709 offset = 0; 2710 2711 while (count-- > 0) 2712 { 2713 offset = GET2(slot, 0) << 1; 2714 if (offset < offset_top && md->offset_vector[offset] >= 0) 2715 { 2716 length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2717 break; 2718 } 2719 slot += md->name_entry_size; 2720 } 2721 } 2722 goto REF_REPEAT; 2723 2724 case OP_REF: 2725 case OP_REFI: 2726 caseless = op == OP_REFI; 2727 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 2728 ecode += 1 + IMM2_SIZE; 2729 if (offset >= offset_top || md->offset_vector[offset] < 0) 2730 length = (md->jscript_compat)? 0 : -1; 2731 else 2732 length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2733 2734 /* Set up for repetition, or handle the non-repeated case */ 2735 2736 REF_REPEAT: 2737 switch (*ecode) 2738 { 2739 case OP_CRSTAR: 2740 case OP_CRMINSTAR: 2741 case OP_CRPLUS: 2742 case OP_CRMINPLUS: 2743 case OP_CRQUERY: 2744 case OP_CRMINQUERY: 2745 c = *ecode++ - OP_CRSTAR; 2746 minimize = (c & 1) != 0; 2747 min = rep_min[c]; /* Pick up values from tables; */ 2748 max = rep_max[c]; /* zero for max => infinity */ 2749 if (max == 0) max = INT_MAX; 2750 break; 2751 2752 case OP_CRRANGE: 2753 case OP_CRMINRANGE: 2754 minimize = (*ecode == OP_CRMINRANGE); 2755 min = GET2(ecode, 1); 2756 max = GET2(ecode, 1 + IMM2_SIZE); 2757 if (max == 0) max = INT_MAX; 2758 ecode += 1 + 2 * IMM2_SIZE; 2759 break; 2760 2761 default: /* No repeat follows */ 2762 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0) 2763 { 2764 if (length == -2) eptr = md->end_subject; /* Partial match */ 2765 
CHECK_PARTIAL(); 2766 RRETURN(MATCH_NOMATCH); 2767 } 2768 eptr += length; 2769 continue; /* With the main loop */ 2770 } 2771 2772 /* Handle repeated back references. If the length of the reference is 2773 zero, just continue with the main loop. If the length is negative, it 2774 means the reference is unset in non-Java-compatible mode. If the minimum is 2775 zero, we can continue at the same level without recursion. For any other 2776 minimum, carrying on will result in NOMATCH. */ 2777 2778 if (length == 0) continue; 2779 if (length < 0 && min == 0) continue; 2780 2781 /* First, ensure the minimum number of matches are present. We get back 2782 the length of the reference string explicitly rather than passing the 2783 address of eptr, so that eptr can be a register variable. */ 2784 2785 for (i = 1; i <= min; i++) 2786 { 2787 int slength; 2788 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2789 { 2790 if (slength == -2) eptr = md->end_subject; /* Partial match */ 2791 CHECK_PARTIAL(); 2792 RRETURN(MATCH_NOMATCH); 2793 } 2794 eptr += slength; 2795 } 2796 2797 /* If min = max, continue at the same level without recursion. 2798 They are not both allowed to be zero. 
*/ 2799 2800 if (min == max) continue; 2801 2802 /* If minimizing, keep trying and advancing the pointer */ 2803 2804 if (minimize) 2805 { 2806 for (fi = min;; fi++) 2807 { 2808 int slength; 2809 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14); 2810 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2811 if (fi >= max) RRETURN(MATCH_NOMATCH); 2812 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2813 { 2814 if (slength == -2) eptr = md->end_subject; /* Partial match */ 2815 CHECK_PARTIAL(); 2816 RRETURN(MATCH_NOMATCH); 2817 } 2818 eptr += slength; 2819 } 2820 /* Control never gets here */ 2821 } 2822 2823 /* If maximizing, find the longest string and work backwards */ 2824 2825 else 2826 { 2827 pp = eptr; 2828 for (i = min; i < max; i++) 2829 { 2830 int slength; 2831 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2832 { 2833 /* Can't use CHECK_PARTIAL because we don't want to update eptr in 2834 the soft partial matching case. */ 2835 2836 if (slength == -2 && md->partial != 0 && 2837 md->end_subject > md->start_used_ptr) 2838 { 2839 md->hitend = TRUE; 2840 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2841 } 2842 break; 2843 } 2844 eptr += slength; 2845 } 2846 2847 while (eptr >= pp) 2848 { 2849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15); 2850 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2851 eptr -= length; 2852 } 2853 RRETURN(MATCH_NOMATCH); 2854 } 2855 /* Control never gets here */ 2856 2857 /* Match a bit-mapped character class, possibly repeatedly. This op code is 2858 used when all the characters in the class have values in the range 0-255, 2859 and either the matching is caseful, or the characters are in the range 2860 0-127 when UTF-8 processing is enabled. The only difference between 2861 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 2862 encountered. 2863 2864 First, look past the end of the item to see if there is repeat information 2865 following. 
Then obey similar code to character type repeats - written out 2866 again for speed. */ 2867 2868 case OP_NCLASS: 2869 case OP_CLASS: 2870 { 2871 /* The data variable is saved across frames, so the byte map needs to 2872 be stored there. */ 2873 #define BYTE_MAP ((pcre_uint8 *)data) 2874 data = ecode + 1; /* Save for matching */ 2875 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */ 2876 2877 switch (*ecode) 2878 { 2879 case OP_CRSTAR: 2880 case OP_CRMINSTAR: 2881 case OP_CRPLUS: 2882 case OP_CRMINPLUS: 2883 case OP_CRQUERY: 2884 case OP_CRMINQUERY: 2885 case OP_CRPOSSTAR: 2886 case OP_CRPOSPLUS: 2887 case OP_CRPOSQUERY: 2888 c = *ecode++ - OP_CRSTAR; 2889 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 2890 else possessive = TRUE; 2891 min = rep_min[c]; /* Pick up values from tables; */ 2892 max = rep_max[c]; /* zero for max => infinity */ 2893 if (max == 0) max = INT_MAX; 2894 break; 2895 2896 case OP_CRRANGE: 2897 case OP_CRMINRANGE: 2898 case OP_CRPOSRANGE: 2899 minimize = (*ecode == OP_CRMINRANGE); 2900 possessive = (*ecode == OP_CRPOSRANGE); 2901 min = GET2(ecode, 1); 2902 max = GET2(ecode, 1 + IMM2_SIZE); 2903 if (max == 0) max = INT_MAX; 2904 ecode += 1 + 2 * IMM2_SIZE; 2905 break; 2906 2907 default: /* No repeat follows */ 2908 min = max = 1; 2909 break; 2910 } 2911 2912 /* First, ensure the minimum number of matches are present. 
*/ 2913 2914 #ifdef SUPPORT_UTF 2915 if (utf) 2916 { 2917 for (i = 1; i <= min; i++) 2918 { 2919 if (eptr >= md->end_subject) 2920 { 2921 SCHECK_PARTIAL(); 2922 RRETURN(MATCH_NOMATCH); 2923 } 2924 GETCHARINC(c, eptr); 2925 if (c > 255) 2926 { 2927 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2928 } 2929 else 2930 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2931 } 2932 } 2933 else 2934 #endif 2935 /* Not UTF mode */ 2936 { 2937 for (i = 1; i <= min; i++) 2938 { 2939 if (eptr >= md->end_subject) 2940 { 2941 SCHECK_PARTIAL(); 2942 RRETURN(MATCH_NOMATCH); 2943 } 2944 c = *eptr++; 2945 #ifndef COMPILE_PCRE8 2946 if (c > 255) 2947 { 2948 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2949 } 2950 else 2951 #endif 2952 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2953 } 2954 } 2955 2956 /* If max == min we can continue with the main loop without the 2957 need to recurse. */ 2958 2959 if (min == max) continue; 2960 2961 /* If minimizing, keep testing the rest of the expression and advancing 2962 the pointer while it matches the class. 
*/ 2963 2964 if (minimize) 2965 { 2966 #ifdef SUPPORT_UTF 2967 if (utf) 2968 { 2969 for (fi = min;; fi++) 2970 { 2971 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16); 2972 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2973 if (fi >= max) RRETURN(MATCH_NOMATCH); 2974 if (eptr >= md->end_subject) 2975 { 2976 SCHECK_PARTIAL(); 2977 RRETURN(MATCH_NOMATCH); 2978 } 2979 GETCHARINC(c, eptr); 2980 if (c > 255) 2981 { 2982 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2983 } 2984 else 2985 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2986 } 2987 } 2988 else 2989 #endif 2990 /* Not UTF mode */ 2991 { 2992 for (fi = min;; fi++) 2993 { 2994 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17); 2995 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2996 if (fi >= max) RRETURN(MATCH_NOMATCH); 2997 if (eptr >= md->end_subject) 2998 { 2999 SCHECK_PARTIAL(); 3000 RRETURN(MATCH_NOMATCH); 3001 } 3002 c = *eptr++; 3003 #ifndef COMPILE_PCRE8 3004 if (c > 255) 3005 { 3006 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 3007 } 3008 else 3009 #endif 3010 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 3011 } 3012 } 3013 /* Control never gets here */ 3014 } 3015 3016 /* If maximizing, find the longest possible run, then work backwards. 
*/ 3017 3018 else 3019 { 3020 pp = eptr; 3021 3022 #ifdef SUPPORT_UTF 3023 if (utf) 3024 { 3025 for (i = min; i < max; i++) 3026 { 3027 int len = 1; 3028 if (eptr >= md->end_subject) 3029 { 3030 SCHECK_PARTIAL(); 3031 break; 3032 } 3033 GETCHARLEN(c, eptr, len); 3034 if (c > 255) 3035 { 3036 if (op == OP_CLASS) break; 3037 } 3038 else 3039 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3040 eptr += len; 3041 } 3042 3043 if (possessive) continue; /* No backtracking */ 3044 3045 for (;;) 3046 { 3047 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); 3048 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3049 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3050 BACKCHAR(eptr); 3051 } 3052 } 3053 else 3054 #endif 3055 /* Not UTF mode */ 3056 { 3057 for (i = min; i < max; i++) 3058 { 3059 if (eptr >= md->end_subject) 3060 { 3061 SCHECK_PARTIAL(); 3062 break; 3063 } 3064 c = *eptr; 3065 #ifndef COMPILE_PCRE8 3066 if (c > 255) 3067 { 3068 if (op == OP_CLASS) break; 3069 } 3070 else 3071 #endif 3072 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3073 eptr++; 3074 } 3075 3076 if (possessive) continue; /* No backtracking */ 3077 3078 while (eptr >= pp) 3079 { 3080 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); 3081 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3082 eptr--; 3083 } 3084 } 3085 3086 RRETURN(MATCH_NOMATCH); 3087 } 3088 #undef BYTE_MAP 3089 } 3090 /* Control never gets here */ 3091 3092 3093 /* Match an extended character class. In the 8-bit library, this opcode is 3094 encountered only when UTF-8 mode mode is supported. In the 16-bit and 3095 32-bit libraries, codepoints greater than 255 may be encountered even when 3096 UTF is not supported. 
*/ 3097 3098 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3099 case OP_XCLASS: 3100 { 3101 data = ecode + 1 + LINK_SIZE; /* Save for matching */ 3102 ecode += GET(ecode, 1); /* Advance past the item */ 3103 3104 switch (*ecode) 3105 { 3106 case OP_CRSTAR: 3107 case OP_CRMINSTAR: 3108 case OP_CRPLUS: 3109 case OP_CRMINPLUS: 3110 case OP_CRQUERY: 3111 case OP_CRMINQUERY: 3112 case OP_CRPOSSTAR: 3113 case OP_CRPOSPLUS: 3114 case OP_CRPOSQUERY: 3115 c = *ecode++ - OP_CRSTAR; 3116 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 3117 else possessive = TRUE; 3118 min = rep_min[c]; /* Pick up values from tables; */ 3119 max = rep_max[c]; /* zero for max => infinity */ 3120 if (max == 0) max = INT_MAX; 3121 break; 3122 3123 case OP_CRRANGE: 3124 case OP_CRMINRANGE: 3125 case OP_CRPOSRANGE: 3126 minimize = (*ecode == OP_CRMINRANGE); 3127 possessive = (*ecode == OP_CRPOSRANGE); 3128 min = GET2(ecode, 1); 3129 max = GET2(ecode, 1 + IMM2_SIZE); 3130 if (max == 0) max = INT_MAX; 3131 ecode += 1 + 2 * IMM2_SIZE; 3132 break; 3133 3134 default: /* No repeat follows */ 3135 min = max = 1; 3136 break; 3137 } 3138 3139 /* First, ensure the minimum number of matches are present. */ 3140 3141 for (i = 1; i <= min; i++) 3142 { 3143 if (eptr >= md->end_subject) 3144 { 3145 SCHECK_PARTIAL(); 3146 RRETURN(MATCH_NOMATCH); 3147 } 3148 GETCHARINCTEST(c, eptr); 3149 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3150 } 3151 3152 /* If max == min we can continue with the main loop without the 3153 need to recurse. */ 3154 3155 if (min == max) continue; 3156 3157 /* If minimizing, keep testing the rest of the expression and advancing 3158 the pointer while it matches the class. 
*/ 3159 3160 if (minimize) 3161 { 3162 for (fi = min;; fi++) 3163 { 3164 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20); 3165 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3166 if (fi >= max) RRETURN(MATCH_NOMATCH); 3167 if (eptr >= md->end_subject) 3168 { 3169 SCHECK_PARTIAL(); 3170 RRETURN(MATCH_NOMATCH); 3171 } 3172 GETCHARINCTEST(c, eptr); 3173 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3174 } 3175 /* Control never gets here */ 3176 } 3177 3178 /* If maximizing, find the longest possible run, then work backwards. */ 3179 3180 else 3181 { 3182 pp = eptr; 3183 for (i = min; i < max; i++) 3184 { 3185 int len = 1; 3186 if (eptr >= md->end_subject) 3187 { 3188 SCHECK_PARTIAL(); 3189 break; 3190 } 3191 #ifdef SUPPORT_UTF 3192 GETCHARLENTEST(c, eptr, len); 3193 #else 3194 c = *eptr; 3195 #endif 3196 if (!PRIV(xclass)(c, data, utf)) break; 3197 eptr += len; 3198 } 3199 3200 if (possessive) continue; /* No backtracking */ 3201 3202 for(;;) 3203 { 3204 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); 3205 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3206 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3207 #ifdef SUPPORT_UTF 3208 if (utf) BACKCHAR(eptr); 3209 #endif 3210 } 3211 RRETURN(MATCH_NOMATCH); 3212 } 3213 3214 /* Control never gets here */ 3215 } 3216 #endif /* End of XCLASS */ 3217 3218 /* Match a single character, casefully */ 3219 3220 case OP_CHAR: 3221 #ifdef SUPPORT_UTF 3222 if (utf) 3223 { 3224 length = 1; 3225 ecode++; 3226 GETCHARLEN(fc, ecode, length); 3227 if (length > md->end_subject - eptr) 3228 { 3229 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 3230 RRETURN(MATCH_NOMATCH); 3231 } 3232 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); 3233 } 3234 else 3235 #endif 3236 /* Not UTF mode */ 3237 { 3238 if (md->end_subject - eptr < 1) 3239 { 3240 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 3241 RRETURN(MATCH_NOMATCH); 3242 } 3243 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); 3244 ecode += 2; 
3245 } 3246 break; 3247 3248 /* Match a single character, caselessly. If we are at the end of the 3249 subject, give up immediately. */ 3250 3251 case OP_CHARI: 3252 if (eptr >= md->end_subject) 3253 { 3254 SCHECK_PARTIAL(); 3255 RRETURN(MATCH_NOMATCH); 3256 } 3257 3258 #ifdef SUPPORT_UTF 3259 if (utf) 3260 { 3261 length = 1; 3262 ecode++; 3263 GETCHARLEN(fc, ecode, length); 3264 3265 /* If the pattern character's value is < 128, we have only one byte, and 3266 we know that its other case must also be one byte long, so we can use the 3267 fast lookup table. We know that there is at least one byte left in the 3268 subject. */ 3269 3270 if (fc < 128) 3271 { 3272 pcre_uint32 cc = UCHAR21(eptr); 3273 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); 3274 ecode++; 3275 eptr++; 3276 } 3277 3278 /* Otherwise we must pick up the subject character. Note that we cannot 3279 use the value of "length" to check for sufficient bytes left, because the 3280 other case of the character may have more or fewer bytes. */ 3281 3282 else 3283 { 3284 pcre_uint32 dc; 3285 GETCHARINC(dc, eptr); 3286 ecode += length; 3287 3288 /* If we have Unicode property support, we can use it to test the other 3289 case of the character, if there is one. */ 3290 3291 if (fc != dc) 3292 { 3293 #ifdef SUPPORT_UCP 3294 if (dc != UCD_OTHERCASE(fc)) 3295 #endif 3296 RRETURN(MATCH_NOMATCH); 3297 } 3298 } 3299 } 3300 else 3301 #endif /* SUPPORT_UTF */ 3302 3303 /* Not UTF mode */ 3304 { 3305 if (TABLE_GET(ecode[1], md->lcc, ecode[1]) 3306 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); 3307 eptr++; 3308 ecode += 2; 3309 } 3310 break; 3311 3312 /* Match a single character repeatedly. 
*/ 3313 3314 case OP_EXACT: 3315 case OP_EXACTI: 3316 min = max = GET2(ecode, 1); 3317 ecode += 1 + IMM2_SIZE; 3318 goto REPEATCHAR; 3319 3320 case OP_POSUPTO: 3321 case OP_POSUPTOI: 3322 possessive = TRUE; 3323 /* Fall through */ 3324 3325 case OP_UPTO: 3326 case OP_UPTOI: 3327 case OP_MINUPTO: 3328 case OP_MINUPTOI: 3329 min = 0; 3330 max = GET2(ecode, 1); 3331 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI; 3332 ecode += 1 + IMM2_SIZE; 3333 goto REPEATCHAR; 3334 3335 case OP_POSSTAR: 3336 case OP_POSSTARI: 3337 possessive = TRUE; 3338 min = 0; 3339 max = INT_MAX; 3340 ecode++; 3341 goto REPEATCHAR; 3342 3343 case OP_POSPLUS: 3344 case OP_POSPLUSI: 3345 possessive = TRUE; 3346 min = 1; 3347 max = INT_MAX; 3348 ecode++; 3349 goto REPEATCHAR; 3350 3351 case OP_POSQUERY: 3352 case OP_POSQUERYI: 3353 possessive = TRUE; 3354 min = 0; 3355 max = 1; 3356 ecode++; 3357 goto REPEATCHAR; 3358 3359 case OP_STAR: 3360 case OP_STARI: 3361 case OP_MINSTAR: 3362 case OP_MINSTARI: 3363 case OP_PLUS: 3364 case OP_PLUSI: 3365 case OP_MINPLUS: 3366 case OP_MINPLUSI: 3367 case OP_QUERY: 3368 case OP_QUERYI: 3369 case OP_MINQUERY: 3370 case OP_MINQUERYI: 3371 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI); 3372 minimize = (c & 1) != 0; 3373 min = rep_min[c]; /* Pick up values from tables; */ 3374 max = rep_max[c]; /* zero for max => infinity */ 3375 if (max == 0) max = INT_MAX; 3376 3377 /* Common code for all repeated single-character matches. We first check 3378 for the minimum number of characters. If the minimum equals the maximum, we 3379 are done. Otherwise, if minimizing, check the rest of the pattern for a 3380 match; if there isn't one, advance up to the maximum, one character at a 3381 time. 3382 3383 If maximizing, advance up to the maximum number of matching characters, 3384 until eptr is past the end of the maximum run. If possessive, we are 3385 then done (no backing up). 
Otherwise, match at this position; anything 3386 other than no match is immediately returned. For nomatch, back up one 3387 character, unless we are matching \R and the last thing matched was 3388 \r\n, in which case, back up two bytes. When we reach the first optional 3389 character position, we can save stack by doing a tail recurse. 3390 3391 The various UTF/non-UTF and caseful/caseless cases are handled separately, 3392 for speed. */ 3393 3394 REPEATCHAR: 3395 #ifdef SUPPORT_UTF 3396 if (utf) 3397 { 3398 length = 1; 3399 charptr = ecode; 3400 GETCHARLEN(fc, ecode, length); 3401 ecode += length; 3402 3403 /* Handle multibyte character matching specially here. There is 3404 support for caseless matching if UCP support is present. */ 3405 3406 if (length > 1) 3407 { 3408 #ifdef SUPPORT_UCP 3409 pcre_uint32 othercase; 3410 if (op >= OP_STARI && /* Caseless */ 3411 (othercase = UCD_OTHERCASE(fc)) != fc) 3412 oclength = PRIV(ord2utf)(othercase, occhars); 3413 else oclength = 0; 3414 #endif /* SUPPORT_UCP */ 3415 3416 for (i = 1; i <= min; i++) 3417 { 3418 if (eptr <= md->end_subject - length && 3419 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3420 #ifdef SUPPORT_UCP 3421 else if (oclength > 0 && 3422 eptr <= md->end_subject - oclength && 3423 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3424 #endif /* SUPPORT_UCP */ 3425 else 3426 { 3427 CHECK_PARTIAL(); 3428 RRETURN(MATCH_NOMATCH); 3429 } 3430 } 3431 3432 if (min == max) continue; 3433 3434 if (minimize) 3435 { 3436 for (fi = min;; fi++) 3437 { 3438 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22); 3439 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3440 if (fi >= max) RRETURN(MATCH_NOMATCH); 3441 if (eptr <= md->end_subject - length && 3442 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3443 #ifdef SUPPORT_UCP 3444 else if (oclength > 0 && 3445 eptr <= md->end_subject - oclength && 3446 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3447 #endif 
/* SUPPORT_UCP */ 3448 else 3449 { 3450 CHECK_PARTIAL(); 3451 RRETURN(MATCH_NOMATCH); 3452 } 3453 } 3454 /* Control never gets here */ 3455 } 3456 3457 else /* Maximize */ 3458 { 3459 pp = eptr; 3460 for (i = min; i < max; i++) 3461 { 3462 if (eptr <= md->end_subject - length && 3463 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3464 #ifdef SUPPORT_UCP 3465 else if (oclength > 0 && 3466 eptr <= md->end_subject - oclength && 3467 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3468 #endif /* SUPPORT_UCP */ 3469 else 3470 { 3471 CHECK_PARTIAL(); 3472 break; 3473 } 3474 } 3475 3476 if (possessive) continue; /* No backtracking */ 3477 for(;;) 3478 { 3479 if (eptr == pp) goto TAIL_RECURSE; 3480 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); 3481 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3482 #ifdef SUPPORT_UCP 3483 eptr--; 3484 BACKCHAR(eptr); 3485 #else /* without SUPPORT_UCP */ 3486 eptr -= length; 3487 #endif /* SUPPORT_UCP */ 3488 } 3489 } 3490 /* Control never gets here */ 3491 } 3492 3493 /* If the length of a UTF-8 character is 1, we fall through here, and 3494 obey the code as for non-UTF-8 characters below, though in this case the 3495 value of fc will always be < 128. */ 3496 } 3497 else 3498 #endif /* SUPPORT_UTF */ 3499 /* When not in UTF-8 mode, load a single-byte character. */ 3500 fc = *ecode++; 3501 3502 /* The value of fc at this point is always one character, though we may 3503 or may not be in UTF mode. The code is duplicated for the caseless and 3504 caseful cases, for speed, since matching characters is likely to be quite 3505 common. First, ensure the minimum number of matches are present. If min = 3506 max, continue at the same level without recursing. Otherwise, if 3507 minimizing, keep trying the rest of the expression and advancing one 3508 matching character if failing, up to the maximum. Alternatively, if 3509 maximizing, find the maximum number of characters and work backwards. 
*/ 3510 3511 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3512 max, (char *)eptr)); 3513 3514 if (op >= OP_STARI) /* Caseless */ 3515 { 3516 #ifdef COMPILE_PCRE8 3517 /* fc must be < 128 if UTF is enabled. */ 3518 foc = md->fcc[fc]; 3519 #else 3520 #ifdef SUPPORT_UTF 3521 #ifdef SUPPORT_UCP 3522 if (utf && fc > 127) 3523 foc = UCD_OTHERCASE(fc); 3524 #else 3525 if (utf && fc > 127) 3526 foc = fc; 3527 #endif /* SUPPORT_UCP */ 3528 else 3529 #endif /* SUPPORT_UTF */ 3530 foc = TABLE_GET(fc, md->fcc, fc); 3531 #endif /* COMPILE_PCRE8 */ 3532 3533 for (i = 1; i <= min; i++) 3534 { 3535 pcre_uint32 cc; /* Faster than pcre_uchar */ 3536 if (eptr >= md->end_subject) 3537 { 3538 SCHECK_PARTIAL(); 3539 RRETURN(MATCH_NOMATCH); 3540 } 3541 cc = UCHAR21TEST(eptr); 3542 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3543 eptr++; 3544 } 3545 if (min == max) continue; 3546 if (minimize) 3547 { 3548 for (fi = min;; fi++) 3549 { 3550 pcre_uint32 cc; /* Faster than pcre_uchar */ 3551 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); 3552 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3553 if (fi >= max) RRETURN(MATCH_NOMATCH); 3554 if (eptr >= md->end_subject) 3555 { 3556 SCHECK_PARTIAL(); 3557 RRETURN(MATCH_NOMATCH); 3558 } 3559 cc = UCHAR21TEST(eptr); 3560 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3561 eptr++; 3562 } 3563 /* Control never gets here */ 3564 } 3565 else /* Maximize */ 3566 { 3567 pp = eptr; 3568 for (i = min; i < max; i++) 3569 { 3570 pcre_uint32 cc; /* Faster than pcre_uchar */ 3571 if (eptr >= md->end_subject) 3572 { 3573 SCHECK_PARTIAL(); 3574 break; 3575 } 3576 cc = UCHAR21TEST(eptr); 3577 if (fc != cc && foc != cc) break; 3578 eptr++; 3579 } 3580 if (possessive) continue; /* No backtracking */ 3581 for (;;) 3582 { 3583 if (eptr == pp) goto TAIL_RECURSE; 3584 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); 3585 eptr--; 3586 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3587 } 3588 /* Control never gets here */ 3589 } 3590 } 3591 3592 /* 
Caseful comparisons (includes all multi-byte characters) */ 3593 3594 else 3595 { 3596 for (i = 1; i <= min; i++) 3597 { 3598 if (eptr >= md->end_subject) 3599 { 3600 SCHECK_PARTIAL(); 3601 RRETURN(MATCH_NOMATCH); 3602 } 3603 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3604 } 3605 3606 if (min == max) continue; 3607 3608 if (minimize) 3609 { 3610 for (fi = min;; fi++) 3611 { 3612 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26); 3613 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3614 if (fi >= max) RRETURN(MATCH_NOMATCH); 3615 if (eptr >= md->end_subject) 3616 { 3617 SCHECK_PARTIAL(); 3618 RRETURN(MATCH_NOMATCH); 3619 } 3620 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3621 } 3622 /* Control never gets here */ 3623 } 3624 else /* Maximize */ 3625 { 3626 pp = eptr; 3627 for (i = min; i < max; i++) 3628 { 3629 if (eptr >= md->end_subject) 3630 { 3631 SCHECK_PARTIAL(); 3632 break; 3633 } 3634 if (fc != UCHAR21TEST(eptr)) break; 3635 eptr++; 3636 } 3637 if (possessive) continue; /* No backtracking */ 3638 for (;;) 3639 { 3640 if (eptr == pp) goto TAIL_RECURSE; 3641 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); 3642 eptr--; 3643 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3644 } 3645 /* Control never gets here */ 3646 } 3647 } 3648 /* Control never gets here */ 3649 3650 /* Match a negated single one-byte character. The character we are 3651 checking can be multibyte. 
*/ 3652 3653 case OP_NOT: 3654 case OP_NOTI: 3655 if (eptr >= md->end_subject) 3656 { 3657 SCHECK_PARTIAL(); 3658 RRETURN(MATCH_NOMATCH); 3659 } 3660 #ifdef SUPPORT_UTF 3661 if (utf) 3662 { 3663 register pcre_uint32 ch, och; 3664 3665 ecode++; 3666 GETCHARINC(ch, ecode); 3667 GETCHARINC(c, eptr); 3668 3669 if (op == OP_NOT) 3670 { 3671 if (ch == c) RRETURN(MATCH_NOMATCH); 3672 } 3673 else 3674 { 3675 #ifdef SUPPORT_UCP 3676 if (ch > 127) 3677 och = UCD_OTHERCASE(ch); 3678 #else 3679 if (ch > 127) 3680 och = ch; 3681 #endif /* SUPPORT_UCP */ 3682 else 3683 och = TABLE_GET(ch, md->fcc, ch); 3684 if (ch == c || och == c) RRETURN(MATCH_NOMATCH); 3685 } 3686 } 3687 else 3688 #endif 3689 { 3690 register pcre_uint32 ch = ecode[1]; 3691 c = *eptr++; 3692 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c)) 3693 RRETURN(MATCH_NOMATCH); 3694 ecode += 2; 3695 } 3696 break; 3697 3698 /* Match a negated single one-byte character repeatedly. This is almost a 3699 repeat of the code for a repeated single character, but I haven't found a 3700 nice way of commoning these up that doesn't require a test of the 3701 positive/negative option for each character match. Maybe that wouldn't add 3702 very much to the time taken, but character matching *is* what this is all 3703 about... 
*/ 3704 3705 case OP_NOTEXACT: 3706 case OP_NOTEXACTI: 3707 min = max = GET2(ecode, 1); 3708 ecode += 1 + IMM2_SIZE; 3709 goto REPEATNOTCHAR; 3710 3711 case OP_NOTUPTO: 3712 case OP_NOTUPTOI: 3713 case OP_NOTMINUPTO: 3714 case OP_NOTMINUPTOI: 3715 min = 0; 3716 max = GET2(ecode, 1); 3717 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI; 3718 ecode += 1 + IMM2_SIZE; 3719 goto REPEATNOTCHAR; 3720 3721 case OP_NOTPOSSTAR: 3722 case OP_NOTPOSSTARI: 3723 possessive = TRUE; 3724 min = 0; 3725 max = INT_MAX; 3726 ecode++; 3727 goto REPEATNOTCHAR; 3728 3729 case OP_NOTPOSPLUS: 3730 case OP_NOTPOSPLUSI: 3731 possessive = TRUE; 3732 min = 1; 3733 max = INT_MAX; 3734 ecode++; 3735 goto REPEATNOTCHAR; 3736 3737 case OP_NOTPOSQUERY: 3738 case OP_NOTPOSQUERYI: 3739 possessive = TRUE; 3740 min = 0; 3741 max = 1; 3742 ecode++; 3743 goto REPEATNOTCHAR; 3744 3745 case OP_NOTPOSUPTO: 3746 case OP_NOTPOSUPTOI: 3747 possessive = TRUE; 3748 min = 0; 3749 max = GET2(ecode, 1); 3750 ecode += 1 + IMM2_SIZE; 3751 goto REPEATNOTCHAR; 3752 3753 case OP_NOTSTAR: 3754 case OP_NOTSTARI: 3755 case OP_NOTMINSTAR: 3756 case OP_NOTMINSTARI: 3757 case OP_NOTPLUS: 3758 case OP_NOTPLUSI: 3759 case OP_NOTMINPLUS: 3760 case OP_NOTMINPLUSI: 3761 case OP_NOTQUERY: 3762 case OP_NOTQUERYI: 3763 case OP_NOTMINQUERY: 3764 case OP_NOTMINQUERYI: 3765 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); 3766 minimize = (c & 1) != 0; 3767 min = rep_min[c]; /* Pick up values from tables; */ 3768 max = rep_max[c]; /* zero for max => infinity */ 3769 if (max == 0) max = INT_MAX; 3770 3771 /* Common code for all repeated single-byte matches. */ 3772 3773 REPEATNOTCHAR: 3774 GETCHARINCTEST(fc, ecode); 3775 3776 /* The code is duplicated for the caseless and caseful cases, for speed, 3777 since matching characters is likely to be quite common. First, ensure the 3778 minimum number of matches are present. If min = max, continue at the same 3779 level without recursing. 
Otherwise, if minimizing, keep trying the rest of 3780 the expression and advancing one matching character if failing, up to the 3781 maximum. Alternatively, if maximizing, find the maximum number of 3782 characters and work backwards. */ 3783 3784 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3785 max, (char *)eptr)); 3786 3787 if (op >= OP_NOTSTARI) /* Caseless */ 3788 { 3789 #ifdef SUPPORT_UTF 3790 #ifdef SUPPORT_UCP 3791 if (utf && fc > 127) 3792 foc = UCD_OTHERCASE(fc); 3793 #else 3794 if (utf && fc > 127) 3795 foc = fc; 3796 #endif /* SUPPORT_UCP */ 3797 else 3798 #endif /* SUPPORT_UTF */ 3799 foc = TABLE_GET(fc, md->fcc, fc); 3800 3801 #ifdef SUPPORT_UTF 3802 if (utf) 3803 { 3804 register pcre_uint32 d; 3805 for (i = 1; i <= min; i++) 3806 { 3807 if (eptr >= md->end_subject) 3808 { 3809 SCHECK_PARTIAL(); 3810 RRETURN(MATCH_NOMATCH); 3811 } 3812 GETCHARINC(d, eptr); 3813 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3814 } 3815 } 3816 else 3817 #endif /* SUPPORT_UTF */ 3818 /* Not UTF mode */ 3819 { 3820 for (i = 1; i <= min; i++) 3821 { 3822 if (eptr >= md->end_subject) 3823 { 3824 SCHECK_PARTIAL(); 3825 RRETURN(MATCH_NOMATCH); 3826 } 3827 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3828 eptr++; 3829 } 3830 } 3831 3832 if (min == max) continue; 3833 3834 if (minimize) 3835 { 3836 #ifdef SUPPORT_UTF 3837 if (utf) 3838 { 3839 register pcre_uint32 d; 3840 for (fi = min;; fi++) 3841 { 3842 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28); 3843 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3844 if (fi >= max) RRETURN(MATCH_NOMATCH); 3845 if (eptr >= md->end_subject) 3846 { 3847 SCHECK_PARTIAL(); 3848 RRETURN(MATCH_NOMATCH); 3849 } 3850 GETCHARINC(d, eptr); 3851 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3852 } 3853 } 3854 else 3855 #endif /*SUPPORT_UTF */ 3856 /* Not UTF mode */ 3857 { 3858 for (fi = min;; fi++) 3859 { 3860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29); 3861 if (rrc != 
MATCH_NOMATCH) RRETURN(rrc); 3862 if (fi >= max) RRETURN(MATCH_NOMATCH); 3863 if (eptr >= md->end_subject) 3864 { 3865 SCHECK_PARTIAL(); 3866 RRETURN(MATCH_NOMATCH); 3867 } 3868 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3869 eptr++; 3870 } 3871 } 3872 /* Control never gets here */ 3873 } 3874 3875 /* Maximize case */ 3876 3877 else 3878 { 3879 pp = eptr; 3880 3881 #ifdef SUPPORT_UTF 3882 if (utf) 3883 { 3884 register pcre_uint32 d; 3885 for (i = min; i < max; i++) 3886 { 3887 int len = 1; 3888 if (eptr >= md->end_subject) 3889 { 3890 SCHECK_PARTIAL(); 3891 break; 3892 } 3893 GETCHARLEN(d, eptr, len); 3894 if (fc == d || (unsigned int)foc == d) break; 3895 eptr += len; 3896 } 3897 if (possessive) continue; /* No backtracking */ 3898 for(;;) 3899 { 3900 if (eptr == pp) goto TAIL_RECURSE; 3901 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); 3902 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3903 eptr--; 3904 BACKCHAR(eptr); 3905 } 3906 } 3907 else 3908 #endif /* SUPPORT_UTF */ 3909 /* Not UTF mode */ 3910 { 3911 for (i = min; i < max; i++) 3912 { 3913 if (eptr >= md->end_subject) 3914 { 3915 SCHECK_PARTIAL(); 3916 break; 3917 } 3918 if (fc == *eptr || foc == *eptr) break; 3919 eptr++; 3920 } 3921 if (possessive) continue; /* No backtracking */ 3922 for (;;) 3923 { 3924 if (eptr == pp) goto TAIL_RECURSE; 3925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); 3926 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3927 eptr--; 3928 } 3929 } 3930 /* Control never gets here */ 3931 } 3932 } 3933 3934 /* Caseful comparisons */ 3935 3936 else 3937 { 3938 #ifdef SUPPORT_UTF 3939 if (utf) 3940 { 3941 register pcre_uint32 d; 3942 for (i = 1; i <= min; i++) 3943 { 3944 if (eptr >= md->end_subject) 3945 { 3946 SCHECK_PARTIAL(); 3947 RRETURN(MATCH_NOMATCH); 3948 } 3949 GETCHARINC(d, eptr); 3950 if (fc == d) RRETURN(MATCH_NOMATCH); 3951 } 3952 } 3953 else 3954 #endif 3955 /* Not UTF mode */ 3956 { 3957 for (i = 1; i <= min; i++) 3958 { 3959 if (eptr >= md->end_subject) 3960 { 3961 
SCHECK_PARTIAL(); 3962 RRETURN(MATCH_NOMATCH); 3963 } 3964 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3965 } 3966 } 3967 3968 if (min == max) continue; 3969 3970 if (minimize) 3971 { 3972 #ifdef SUPPORT_UTF 3973 if (utf) 3974 { 3975 register pcre_uint32 d; 3976 for (fi = min;; fi++) 3977 { 3978 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32); 3979 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3980 if (fi >= max) RRETURN(MATCH_NOMATCH); 3981 if (eptr >= md->end_subject) 3982 { 3983 SCHECK_PARTIAL(); 3984 RRETURN(MATCH_NOMATCH); 3985 } 3986 GETCHARINC(d, eptr); 3987 if (fc == d) RRETURN(MATCH_NOMATCH); 3988 } 3989 } 3990 else 3991 #endif 3992 /* Not UTF mode */ 3993 { 3994 for (fi = min;; fi++) 3995 { 3996 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33); 3997 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3998 if (fi >= max) RRETURN(MATCH_NOMATCH); 3999 if (eptr >= md->end_subject) 4000 { 4001 SCHECK_PARTIAL(); 4002 RRETURN(MATCH_NOMATCH); 4003 } 4004 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 4005 } 4006 } 4007 /* Control never gets here */ 4008 } 4009 4010 /* Maximize case */ 4011 4012 else 4013 { 4014 pp = eptr; 4015 4016 #ifdef SUPPORT_UTF 4017 if (utf) 4018 { 4019 register pcre_uint32 d; 4020 for (i = min; i < max; i++) 4021 { 4022 int len = 1; 4023 if (eptr >= md->end_subject) 4024 { 4025 SCHECK_PARTIAL(); 4026 break; 4027 } 4028 GETCHARLEN(d, eptr, len); 4029 if (fc == d) break; 4030 eptr += len; 4031 } 4032 if (possessive) continue; /* No backtracking */ 4033 for(;;) 4034 { 4035 if (eptr == pp) goto TAIL_RECURSE; 4036 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); 4037 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4038 eptr--; 4039 BACKCHAR(eptr); 4040 } 4041 } 4042 else 4043 #endif 4044 /* Not UTF mode */ 4045 { 4046 for (i = min; i < max; i++) 4047 { 4048 if (eptr >= md->end_subject) 4049 { 4050 SCHECK_PARTIAL(); 4051 break; 4052 } 4053 if (fc == *eptr) break; 4054 eptr++; 4055 } 4056 if (possessive) continue; /* No backtracking */ 4057 for (;;) 4058 { 4059 if (eptr == 
pp) goto TAIL_RECURSE; 4060 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); 4061 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4062 eptr--; 4063 } 4064 } 4065 /* Control never gets here */ 4066 } 4067 } 4068 /* Control never gets here */ 4069 4070 /* Match a single character type repeatedly; several different opcodes 4071 share code. This is very similar to the code for single characters, but we 4072 repeat it in the interests of efficiency. */ 4073 4074 case OP_TYPEEXACT: 4075 min = max = GET2(ecode, 1); 4076 minimize = TRUE; 4077 ecode += 1 + IMM2_SIZE; 4078 goto REPEATTYPE; 4079 4080 case OP_TYPEUPTO: 4081 case OP_TYPEMINUPTO: 4082 min = 0; 4083 max = GET2(ecode, 1); 4084 minimize = *ecode == OP_TYPEMINUPTO; 4085 ecode += 1 + IMM2_SIZE; 4086 goto REPEATTYPE; 4087 4088 case OP_TYPEPOSSTAR: 4089 possessive = TRUE; 4090 min = 0; 4091 max = INT_MAX; 4092 ecode++; 4093 goto REPEATTYPE; 4094 4095 case OP_TYPEPOSPLUS: 4096 possessive = TRUE; 4097 min = 1; 4098 max = INT_MAX; 4099 ecode++; 4100 goto REPEATTYPE; 4101 4102 case OP_TYPEPOSQUERY: 4103 possessive = TRUE; 4104 min = 0; 4105 max = 1; 4106 ecode++; 4107 goto REPEATTYPE; 4108 4109 case OP_TYPEPOSUPTO: 4110 possessive = TRUE; 4111 min = 0; 4112 max = GET2(ecode, 1); 4113 ecode += 1 + IMM2_SIZE; 4114 goto REPEATTYPE; 4115 4116 case OP_TYPESTAR: 4117 case OP_TYPEMINSTAR: 4118 case OP_TYPEPLUS: 4119 case OP_TYPEMINPLUS: 4120 case OP_TYPEQUERY: 4121 case OP_TYPEMINQUERY: 4122 c = *ecode++ - OP_TYPESTAR; 4123 minimize = (c & 1) != 0; 4124 min = rep_min[c]; /* Pick up values from tables; */ 4125 max = rep_max[c]; /* zero for max => infinity */ 4126 if (max == 0) max = INT_MAX; 4127 4128 /* Common code for all repeated single character type matches. Note that 4129 in UTF-8 mode, '.' matches a character of any length, but for the other 4130 character types, the valid characters are all one-byte long. 
*/ 4131 4132 REPEATTYPE: 4133 ctype = *ecode++; /* Code for the character type */ 4134 4135 #ifdef SUPPORT_UCP 4136 if (ctype == OP_PROP || ctype == OP_NOTPROP) 4137 { 4138 prop_fail_result = ctype == OP_NOTPROP; 4139 prop_type = *ecode++; 4140 prop_value = *ecode++; 4141 } 4142 else prop_type = -1; 4143 #endif 4144 4145 /* First, ensure the minimum number of matches are present. Use inline 4146 code for maximizing the speed, and do the type test once at the start 4147 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that 4148 is tidier. Also separate the UCP code, which can be the same for both UTF-8 4149 and single-bytes. */ 4150 4151 if (min > 0) 4152 { 4153 #ifdef SUPPORT_UCP 4154 if (prop_type >= 0) 4155 { 4156 switch(prop_type) 4157 { 4158 case PT_ANY: 4159 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4160 for (i = 1; i <= min; i++) 4161 { 4162 if (eptr >= md->end_subject) 4163 { 4164 SCHECK_PARTIAL(); 4165 RRETURN(MATCH_NOMATCH); 4166 } 4167 GETCHARINCTEST(c, eptr); 4168 } 4169 break; 4170 4171 case PT_LAMP: 4172 for (i = 1; i <= min; i++) 4173 { 4174 int chartype; 4175 if (eptr >= md->end_subject) 4176 { 4177 SCHECK_PARTIAL(); 4178 RRETURN(MATCH_NOMATCH); 4179 } 4180 GETCHARINCTEST(c, eptr); 4181 chartype = UCD_CHARTYPE(c); 4182 if ((chartype == ucp_Lu || 4183 chartype == ucp_Ll || 4184 chartype == ucp_Lt) == prop_fail_result) 4185 RRETURN(MATCH_NOMATCH); 4186 } 4187 break; 4188 4189 case PT_GC: 4190 for (i = 1; i <= min; i++) 4191 { 4192 if (eptr >= md->end_subject) 4193 { 4194 SCHECK_PARTIAL(); 4195 RRETURN(MATCH_NOMATCH); 4196 } 4197 GETCHARINCTEST(c, eptr); 4198 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4199 RRETURN(MATCH_NOMATCH); 4200 } 4201 break; 4202 4203 case PT_PC: 4204 for (i = 1; i <= min; i++) 4205 { 4206 if (eptr >= md->end_subject) 4207 { 4208 SCHECK_PARTIAL(); 4209 RRETURN(MATCH_NOMATCH); 4210 } 4211 GETCHARINCTEST(c, eptr); 4212 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4213 
RRETURN(MATCH_NOMATCH); 4214 } 4215 break; 4216 4217 case PT_SC: 4218 for (i = 1; i <= min; i++) 4219 { 4220 if (eptr >= md->end_subject) 4221 { 4222 SCHECK_PARTIAL(); 4223 RRETURN(MATCH_NOMATCH); 4224 } 4225 GETCHARINCTEST(c, eptr); 4226 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4227 RRETURN(MATCH_NOMATCH); 4228 } 4229 break; 4230 4231 case PT_ALNUM: 4232 for (i = 1; i <= min; i++) 4233 { 4234 int category; 4235 if (eptr >= md->end_subject) 4236 { 4237 SCHECK_PARTIAL(); 4238 RRETURN(MATCH_NOMATCH); 4239 } 4240 GETCHARINCTEST(c, eptr); 4241 category = UCD_CATEGORY(c); 4242 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4243 RRETURN(MATCH_NOMATCH); 4244 } 4245 break; 4246 4247 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 4248 which means that Perl space and POSIX space are now identical. PCRE 4249 was changed at release 8.34. */ 4250 4251 case PT_SPACE: /* Perl space */ 4252 case PT_PXSPACE: /* POSIX space */ 4253 for (i = 1; i <= min; i++) 4254 { 4255 if (eptr >= md->end_subject) 4256 { 4257 SCHECK_PARTIAL(); 4258 RRETURN(MATCH_NOMATCH); 4259 } 4260 GETCHARINCTEST(c, eptr); 4261 switch(c) 4262 { 4263 HSPACE_CASES: 4264 VSPACE_CASES: 4265 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4266 break; 4267 4268 default: 4269 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 4270 RRETURN(MATCH_NOMATCH); 4271 break; 4272 } 4273 } 4274 break; 4275 4276 case PT_WORD: 4277 for (i = 1; i <= min; i++) 4278 { 4279 int category; 4280 if (eptr >= md->end_subject) 4281 { 4282 SCHECK_PARTIAL(); 4283 RRETURN(MATCH_NOMATCH); 4284 } 4285 GETCHARINCTEST(c, eptr); 4286 category = UCD_CATEGORY(c); 4287 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) 4288 == prop_fail_result) 4289 RRETURN(MATCH_NOMATCH); 4290 } 4291 break; 4292 4293 case PT_CLIST: 4294 for (i = 1; i <= min; i++) 4295 { 4296 const pcre_uint32 *cp; 4297 if (eptr >= md->end_subject) 4298 { 4299 SCHECK_PARTIAL(); 4300 RRETURN(MATCH_NOMATCH); 4301 } 
4302 GETCHARINCTEST(c, eptr); 4303 cp = PRIV(ucd_caseless_sets) + prop_value; 4304 for (;;) 4305 { 4306 if (c < *cp) 4307 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 4308 if (c == *cp++) 4309 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 4310 } 4311 } 4312 break; 4313 4314 case PT_UCNC: 4315 for (i = 1; i <= min; i++) 4316 { 4317 if (eptr >= md->end_subject) 4318 { 4319 SCHECK_PARTIAL(); 4320 RRETURN(MATCH_NOMATCH); 4321 } 4322 GETCHARINCTEST(c, eptr); 4323 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 4324 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 4325 c >= 0xe000) == prop_fail_result) 4326 RRETURN(MATCH_NOMATCH); 4327 } 4328 break; 4329 4330 /* This should not occur */ 4331 4332 default: 4333 RRETURN(PCRE_ERROR_INTERNAL); 4334 } 4335 } 4336 4337 /* Match extended Unicode sequences. We will get here only if the 4338 support is in the binary; otherwise a compile-time error occurs. */ 4339 4340 else if (ctype == OP_EXTUNI) 4341 { 4342 for (i = 1; i <= min; i++) 4343 { 4344 if (eptr >= md->end_subject) 4345 { 4346 SCHECK_PARTIAL(); 4347 RRETURN(MATCH_NOMATCH); 4348 } 4349 else 4350 { 4351 int lgb, rgb; 4352 GETCHARINCTEST(c, eptr); 4353 lgb = UCD_GRAPHBREAK(c); 4354 while (eptr < md->end_subject) 4355 { 4356 int len = 1; 4357 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 4358 rgb = UCD_GRAPHBREAK(c); 4359 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 4360 lgb = rgb; 4361 eptr += len; 4362 } 4363 } 4364 CHECK_PARTIAL(); 4365 } 4366 } 4367 4368 else 4369 #endif /* SUPPORT_UCP */ 4370 4371 /* Handle all other cases when the coding is UTF-8 */ 4372 4373 #ifdef SUPPORT_UTF 4374 if (utf) switch(ctype) 4375 { 4376 case OP_ANY: 4377 for (i = 1; i <= min; i++) 4378 { 4379 if (eptr >= md->end_subject) 4380 { 4381 SCHECK_PARTIAL(); 4382 RRETURN(MATCH_NOMATCH); 4383 } 4384 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4385 if (md->partial != 0 && 4386 eptr + 1 >= md->end_subject && 4387 
NLBLOCK->nltype == NLTYPE_FIXED && 4388 NLBLOCK->nllen == 2 && 4389 UCHAR21(eptr) == NLBLOCK->nl[0]) 4390 { 4391 md->hitend = TRUE; 4392 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4393 } 4394 eptr++; 4395 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4396 } 4397 break; 4398 4399 case OP_ALLANY: 4400 for (i = 1; i <= min; i++) 4401 { 4402 if (eptr >= md->end_subject) 4403 { 4404 SCHECK_PARTIAL(); 4405 RRETURN(MATCH_NOMATCH); 4406 } 4407 eptr++; 4408 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4409 } 4410 break; 4411 4412 case OP_ANYBYTE: 4413 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); 4414 eptr += min; 4415 break; 4416 4417 case OP_ANYNL: 4418 for (i = 1; i <= min; i++) 4419 { 4420 if (eptr >= md->end_subject) 4421 { 4422 SCHECK_PARTIAL(); 4423 RRETURN(MATCH_NOMATCH); 4424 } 4425 GETCHARINC(c, eptr); 4426 switch(c) 4427 { 4428 default: RRETURN(MATCH_NOMATCH); 4429 4430 case CHAR_CR: 4431 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; 4432 break; 4433 4434 case CHAR_LF: 4435 break; 4436 4437 case CHAR_VT: 4438 case CHAR_FF: 4439 case CHAR_NEL: 4440 #ifndef EBCDIC 4441 case 0x2028: 4442 case 0x2029: 4443 #endif /* Not EBCDIC */ 4444 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4445 break; 4446 } 4447 } 4448 break; 4449 4450 case OP_NOT_HSPACE: 4451 for (i = 1; i <= min; i++) 4452 { 4453 if (eptr >= md->end_subject) 4454 { 4455 SCHECK_PARTIAL(); 4456 RRETURN(MATCH_NOMATCH); 4457 } 4458 GETCHARINC(c, eptr); 4459 switch(c) 4460 { 4461 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 4462 default: break; 4463 } 4464 } 4465 break; 4466 4467 case OP_HSPACE: 4468 for (i = 1; i <= min; i++) 4469 { 4470 if (eptr >= md->end_subject) 4471 { 4472 SCHECK_PARTIAL(); 4473 RRETURN(MATCH_NOMATCH); 4474 } 4475 GETCHARINC(c, eptr); 4476 switch(c) 4477 { 4478 HSPACE_CASES: break; /* Byte and multibyte cases */ 4479 default: RRETURN(MATCH_NOMATCH); 4480 } 4481 } 4482 break; 4483 4484 case OP_NOT_VSPACE: 4485 for (i = 
1; i <= min; i++) 4486 { 4487 if (eptr >= md->end_subject) 4488 { 4489 SCHECK_PARTIAL(); 4490 RRETURN(MATCH_NOMATCH); 4491 } 4492 GETCHARINC(c, eptr); 4493 switch(c) 4494 { 4495 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 4496 default: break; 4497 } 4498 } 4499 break; 4500 4501 case OP_VSPACE: 4502 for (i = 1; i <= min; i++) 4503 { 4504 if (eptr >= md->end_subject) 4505 { 4506 SCHECK_PARTIAL(); 4507 RRETURN(MATCH_NOMATCH); 4508 } 4509 GETCHARINC(c, eptr); 4510 switch(c) 4511 { 4512 VSPACE_CASES: break; 4513 default: RRETURN(MATCH_NOMATCH); 4514 } 4515 } 4516 break; 4517 4518 case OP_NOT_DIGIT: 4519 for (i = 1; i <= min; i++) 4520 { 4521 if (eptr >= md->end_subject) 4522 { 4523 SCHECK_PARTIAL(); 4524 RRETURN(MATCH_NOMATCH); 4525 } 4526 GETCHARINC(c, eptr); 4527 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) 4528 RRETURN(MATCH_NOMATCH); 4529 } 4530 break; 4531 4532 case OP_DIGIT: 4533 for (i = 1; i <= min; i++) 4534 { 4535 pcre_uint32 cc; 4536 if (eptr >= md->end_subject) 4537 { 4538 SCHECK_PARTIAL(); 4539 RRETURN(MATCH_NOMATCH); 4540 } 4541 cc = UCHAR21(eptr); 4542 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0) 4543 RRETURN(MATCH_NOMATCH); 4544 eptr++; 4545 /* No need to skip more bytes - we know it's a 1-byte character */ 4546 } 4547 break; 4548 4549 case OP_NOT_WHITESPACE: 4550 for (i = 1; i <= min; i++) 4551 { 4552 pcre_uint32 cc; 4553 if (eptr >= md->end_subject) 4554 { 4555 SCHECK_PARTIAL(); 4556 RRETURN(MATCH_NOMATCH); 4557 } 4558 cc = UCHAR21(eptr); 4559 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0) 4560 RRETURN(MATCH_NOMATCH); 4561 eptr++; 4562 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4563 } 4564 break; 4565 4566 case OP_WHITESPACE: 4567 for (i = 1; i <= min; i++) 4568 { 4569 pcre_uint32 cc; 4570 if (eptr >= md->end_subject) 4571 { 4572 SCHECK_PARTIAL(); 4573 RRETURN(MATCH_NOMATCH); 4574 } 4575 cc = UCHAR21(eptr); 4576 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0) 4577 RRETURN(MATCH_NOMATCH); 4578 eptr++; 4579 /* No need to skip 
more bytes - we know it's a 1-byte character */ 4580 } 4581 break; 4582 4583 case OP_NOT_WORDCHAR: 4584 for (i = 1; i <= min; i++) 4585 { 4586 pcre_uint32 cc; 4587 if (eptr >= md->end_subject) 4588 { 4589 SCHECK_PARTIAL(); 4590 RRETURN(MATCH_NOMATCH); 4591 } 4592 cc = UCHAR21(eptr); 4593 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0) 4594 RRETURN(MATCH_NOMATCH); 4595 eptr++; 4596 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4597 } 4598 break; 4599 4600 case OP_WORDCHAR: 4601 for (i = 1; i <= min; i++) 4602 { 4603 pcre_uint32 cc; 4604 if (eptr >= md->end_subject) 4605 { 4606 SCHECK_PARTIAL(); 4607 RRETURN(MATCH_NOMATCH); 4608 } 4609 cc = UCHAR21(eptr); 4610 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0) 4611 RRETURN(MATCH_NOMATCH); 4612 eptr++; 4613 /* No need to skip more bytes - we know it's a 1-byte character */ 4614 } 4615 break; 4616 4617 default: 4618 RRETURN(PCRE_ERROR_INTERNAL); 4619 } /* End switch(ctype) */ 4620 4621 else 4622 #endif /* SUPPORT_UTF */ 4623 4624 /* Code for the non-UTF-8 case for minimum matching of operators other 4625 than OP_PROP and OP_NOTPROP. 
*/ 4626 4627 switch(ctype) 4628 { 4629 case OP_ANY: 4630 for (i = 1; i <= min; i++) 4631 { 4632 if (eptr >= md->end_subject) 4633 { 4634 SCHECK_PARTIAL(); 4635 RRETURN(MATCH_NOMATCH); 4636 } 4637 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4638 if (md->partial != 0 && 4639 eptr + 1 >= md->end_subject && 4640 NLBLOCK->nltype == NLTYPE_FIXED && 4641 NLBLOCK->nllen == 2 && 4642 *eptr == NLBLOCK->nl[0]) 4643 { 4644 md->hitend = TRUE; 4645 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4646 } 4647 eptr++; 4648 } 4649 break; 4650 4651 case OP_ALLANY: 4652 if (eptr > md->end_subject - min) 4653 { 4654 SCHECK_PARTIAL(); 4655 RRETURN(MATCH_NOMATCH); 4656 } 4657 eptr += min; 4658 break; 4659 4660 case OP_ANYBYTE: 4661 if (eptr > md->end_subject - min) 4662 { 4663 SCHECK_PARTIAL(); 4664 RRETURN(MATCH_NOMATCH); 4665 } 4666 eptr += min; 4667 break; 4668 4669 case OP_ANYNL: 4670 for (i = 1; i <= min; i++) 4671 { 4672 if (eptr >= md->end_subject) 4673 { 4674 SCHECK_PARTIAL(); 4675 RRETURN(MATCH_NOMATCH); 4676 } 4677 switch(*eptr++) 4678 { 4679 default: RRETURN(MATCH_NOMATCH); 4680 4681 case CHAR_CR: 4682 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; 4683 break; 4684 4685 case CHAR_LF: 4686 break; 4687 4688 case CHAR_VT: 4689 case CHAR_FF: 4690 case CHAR_NEL: 4691 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4692 case 0x2028: 4693 case 0x2029: 4694 #endif 4695 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4696 break; 4697 } 4698 } 4699 break; 4700 4701 case OP_NOT_HSPACE: 4702 for (i = 1; i <= min; i++) 4703 { 4704 if (eptr >= md->end_subject) 4705 { 4706 SCHECK_PARTIAL(); 4707 RRETURN(MATCH_NOMATCH); 4708 } 4709 switch(*eptr++) 4710 { 4711 default: break; 4712 HSPACE_BYTE_CASES: 4713 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4714 HSPACE_MULTIBYTE_CASES: 4715 #endif 4716 RRETURN(MATCH_NOMATCH); 4717 } 4718 } 4719 break; 4720 4721 case OP_HSPACE: 4722 for (i = 1; i <= min; i++) 4723 { 4724 if (eptr >= md->end_subject) 4725 { 4726 SCHECK_PARTIAL(); 
4727 RRETURN(MATCH_NOMATCH); 4728 } 4729 switch(*eptr++) 4730 { 4731 default: RRETURN(MATCH_NOMATCH); 4732 HSPACE_BYTE_CASES: 4733 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4734 HSPACE_MULTIBYTE_CASES: 4735 #endif 4736 break; 4737 } 4738 } 4739 break; 4740 4741 case OP_NOT_VSPACE: 4742 for (i = 1; i <= min; i++) 4743 { 4744 if (eptr >= md->end_subject) 4745 { 4746 SCHECK_PARTIAL(); 4747 RRETURN(MATCH_NOMATCH); 4748 } 4749 switch(*eptr++) 4750 { 4751 VSPACE_BYTE_CASES: 4752 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4753 VSPACE_MULTIBYTE_CASES: 4754 #endif 4755 RRETURN(MATCH_NOMATCH); 4756 default: break; 4757 } 4758 } 4759 break; 4760 4761 case OP_VSPACE: 4762 for (i = 1; i <= min; i++) 4763 { 4764 if (eptr >= md->end_subject) 4765 { 4766 SCHECK_PARTIAL(); 4767 RRETURN(MATCH_NOMATCH); 4768 } 4769 switch(*eptr++) 4770 { 4771 default: RRETURN(MATCH_NOMATCH); 4772 VSPACE_BYTE_CASES: 4773 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 4774 VSPACE_MULTIBYTE_CASES: 4775 #endif 4776 break; 4777 } 4778 } 4779 break; 4780 4781 case OP_NOT_DIGIT: 4782 for (i = 1; i <= min; i++) 4783 { 4784 if (eptr >= md->end_subject) 4785 { 4786 SCHECK_PARTIAL(); 4787 RRETURN(MATCH_NOMATCH); 4788 } 4789 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) 4790 RRETURN(MATCH_NOMATCH); 4791 eptr++; 4792 } 4793 break; 4794 4795 case OP_DIGIT: 4796 for (i = 1; i <= min; i++) 4797 { 4798 if (eptr >= md->end_subject) 4799 { 4800 SCHECK_PARTIAL(); 4801 RRETURN(MATCH_NOMATCH); 4802 } 4803 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) 4804 RRETURN(MATCH_NOMATCH); 4805 eptr++; 4806 } 4807 break; 4808 4809 case OP_NOT_WHITESPACE: 4810 for (i = 1; i <= min; i++) 4811 { 4812 if (eptr >= md->end_subject) 4813 { 4814 SCHECK_PARTIAL(); 4815 RRETURN(MATCH_NOMATCH); 4816 } 4817 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) 4818 RRETURN(MATCH_NOMATCH); 4819 eptr++; 4820 } 4821 break; 4822 4823 case OP_WHITESPACE: 4824 for (i = 1; i <= min; 
i++) 4825 { 4826 if (eptr >= md->end_subject) 4827 { 4828 SCHECK_PARTIAL(); 4829 RRETURN(MATCH_NOMATCH); 4830 } 4831 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) 4832 RRETURN(MATCH_NOMATCH); 4833 eptr++; 4834 } 4835 break; 4836 4837 case OP_NOT_WORDCHAR: 4838 for (i = 1; i <= min; i++) 4839 { 4840 if (eptr >= md->end_subject) 4841 { 4842 SCHECK_PARTIAL(); 4843 RRETURN(MATCH_NOMATCH); 4844 } 4845 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) 4846 RRETURN(MATCH_NOMATCH); 4847 eptr++; 4848 } 4849 break; 4850 4851 case OP_WORDCHAR: 4852 for (i = 1; i <= min; i++) 4853 { 4854 if (eptr >= md->end_subject) 4855 { 4856 SCHECK_PARTIAL(); 4857 RRETURN(MATCH_NOMATCH); 4858 } 4859 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) 4860 RRETURN(MATCH_NOMATCH); 4861 eptr++; 4862 } 4863 break; 4864 4865 default: 4866 RRETURN(PCRE_ERROR_INTERNAL); 4867 } 4868 } 4869 4870 /* If min = max, continue at the same level without recursing */ 4871 4872 if (min == max) continue; 4873 4874 /* If minimizing, we have to test the rest of the pattern before each 4875 subsequent match. Again, separate the UTF-8 case for speed, and also 4876 separate the UCP cases. 
*/ 4877 4878 if (minimize) 4879 { 4880 #ifdef SUPPORT_UCP 4881 if (prop_type >= 0) 4882 { 4883 switch(prop_type) 4884 { 4885 case PT_ANY: 4886 for (fi = min;; fi++) 4887 { 4888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36); 4889 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4890 if (fi >= max) RRETURN(MATCH_NOMATCH); 4891 if (eptr >= md->end_subject) 4892 { 4893 SCHECK_PARTIAL(); 4894 RRETURN(MATCH_NOMATCH); 4895 } 4896 GETCHARINCTEST(c, eptr); 4897 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4898 } 4899 /* Control never gets here */ 4900 4901 case PT_LAMP: 4902 for (fi = min;; fi++) 4903 { 4904 int chartype; 4905 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37); 4906 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4907 if (fi >= max) RRETURN(MATCH_NOMATCH); 4908 if (eptr >= md->end_subject) 4909 { 4910 SCHECK_PARTIAL(); 4911 RRETURN(MATCH_NOMATCH); 4912 } 4913 GETCHARINCTEST(c, eptr); 4914 chartype = UCD_CHARTYPE(c); 4915 if ((chartype == ucp_Lu || 4916 chartype == ucp_Ll || 4917 chartype == ucp_Lt) == prop_fail_result) 4918 RRETURN(MATCH_NOMATCH); 4919 } 4920 /* Control never gets here */ 4921 4922 case PT_GC: 4923 for (fi = min;; fi++) 4924 { 4925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38); 4926 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4927 if (fi >= max) RRETURN(MATCH_NOMATCH); 4928 if (eptr >= md->end_subject) 4929 { 4930 SCHECK_PARTIAL(); 4931 RRETURN(MATCH_NOMATCH); 4932 } 4933 GETCHARINCTEST(c, eptr); 4934 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4935 RRETURN(MATCH_NOMATCH); 4936 } 4937 /* Control never gets here */ 4938 4939 case PT_PC: 4940 for (fi = min;; fi++) 4941 { 4942 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39); 4943 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4944 if (fi >= max) RRETURN(MATCH_NOMATCH); 4945 if (eptr >= md->end_subject) 4946 { 4947 SCHECK_PARTIAL(); 4948 RRETURN(MATCH_NOMATCH); 4949 } 4950 GETCHARINCTEST(c, eptr); 4951 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4952 RRETURN(MATCH_NOMATCH); 4953 } 4954 /* 
Control never gets here */ 4955 4956 case PT_SC: 4957 for (fi = min;; fi++) 4958 { 4959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40); 4960 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4961 if (fi >= max) RRETURN(MATCH_NOMATCH); 4962 if (eptr >= md->end_subject) 4963 { 4964 SCHECK_PARTIAL(); 4965 RRETURN(MATCH_NOMATCH); 4966 } 4967 GETCHARINCTEST(c, eptr); 4968 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4969 RRETURN(MATCH_NOMATCH); 4970 } 4971 /* Control never gets here */ 4972 4973 case PT_ALNUM: 4974 for (fi = min;; fi++) 4975 { 4976 int category; 4977 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59); 4978 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4979 if (fi >= max) RRETURN(MATCH_NOMATCH); 4980 if (eptr >= md->end_subject) 4981 { 4982 SCHECK_PARTIAL(); 4983 RRETURN(MATCH_NOMATCH); 4984 } 4985 GETCHARINCTEST(c, eptr); 4986 category = UCD_CATEGORY(c); 4987 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4988 RRETURN(MATCH_NOMATCH); 4989 } 4990 /* Control never gets here */ 4991 4992 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 4993 which means that Perl space and POSIX space are now identical. PCRE 4994 was changed at release 8.34. 
*/ 4995 4996 case PT_SPACE: /* Perl space */ 4997 case PT_PXSPACE: /* POSIX space */ 4998 for (fi = min;; fi++) 4999 { 5000 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61); 5001 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5002 if (fi >= max) RRETURN(MATCH_NOMATCH); 5003 if (eptr >= md->end_subject) 5004 { 5005 SCHECK_PARTIAL(); 5006 RRETURN(MATCH_NOMATCH); 5007 } 5008 GETCHARINCTEST(c, eptr); 5009 switch(c) 5010 { 5011 HSPACE_CASES: 5012 VSPACE_CASES: 5013 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 5014 break; 5015 5016 default: 5017 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5018 RRETURN(MATCH_NOMATCH); 5019 break; 5020 } 5021 } 5022 /* Control never gets here */ 5023 5024 case PT_WORD: 5025 for (fi = min;; fi++) 5026 { 5027 int category; 5028 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); 5029 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5030 if (fi >= max) RRETURN(MATCH_NOMATCH); 5031 if (eptr >= md->end_subject) 5032 { 5033 SCHECK_PARTIAL(); 5034 RRETURN(MATCH_NOMATCH); 5035 } 5036 GETCHARINCTEST(c, eptr); 5037 category = UCD_CATEGORY(c); 5038 if ((category == ucp_L || 5039 category == ucp_N || 5040 c == CHAR_UNDERSCORE) 5041 == prop_fail_result) 5042 RRETURN(MATCH_NOMATCH); 5043 } 5044 /* Control never gets here */ 5045 5046 case PT_CLIST: 5047 for (fi = min;; fi++) 5048 { 5049 const pcre_uint32 *cp; 5050 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67); 5051 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5052 if (fi >= max) RRETURN(MATCH_NOMATCH); 5053 if (eptr >= md->end_subject) 5054 { 5055 SCHECK_PARTIAL(); 5056 RRETURN(MATCH_NOMATCH); 5057 } 5058 GETCHARINCTEST(c, eptr); 5059 cp = PRIV(ucd_caseless_sets) + prop_value; 5060 for (;;) 5061 { 5062 if (c < *cp) 5063 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 5064 if (c == *cp++) 5065 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 5066 } 5067 } 5068 /* Control never gets here */ 5069 5070 case PT_UCNC: 5071 for (fi = min;; fi++) 5072 { 5073 RMATCH(eptr, ecode, 
offset_top, md, eptrb, RM60); 5074 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5075 if (fi >= max) RRETURN(MATCH_NOMATCH); 5076 if (eptr >= md->end_subject) 5077 { 5078 SCHECK_PARTIAL(); 5079 RRETURN(MATCH_NOMATCH); 5080 } 5081 GETCHARINCTEST(c, eptr); 5082 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5083 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5084 c >= 0xe000) == prop_fail_result) 5085 RRETURN(MATCH_NOMATCH); 5086 } 5087 /* Control never gets here */ 5088 5089 /* This should never occur */ 5090 default: 5091 RRETURN(PCRE_ERROR_INTERNAL); 5092 } 5093 } 5094 5095 /* Match extended Unicode sequences. We will get here only if the 5096 support is in the binary; otherwise a compile-time error occurs. */ 5097 5098 else if (ctype == OP_EXTUNI) 5099 { 5100 for (fi = min;; fi++) 5101 { 5102 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41); 5103 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5104 if (fi >= max) RRETURN(MATCH_NOMATCH); 5105 if (eptr >= md->end_subject) 5106 { 5107 SCHECK_PARTIAL(); 5108 RRETURN(MATCH_NOMATCH); 5109 } 5110 else 5111 { 5112 int lgb, rgb; 5113 GETCHARINCTEST(c, eptr); 5114 lgb = UCD_GRAPHBREAK(c); 5115 while (eptr < md->end_subject) 5116 { 5117 int len = 1; 5118 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5119 rgb = UCD_GRAPHBREAK(c); 5120 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5121 lgb = rgb; 5122 eptr += len; 5123 } 5124 } 5125 CHECK_PARTIAL(); 5126 } 5127 } 5128 else 5129 #endif /* SUPPORT_UCP */ 5130 5131 #ifdef SUPPORT_UTF 5132 if (utf) 5133 { 5134 for (fi = min;; fi++) 5135 { 5136 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42); 5137 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5138 if (fi >= max) RRETURN(MATCH_NOMATCH); 5139 if (eptr >= md->end_subject) 5140 { 5141 SCHECK_PARTIAL(); 5142 RRETURN(MATCH_NOMATCH); 5143 } 5144 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5145 RRETURN(MATCH_NOMATCH); 5146 GETCHARINC(c, eptr); 5147 switch(ctype) 5148 { 5149 case OP_ANY: /* This is the non-NL case */ 5150 
if (md->partial != 0 && /* Take care with CRLF partial */ 5151 eptr >= md->end_subject && 5152 NLBLOCK->nltype == NLTYPE_FIXED && 5153 NLBLOCK->nllen == 2 && 5154 c == NLBLOCK->nl[0]) 5155 { 5156 md->hitend = TRUE; 5157 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5158 } 5159 break; 5160 5161 case OP_ALLANY: 5162 case OP_ANYBYTE: 5163 break; 5164 5165 case OP_ANYNL: 5166 switch(c) 5167 { 5168 default: RRETURN(MATCH_NOMATCH); 5169 case CHAR_CR: 5170 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; 5171 break; 5172 5173 case CHAR_LF: 5174 break; 5175 5176 case CHAR_VT: 5177 case CHAR_FF: 5178 case CHAR_NEL: 5179 #ifndef EBCDIC 5180 case 0x2028: 5181 case 0x2029: 5182 #endif /* Not EBCDIC */ 5183 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5184 break; 5185 } 5186 break; 5187 5188 case OP_NOT_HSPACE: 5189 switch(c) 5190 { 5191 HSPACE_CASES: RRETURN(MATCH_NOMATCH); 5192 default: break; 5193 } 5194 break; 5195 5196 case OP_HSPACE: 5197 switch(c) 5198 { 5199 HSPACE_CASES: break; 5200 default: RRETURN(MATCH_NOMATCH); 5201 } 5202 break; 5203 5204 case OP_NOT_VSPACE: 5205 switch(c) 5206 { 5207 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 5208 default: break; 5209 } 5210 break; 5211 5212 case OP_VSPACE: 5213 switch(c) 5214 { 5215 VSPACE_CASES: break; 5216 default: RRETURN(MATCH_NOMATCH); 5217 } 5218 break; 5219 5220 case OP_NOT_DIGIT: 5221 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) 5222 RRETURN(MATCH_NOMATCH); 5223 break; 5224 5225 case OP_DIGIT: 5226 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) 5227 RRETURN(MATCH_NOMATCH); 5228 break; 5229 5230 case OP_NOT_WHITESPACE: 5231 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 5232 RRETURN(MATCH_NOMATCH); 5233 break; 5234 5235 case OP_WHITESPACE: 5236 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) 5237 RRETURN(MATCH_NOMATCH); 5238 break; 5239 5240 case OP_NOT_WORDCHAR: 5241 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) 5242 RRETURN(MATCH_NOMATCH); 5243 break; 5244 5245 case OP_WORDCHAR: 5246 
if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) 5247 RRETURN(MATCH_NOMATCH); 5248 break; 5249 5250 default: 5251 RRETURN(PCRE_ERROR_INTERNAL); 5252 } 5253 } 5254 } 5255 else 5256 #endif 5257 /* Not UTF mode */ 5258 { 5259 for (fi = min;; fi++) 5260 { 5261 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43); 5262 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5263 if (fi >= max) RRETURN(MATCH_NOMATCH); 5264 if (eptr >= md->end_subject) 5265 { 5266 SCHECK_PARTIAL(); 5267 RRETURN(MATCH_NOMATCH); 5268 } 5269 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5270 RRETURN(MATCH_NOMATCH); 5271 c = *eptr++; 5272 switch(ctype) 5273 { 5274 case OP_ANY: /* This is the non-NL case */ 5275 if (md->partial != 0 && /* Take care with CRLF partial */ 5276 eptr >= md->end_subject && 5277 NLBLOCK->nltype == NLTYPE_FIXED && 5278 NLBLOCK->nllen == 2 && 5279 c == NLBLOCK->nl[0]) 5280 { 5281 md->hitend = TRUE; 5282 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5283 } 5284 break; 5285 5286 case OP_ALLANY: 5287 case OP_ANYBYTE: 5288 break; 5289 5290 case OP_ANYNL: 5291 switch(c) 5292 { 5293 default: RRETURN(MATCH_NOMATCH); 5294 case CHAR_CR: 5295 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; 5296 break; 5297 5298 case CHAR_LF: 5299 break; 5300 5301 case CHAR_VT: 5302 case CHAR_FF: 5303 case CHAR_NEL: 5304 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5305 case 0x2028: 5306 case 0x2029: 5307 #endif 5308 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5309 break; 5310 } 5311 break; 5312 5313 case OP_NOT_HSPACE: 5314 switch(c) 5315 { 5316 default: break; 5317 HSPACE_BYTE_CASES: 5318 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5319 HSPACE_MULTIBYTE_CASES: 5320 #endif 5321 RRETURN(MATCH_NOMATCH); 5322 } 5323 break; 5324 5325 case OP_HSPACE: 5326 switch(c) 5327 { 5328 default: RRETURN(MATCH_NOMATCH); 5329 HSPACE_BYTE_CASES: 5330 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5331 HSPACE_MULTIBYTE_CASES: 5332 #endif 5333 break; 5334 } 5335 break; 5336 5337 case OP_NOT_VSPACE: 
5338 switch(c) 5339 { 5340 default: break; 5341 VSPACE_BYTE_CASES: 5342 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5343 VSPACE_MULTIBYTE_CASES: 5344 #endif 5345 RRETURN(MATCH_NOMATCH); 5346 } 5347 break; 5348 5349 case OP_VSPACE: 5350 switch(c) 5351 { 5352 default: RRETURN(MATCH_NOMATCH); 5353 VSPACE_BYTE_CASES: 5354 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5355 VSPACE_MULTIBYTE_CASES: 5356 #endif 5357 break; 5358 } 5359 break; 5360 5361 case OP_NOT_DIGIT: 5362 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 5363 break; 5364 5365 case OP_DIGIT: 5366 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 5367 break; 5368 5369 case OP_NOT_WHITESPACE: 5370 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 5371 break; 5372 5373 case OP_WHITESPACE: 5374 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 5375 break; 5376 5377 case OP_NOT_WORDCHAR: 5378 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 5379 break; 5380 5381 case OP_WORDCHAR: 5382 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 5383 break; 5384 5385 default: 5386 RRETURN(PCRE_ERROR_INTERNAL); 5387 } 5388 } 5389 } 5390 /* Control never gets here */ 5391 } 5392 5393 /* If maximizing, it is worth using inline code for speed, doing the type 5394 test once at the start (i.e. keep it out of the loop). Again, keep the 5395 UTF-8 and UCP stuff separate. 
*/ 5396 5397 else 5398 { 5399 pp = eptr; /* Remember where we started */ 5400 5401 #ifdef SUPPORT_UCP 5402 if (prop_type >= 0) 5403 { 5404 switch(prop_type) 5405 { 5406 case PT_ANY: 5407 for (i = min; i < max; i++) 5408 { 5409 int len = 1; 5410 if (eptr >= md->end_subject) 5411 { 5412 SCHECK_PARTIAL(); 5413 break; 5414 } 5415 GETCHARLENTEST(c, eptr, len); 5416 if (prop_fail_result) break; 5417 eptr+= len; 5418 } 5419 break; 5420 5421 case PT_LAMP: 5422 for (i = min; i < max; i++) 5423 { 5424 int chartype; 5425 int len = 1; 5426 if (eptr >= md->end_subject) 5427 { 5428 SCHECK_PARTIAL(); 5429 break; 5430 } 5431 GETCHARLENTEST(c, eptr, len); 5432 chartype = UCD_CHARTYPE(c); 5433 if ((chartype == ucp_Lu || 5434 chartype == ucp_Ll || 5435 chartype == ucp_Lt) == prop_fail_result) 5436 break; 5437 eptr+= len; 5438 } 5439 break; 5440 5441 case PT_GC: 5442 for (i = min; i < max; i++) 5443 { 5444 int len = 1; 5445 if (eptr >= md->end_subject) 5446 { 5447 SCHECK_PARTIAL(); 5448 break; 5449 } 5450 GETCHARLENTEST(c, eptr, len); 5451 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break; 5452 eptr+= len; 5453 } 5454 break; 5455 5456 case PT_PC: 5457 for (i = min; i < max; i++) 5458 { 5459 int len = 1; 5460 if (eptr >= md->end_subject) 5461 { 5462 SCHECK_PARTIAL(); 5463 break; 5464 } 5465 GETCHARLENTEST(c, eptr, len); 5466 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break; 5467 eptr+= len; 5468 } 5469 break; 5470 5471 case PT_SC: 5472 for (i = min; i < max; i++) 5473 { 5474 int len = 1; 5475 if (eptr >= md->end_subject) 5476 { 5477 SCHECK_PARTIAL(); 5478 break; 5479 } 5480 GETCHARLENTEST(c, eptr, len); 5481 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break; 5482 eptr+= len; 5483 } 5484 break; 5485 5486 case PT_ALNUM: 5487 for (i = min; i < max; i++) 5488 { 5489 int category; 5490 int len = 1; 5491 if (eptr >= md->end_subject) 5492 { 5493 SCHECK_PARTIAL(); 5494 break; 5495 } 5496 GETCHARLENTEST(c, eptr, len); 5497 category = UCD_CATEGORY(c); 5498 
if ((category == ucp_L || category == ucp_N) == prop_fail_result) 5499 break; 5500 eptr+= len; 5501 } 5502 break; 5503 5504 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 5505 which means that Perl space and POSIX space are now identical. PCRE 5506 was changed at release 8.34. */ 5507 5508 case PT_SPACE: /* Perl space */ 5509 case PT_PXSPACE: /* POSIX space */ 5510 for (i = min; i < max; i++) 5511 { 5512 int len = 1; 5513 if (eptr >= md->end_subject) 5514 { 5515 SCHECK_PARTIAL(); 5516 break; 5517 } 5518 GETCHARLENTEST(c, eptr, len); 5519 switch(c) 5520 { 5521 HSPACE_CASES: 5522 VSPACE_CASES: 5523 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ 5524 break; 5525 5526 default: 5527 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5528 goto ENDLOOP99; /* Break the loop */ 5529 break; 5530 } 5531 eptr+= len; 5532 } 5533 ENDLOOP99: 5534 break; 5535 5536 case PT_WORD: 5537 for (i = min; i < max; i++) 5538 { 5539 int category; 5540 int len = 1; 5541 if (eptr >= md->end_subject) 5542 { 5543 SCHECK_PARTIAL(); 5544 break; 5545 } 5546 GETCHARLENTEST(c, eptr, len); 5547 category = UCD_CATEGORY(c); 5548 if ((category == ucp_L || category == ucp_N || 5549 c == CHAR_UNDERSCORE) == prop_fail_result) 5550 break; 5551 eptr+= len; 5552 } 5553 break; 5554 5555 case PT_CLIST: 5556 for (i = min; i < max; i++) 5557 { 5558 const pcre_uint32 *cp; 5559 int len = 1; 5560 if (eptr >= md->end_subject) 5561 { 5562 SCHECK_PARTIAL(); 5563 break; 5564 } 5565 GETCHARLENTEST(c, eptr, len); 5566 cp = PRIV(ucd_caseless_sets) + prop_value; 5567 for (;;) 5568 { 5569 if (c < *cp) 5570 { if (prop_fail_result) break; else goto GOT_MAX; } 5571 if (c == *cp++) 5572 { if (prop_fail_result) goto GOT_MAX; else break; } 5573 } 5574 eptr += len; 5575 } 5576 GOT_MAX: 5577 break; 5578 5579 case PT_UCNC: 5580 for (i = min; i < max; i++) 5581 { 5582 int len = 1; 5583 if (eptr >= md->end_subject) 5584 { 5585 SCHECK_PARTIAL(); 5586 break; 5587 } 5588 GETCHARLENTEST(c, eptr, len); 5589 
if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5590 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5591 c >= 0xe000) == prop_fail_result) 5592 break; 5593 eptr += len; 5594 } 5595 break; 5596 5597 default: 5598 RRETURN(PCRE_ERROR_INTERNAL); 5599 } 5600 5601 /* eptr is now past the end of the maximum run */ 5602 5603 if (possessive) continue; /* No backtracking */ 5604 for(;;) 5605 { 5606 if (eptr == pp) goto TAIL_RECURSE; 5607 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); 5608 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5609 eptr--; 5610 if (utf) BACKCHAR(eptr); 5611 } 5612 } 5613 5614 /* Match extended Unicode grapheme clusters. We will get here only if the 5615 support is in the binary; otherwise a compile-time error occurs. */ 5616 5617 else if (ctype == OP_EXTUNI) 5618 { 5619 for (i = min; i < max; i++) 5620 { 5621 if (eptr >= md->end_subject) 5622 { 5623 SCHECK_PARTIAL(); 5624 break; 5625 } 5626 else 5627 { 5628 int lgb, rgb; 5629 GETCHARINCTEST(c, eptr); 5630 lgb = UCD_GRAPHBREAK(c); 5631 while (eptr < md->end_subject) 5632 { 5633 int len = 1; 5634 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5635 rgb = UCD_GRAPHBREAK(c); 5636 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5637 lgb = rgb; 5638 eptr += len; 5639 } 5640 } 5641 CHECK_PARTIAL(); 5642 } 5643 5644 /* eptr is now past the end of the maximum run */ 5645 5646 if (possessive) continue; /* No backtracking */ 5647 5648 for(;;) 5649 { 5650 int lgb, rgb; 5651 PCRE_PUCHAR fptr; 5652 5653 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ 5654 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); 5655 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5656 5657 /* Backtracking over an extended grapheme cluster involves inspecting 5658 the previous two characters (if present) to see if a break is 5659 permitted between them. 
*/ 5660 5661 eptr--; 5662 if (!utf) c = *eptr; else 5663 { 5664 BACKCHAR(eptr); 5665 GETCHAR(c, eptr); 5666 } 5667 rgb = UCD_GRAPHBREAK(c); 5668 5669 for (;;) 5670 { 5671 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ 5672 fptr = eptr - 1; 5673 if (!utf) c = *fptr; else 5674 { 5675 BACKCHAR(fptr); 5676 GETCHAR(c, fptr); 5677 } 5678 lgb = UCD_GRAPHBREAK(c); 5679 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5680 eptr = fptr; 5681 rgb = lgb; 5682 } 5683 } 5684 } 5685 5686 else 5687 #endif /* SUPPORT_UCP */ 5688 5689 #ifdef SUPPORT_UTF 5690 if (utf) 5691 { 5692 switch(ctype) 5693 { 5694 case OP_ANY: 5695 for (i = min; i < max; i++) 5696 { 5697 if (eptr >= md->end_subject) 5698 { 5699 SCHECK_PARTIAL(); 5700 break; 5701 } 5702 if (IS_NEWLINE(eptr)) break; 5703 if (md->partial != 0 && /* Take care with CRLF partial */ 5704 eptr + 1 >= md->end_subject && 5705 NLBLOCK->nltype == NLTYPE_FIXED && 5706 NLBLOCK->nllen == 2 && 5707 UCHAR21(eptr) == NLBLOCK->nl[0]) 5708 { 5709 md->hitend = TRUE; 5710 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5711 } 5712 eptr++; 5713 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5714 } 5715 break; 5716 5717 case OP_ALLANY: 5718 if (max < INT_MAX) 5719 { 5720 for (i = min; i < max; i++) 5721 { 5722 if (eptr >= md->end_subject) 5723 { 5724 SCHECK_PARTIAL(); 5725 break; 5726 } 5727 eptr++; 5728 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5729 } 5730 } 5731 else 5732 { 5733 eptr = md->end_subject; /* Unlimited UTF-8 repeat */ 5734 SCHECK_PARTIAL(); 5735 } 5736 break; 5737 5738 /* The byte case is the same as non-UTF8 */ 5739 5740 case OP_ANYBYTE: 5741 c = max - min; 5742 if (c > (unsigned int)(md->end_subject - eptr)) 5743 { 5744 eptr = md->end_subject; 5745 SCHECK_PARTIAL(); 5746 } 5747 else eptr += c; 5748 break; 5749 5750 case OP_ANYNL: 5751 for (i = min; i < max; i++) 5752 { 5753 int len = 1; 5754 if (eptr >= md->end_subject) 5755 { 5756 SCHECK_PARTIAL(); 5757 break; 5758 } 5759 GETCHARLEN(c, eptr, 
len); 5760 if (c == CHAR_CR) 5761 { 5762 if (++eptr >= md->end_subject) break; 5763 if (UCHAR21(eptr) == CHAR_LF) eptr++; 5764 } 5765 else 5766 { 5767 if (c != CHAR_LF && 5768 (md->bsr_anycrlf || 5769 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 5770 #ifndef EBCDIC 5771 && c != 0x2028 && c != 0x2029 5772 #endif /* Not EBCDIC */ 5773 ))) 5774 break; 5775 eptr += len; 5776 } 5777 } 5778 break; 5779 5780 case OP_NOT_HSPACE: 5781 case OP_HSPACE: 5782 for (i = min; i < max; i++) 5783 { 5784 BOOL gotspace; 5785 int len = 1; 5786 if (eptr >= md->end_subject) 5787 { 5788 SCHECK_PARTIAL(); 5789 break; 5790 } 5791 GETCHARLEN(c, eptr, len); 5792 switch(c) 5793 { 5794 HSPACE_CASES: gotspace = TRUE; break; 5795 default: gotspace = FALSE; break; 5796 } 5797 if (gotspace == (ctype == OP_NOT_HSPACE)) break; 5798 eptr += len; 5799 } 5800 break; 5801 5802 case OP_NOT_VSPACE: 5803 case OP_VSPACE: 5804 for (i = min; i < max; i++) 5805 { 5806 BOOL gotspace; 5807 int len = 1; 5808 if (eptr >= md->end_subject) 5809 { 5810 SCHECK_PARTIAL(); 5811 break; 5812 } 5813 GETCHARLEN(c, eptr, len); 5814 switch(c) 5815 { 5816 VSPACE_CASES: gotspace = TRUE; break; 5817 default: gotspace = FALSE; break; 5818 } 5819 if (gotspace == (ctype == OP_NOT_VSPACE)) break; 5820 eptr += len; 5821 } 5822 break; 5823 5824 case OP_NOT_DIGIT: 5825 for (i = min; i < max; i++) 5826 { 5827 int len = 1; 5828 if (eptr >= md->end_subject) 5829 { 5830 SCHECK_PARTIAL(); 5831 break; 5832 } 5833 GETCHARLEN(c, eptr, len); 5834 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; 5835 eptr+= len; 5836 } 5837 break; 5838 5839 case OP_DIGIT: 5840 for (i = min; i < max; i++) 5841 { 5842 int len = 1; 5843 if (eptr >= md->end_subject) 5844 { 5845 SCHECK_PARTIAL(); 5846 break; 5847 } 5848 GETCHARLEN(c, eptr, len); 5849 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; 5850 eptr+= len; 5851 } 5852 break; 5853 5854 case OP_NOT_WHITESPACE: 5855 for (i = min; i < max; i++) 5856 { 5857 int len = 1; 5858 if (eptr >= 
md->end_subject) 5859 { 5860 SCHECK_PARTIAL(); 5861 break; 5862 } 5863 GETCHARLEN(c, eptr, len); 5864 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; 5865 eptr+= len; 5866 } 5867 break; 5868 5869 case OP_WHITESPACE: 5870 for (i = min; i < max; i++) 5871 { 5872 int len = 1; 5873 if (eptr >= md->end_subject) 5874 { 5875 SCHECK_PARTIAL(); 5876 break; 5877 } 5878 GETCHARLEN(c, eptr, len); 5879 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; 5880 eptr+= len; 5881 } 5882 break; 5883 5884 case OP_NOT_WORDCHAR: 5885 for (i = min; i < max; i++) 5886 { 5887 int len = 1; 5888 if (eptr >= md->end_subject) 5889 { 5890 SCHECK_PARTIAL(); 5891 break; 5892 } 5893 GETCHARLEN(c, eptr, len); 5894 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; 5895 eptr+= len; 5896 } 5897 break; 5898 5899 case OP_WORDCHAR: 5900 for (i = min; i < max; i++) 5901 { 5902 int len = 1; 5903 if (eptr >= md->end_subject) 5904 { 5905 SCHECK_PARTIAL(); 5906 break; 5907 } 5908 GETCHARLEN(c, eptr, len); 5909 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; 5910 eptr+= len; 5911 } 5912 break; 5913 5914 default: 5915 RRETURN(PCRE_ERROR_INTERNAL); 5916 } 5917 5918 if (possessive) continue; /* No backtracking */ 5919 for(;;) 5920 { 5921 if (eptr == pp) goto TAIL_RECURSE; 5922 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); 5923 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5924 eptr--; 5925 BACKCHAR(eptr); 5926 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL && 5927 UCHAR21(eptr - 1) == CHAR_CR) eptr--; 5928 } 5929 } 5930 else 5931 #endif /* SUPPORT_UTF */ 5932 /* Not UTF mode */ 5933 { 5934 switch(ctype) 5935 { 5936 case OP_ANY: 5937 for (i = min; i < max; i++) 5938 { 5939 if (eptr >= md->end_subject) 5940 { 5941 SCHECK_PARTIAL(); 5942 break; 5943 } 5944 if (IS_NEWLINE(eptr)) break; 5945 if (md->partial != 0 && /* Take care with CRLF partial */ 5946 eptr + 1 >= md->end_subject && 5947 NLBLOCK->nltype == NLTYPE_FIXED && 5948 NLBLOCK->nllen == 2 && 5949 *eptr == 
NLBLOCK->nl[0]) 5950 { 5951 md->hitend = TRUE; 5952 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5953 } 5954 eptr++; 5955 } 5956 break; 5957 5958 case OP_ALLANY: 5959 case OP_ANYBYTE: 5960 c = max - min; 5961 if (c > (unsigned int)(md->end_subject - eptr)) 5962 { 5963 eptr = md->end_subject; 5964 SCHECK_PARTIAL(); 5965 } 5966 else eptr += c; 5967 break; 5968 5969 case OP_ANYNL: 5970 for (i = min; i < max; i++) 5971 { 5972 if (eptr >= md->end_subject) 5973 { 5974 SCHECK_PARTIAL(); 5975 break; 5976 } 5977 c = *eptr; 5978 if (c == CHAR_CR) 5979 { 5980 if (++eptr >= md->end_subject) break; 5981 if (*eptr == CHAR_LF) eptr++; 5982 } 5983 else 5984 { 5985 if (c != CHAR_LF && (md->bsr_anycrlf || 5986 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 5987 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 5988 && c != 0x2028 && c != 0x2029 5989 #endif 5990 ))) break; 5991 eptr++; 5992 } 5993 } 5994 break; 5995 5996 case OP_NOT_HSPACE: 5997 for (i = min; i < max; i++) 5998 { 5999 if (eptr >= md->end_subject) 6000 { 6001 SCHECK_PARTIAL(); 6002 break; 6003 } 6004 switch(*eptr) 6005 { 6006 default: eptr++; break; 6007 HSPACE_BYTE_CASES: 6008 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6009 HSPACE_MULTIBYTE_CASES: 6010 #endif 6011 goto ENDLOOP00; 6012 } 6013 } 6014 ENDLOOP00: 6015 break; 6016 6017 case OP_HSPACE: 6018 for (i = min; i < max; i++) 6019 { 6020 if (eptr >= md->end_subject) 6021 { 6022 SCHECK_PARTIAL(); 6023 break; 6024 } 6025 switch(*eptr) 6026 { 6027 default: goto ENDLOOP01; 6028 HSPACE_BYTE_CASES: 6029 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6030 HSPACE_MULTIBYTE_CASES: 6031 #endif 6032 eptr++; break; 6033 } 6034 } 6035 ENDLOOP01: 6036 break; 6037 6038 case OP_NOT_VSPACE: 6039 for (i = min; i < max; i++) 6040 { 6041 if (eptr >= md->end_subject) 6042 { 6043 SCHECK_PARTIAL(); 6044 break; 6045 } 6046 switch(*eptr) 6047 { 6048 default: eptr++; break; 6049 VSPACE_BYTE_CASES: 6050 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6051 
VSPACE_MULTIBYTE_CASES: 6052 #endif 6053 goto ENDLOOP02; 6054 } 6055 } 6056 ENDLOOP02: 6057 break; 6058 6059 case OP_VSPACE: 6060 for (i = min; i < max; i++) 6061 { 6062 if (eptr >= md->end_subject) 6063 { 6064 SCHECK_PARTIAL(); 6065 break; 6066 } 6067 switch(*eptr) 6068 { 6069 default: goto ENDLOOP03; 6070 VSPACE_BYTE_CASES: 6071 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 6072 VSPACE_MULTIBYTE_CASES: 6073 #endif 6074 eptr++; break; 6075 } 6076 } 6077 ENDLOOP03: 6078 break; 6079 6080 case OP_NOT_DIGIT: 6081 for (i = min; i < max; i++) 6082 { 6083 if (eptr >= md->end_subject) 6084 { 6085 SCHECK_PARTIAL(); 6086 break; 6087 } 6088 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break; 6089 eptr++; 6090 } 6091 break; 6092 6093 case OP_DIGIT: 6094 for (i = min; i < max; i++) 6095 { 6096 if (eptr >= md->end_subject) 6097 { 6098 SCHECK_PARTIAL(); 6099 break; 6100 } 6101 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break; 6102 eptr++; 6103 } 6104 break; 6105 6106 case OP_NOT_WHITESPACE: 6107 for (i = min; i < max; i++) 6108 { 6109 if (eptr >= md->end_subject) 6110 { 6111 SCHECK_PARTIAL(); 6112 break; 6113 } 6114 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break; 6115 eptr++; 6116 } 6117 break; 6118 6119 case OP_WHITESPACE: 6120 for (i = min; i < max; i++) 6121 { 6122 if (eptr >= md->end_subject) 6123 { 6124 SCHECK_PARTIAL(); 6125 break; 6126 } 6127 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break; 6128 eptr++; 6129 } 6130 break; 6131 6132 case OP_NOT_WORDCHAR: 6133 for (i = min; i < max; i++) 6134 { 6135 if (eptr >= md->end_subject) 6136 { 6137 SCHECK_PARTIAL(); 6138 break; 6139 } 6140 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break; 6141 eptr++; 6142 } 6143 break; 6144 6145 case OP_WORDCHAR: 6146 for (i = min; i < max; i++) 6147 { 6148 if (eptr >= md->end_subject) 6149 { 6150 SCHECK_PARTIAL(); 6151 break; 6152 } 6153 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 
0) break; 6154 eptr++; 6155 } 6156 break; 6157 6158 default: 6159 RRETURN(PCRE_ERROR_INTERNAL); 6160 } 6161 6162 if (possessive) continue; /* No backtracking */ 6163 for (;;) 6164 { 6165 if (eptr == pp) goto TAIL_RECURSE; 6166 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); 6167 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6168 eptr--; 6169 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && 6170 eptr[-1] == CHAR_CR) eptr--; 6171 } 6172 } 6173 6174 /* Control never gets here */ 6175 } 6176 6177 /* There's been some horrible disaster. Arrival here can only mean there is 6178 something seriously wrong in the code above or the OP_xxx definitions. */ 6179 6180 default: 6181 DPRINTF(("Unknown opcode %d\n", *ecode)); 6182 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); 6183 } 6184 6185 /* Do not stick any code in here without much thought; it is assumed 6186 that "continue" in the code above comes out to here to repeat the main 6187 loop. */ 6188 6189 } /* End of main loop */ 6190 /* Control never reaches here */ 6191 6192 6193 /* When compiling to use the heap rather than the stack for recursive calls to 6194 match(), the RRETURN() macro jumps here. The number that is saved in 6195 frame->Xwhere indicates which label we actually want to return to. 
*/ 6196 6197 #ifdef NO_RECURSE 6198 #define LBL(val) case val: goto L_RM##val; 6199 HEAP_RETURN: 6200 switch (frame->Xwhere) 6201 { 6202 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 6203 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) 6204 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) 6205 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) 6206 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) 6207 LBL(65) LBL(66) 6208 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 6209 LBL(20) LBL(21) 6210 #endif 6211 #ifdef SUPPORT_UTF 6212 LBL(16) LBL(18) 6213 LBL(22) LBL(23) LBL(28) LBL(30) 6214 LBL(32) LBL(34) LBL(42) LBL(46) 6215 #ifdef SUPPORT_UCP 6216 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) 6217 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) 6218 #endif /* SUPPORT_UCP */ 6219 #endif /* SUPPORT_UTF */ 6220 default: 6221 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); 6222 return PCRE_ERROR_INTERNAL; 6223 } 6224 #undef LBL 6225 #endif /* NO_RECURSE */ 6226 } 6227 6228 6229 /*************************************************************************** 6230 **************************************************************************** 6231 RECURSION IN THE match() FUNCTION 6232 6233 Undefine all the macros that were defined above to handle this. 
*/ 6234 6235 #ifdef NO_RECURSE 6236 #undef eptr 6237 #undef ecode 6238 #undef mstart 6239 #undef offset_top 6240 #undef eptrb 6241 #undef flags 6242 6243 #undef callpat 6244 #undef charptr 6245 #undef data 6246 #undef next 6247 #undef pp 6248 #undef prev 6249 #undef saved_eptr 6250 6251 #undef new_recursive 6252 6253 #undef cur_is_word 6254 #undef condition 6255 #undef prev_is_word 6256 6257 #undef ctype 6258 #undef length 6259 #undef max 6260 #undef min 6261 #undef number 6262 #undef offset 6263 #undef op 6264 #undef save_capture_last 6265 #undef save_offset1 6266 #undef save_offset2 6267 #undef save_offset3 6268 #undef stacksave 6269 6270 #undef newptrb 6271 6272 #endif 6273 6274 /* These two are defined as macros in both cases */ 6275 6276 #undef fc 6277 #undef fi 6278 6279 /*************************************************************************** 6280 ***************************************************************************/ 6281 6282 6283 #ifdef NO_RECURSE 6284 /************************************************* 6285 * Release allocated heap frames * 6286 *************************************************/ 6287 6288 /* This function releases all the allocated frames. The base frame is on the 6289 machine stack, and so must not be freed. 6290 6291 Argument: the address of the base frame 6292 Returns: nothing 6293 */ 6294 6295 static void 6296 release_match_heapframes (heapframe *frame_base) 6297 { 6298 heapframe *nextframe = frame_base->Xnextframe; 6299 while (nextframe != NULL) 6300 { 6301 heapframe *oldframe = nextframe; 6302 nextframe = nextframe->Xnextframe; 6303 (PUBL(stack_free))(oldframe); 6304 } 6305 } 6306 #endif 6307 6308 6309 /************************************************* 6310 * Execute a Regular Expression * 6311 *************************************************/ 6312 6313 /* This function applies a compiled re to a subject string and picks out 6314 portions of the string if it matches. 
Two elements in the vector are set for 6315 each substring: the offsets to the start and end of the substring. 6316 6317 Arguments: 6318 argument_re points to the compiled expression 6319 extra_data points to extra data or is NULL 6320 subject points to the subject string 6321 length length of subject string (may contain binary zeros) 6322 start_offset where to start in the subject string 6323 options option bits 6324 offsets points to a vector of ints to be filled in with offsets 6325 offsetcount the number of elements in the vector 6326 6327 Returns: > 0 => success; value is the number of elements filled in 6328 = 0 => success, but offsets is not big enough 6329 -1 => failed to match 6330 < -1 => some kind of unexpected problem 6331 */ 6332 6333 #if defined COMPILE_PCRE8 6334 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6335 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, 6336 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, 6337 int offsetcount) 6338 #elif defined COMPILE_PCRE16 6339 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6340 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, 6341 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, 6342 int offsetcount) 6343 #elif defined COMPILE_PCRE32 6344 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6345 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, 6346 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets, 6347 int offsetcount) 6348 #endif 6349 { 6350 int rc, ocount, arg_offset_max; 6351 int newline; 6352 BOOL using_temporary_offsets = FALSE; 6353 BOOL anchored; 6354 BOOL startline; 6355 BOOL firstline; 6356 BOOL utf; 6357 BOOL has_first_char = FALSE; 6358 BOOL has_req_char = FALSE; 6359 pcre_uchar first_char = 0; 6360 pcre_uchar first_char2 = 0; 6361 pcre_uchar req_char = 0; 6362 pcre_uchar req_char2 = 0; 6363 match_data match_block; 6364 match_data *md = &match_block; 6365 const pcre_uint8 
*tables; 6366 const pcre_uint8 *start_bits = NULL; 6367 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; 6368 PCRE_PUCHAR end_subject; 6369 PCRE_PUCHAR start_partial = NULL; 6370 PCRE_PUCHAR match_partial = NULL; 6371 PCRE_PUCHAR req_char_ptr = start_match - 1; 6372 6373 const pcre_study_data *study; 6374 const REAL_PCRE *re = (const REAL_PCRE *)argument_re; 6375 6376 #ifdef NO_RECURSE 6377 heapframe frame_zero; 6378 frame_zero.Xprevframe = NULL; /* Marks the top level */ 6379 frame_zero.Xnextframe = NULL; /* None are allocated yet */ 6380 md->match_frames_base = &frame_zero; 6381 #endif 6382 6383 /* Check for the special magic call that measures the size of the stack used 6384 per recursive call of match(). Without the funny casting for sizeof, a Windows 6385 compiler gave this error: "unary minus operator applied to unsigned type, 6386 result still unsigned". Hopefully the cast fixes that. */ 6387 6388 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && 6389 start_offset == -999) 6390 #ifdef NO_RECURSE 6391 return -((int)sizeof(heapframe)); 6392 #else 6393 return match(NULL, NULL, NULL, 0, NULL, NULL, 0); 6394 #endif 6395 6396 /* Plausibility checks */ 6397 6398 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 6399 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) 6400 return PCRE_ERROR_NULL; 6401 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 6402 if (length < 0) return PCRE_ERROR_BADLENGTH; 6403 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; 6404 6405 /* Check that the first field in the block is the magic number. If it is not, 6406 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to 6407 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which 6408 means that the pattern is likely compiled with different endianness. 
*/ 6409 6410 if (re->magic_number != MAGIC_NUMBER) 6411 return re->magic_number == REVERSED_MAGIC_NUMBER? 6412 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; 6413 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; 6414 6415 /* These two settings are used in the code for checking a UTF-8 string that 6416 follows immediately afterwards. Other values in the md block are used only 6417 during "normal" pcre_exec() processing, not when the JIT support is in use, 6418 so they are set up later. */ 6419 6420 /* PCRE_UTF16 has the same value as PCRE_UTF8. */ 6421 utf = md->utf = (re->options & PCRE_UTF8) != 0; 6422 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : 6423 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; 6424 6425 /* Check a UTF-8 string if required. Pass back the character offset and error 6426 code for an invalid string if a results vector is available. */ 6427 6428 #ifdef SUPPORT_UTF 6429 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) 6430 { 6431 int erroroffset; 6432 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); 6433 if (errorcode != 0) 6434 { 6435 if (offsetcount >= 2) 6436 { 6437 offsets[0] = erroroffset; 6438 offsets[1] = errorcode; 6439 } 6440 #if defined COMPILE_PCRE8 6441 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? 6442 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; 6443 #elif defined COMPILE_PCRE16 6444 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? 6445 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; 6446 #elif defined COMPILE_PCRE32 6447 return PCRE_ERROR_BADUTF32; 6448 #endif 6449 } 6450 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16 6451 /* Check that a start_offset points to the start of a UTF character. 
*/ 6452 if (start_offset > 0 && start_offset < length && 6453 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) 6454 return PCRE_ERROR_BADUTF8_OFFSET; 6455 #endif 6456 } 6457 #endif 6458 6459 /* If the pattern was successfully studied with JIT support, run the JIT 6460 executable instead of the rest of this function. Most options must be set at 6461 compile time for the JIT code to be usable. Fallback to the normal code path if 6462 an unsupported flag is set. */ 6463 6464 #ifdef SUPPORT_JIT 6465 if (extra_data != NULL 6466 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT | 6467 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT 6468 && extra_data->executable_jit != NULL 6469 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0) 6470 { 6471 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length, 6472 start_offset, options, offsets, offsetcount); 6473 6474 /* PCRE_ERROR_NULL means that the selected normal or partial matching 6475 mode is not compiled. In this case we simply fallback to interpreter. */ 6476 6477 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc; 6478 } 6479 #endif 6480 6481 /* Carry on with non-JIT matching. This information is for finding all the 6482 numbers associated with a given name, for condition testing. */ 6483 6484 md->name_table = (pcre_uchar *)re + re->name_table_offset; 6485 md->name_count = re->name_count; 6486 md->name_entry_size = re->name_entry_size; 6487 6488 /* Fish out the optional data from the extra_data structure, first setting 6489 the default values. */ 6490 6491 study = NULL; 6492 md->match_limit = MATCH_LIMIT; 6493 md->match_limit_recursion = MATCH_LIMIT_RECURSION; 6494 md->callout_data = NULL; 6495 6496 /* The table pointer is always in native byte order. */ 6497 6498 tables = re->tables; 6499 6500 /* The two limit values override the defaults, whatever their value. 
*/ 6501 6502 if (extra_data != NULL) 6503 { 6504 unsigned long int flags = extra_data->flags; 6505 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 6506 study = (const pcre_study_data *)extra_data->study_data; 6507 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 6508 md->match_limit = extra_data->match_limit; 6509 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 6510 md->match_limit_recursion = extra_data->match_limit_recursion; 6511 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 6512 md->callout_data = extra_data->callout_data; 6513 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; 6514 } 6515 6516 /* Limits in the regex override only if they are smaller. */ 6517 6518 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit) 6519 md->match_limit = re->limit_match; 6520 6521 if ((re->flags & PCRE_RLSET) != 0 && 6522 re->limit_recursion < md->match_limit_recursion) 6523 md->match_limit_recursion = re->limit_recursion; 6524 6525 /* If the exec call supplied NULL for tables, use the inbuilt ones. This 6526 is a feature that makes it possible to save compiled regex and re-use them 6527 in other programs later. */ 6528 6529 if (tables == NULL) tables = PRIV(default_tables); 6530 6531 /* Set up other data */ 6532 6533 anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 6534 startline = (re->flags & PCRE_STARTLINE) != 0; 6535 firstline = (re->options & PCRE_FIRSTLINE) != 0; 6536 6537 /* The code starts after the real_pcre block and the capture name table. 
*/ 6538 6539 md->start_code = (const pcre_uchar *)re + re->name_table_offset + 6540 re->name_count * re->name_entry_size; 6541 6542 md->start_subject = (PCRE_PUCHAR)subject; 6543 md->start_offset = start_offset; 6544 md->end_subject = md->start_subject + length; 6545 end_subject = md->end_subject; 6546 6547 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 6548 md->use_ucp = (re->options & PCRE_UCP) != 0; 6549 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; 6550 md->ignore_skip_arg = 0; 6551 6552 /* Some options are unpacked into BOOL variables in the hope that testing 6553 them will be faster than individual option bits. */ 6554 6555 md->notbol = (options & PCRE_NOTBOL) != 0; 6556 md->noteol = (options & PCRE_NOTEOL) != 0; 6557 md->notempty = (options & PCRE_NOTEMPTY) != 0; 6558 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; 6559 6560 md->hitend = FALSE; 6561 md->mark = md->nomatch_mark = NULL; /* In case never set */ 6562 6563 md->recursive = NULL; /* No recursion at top level */ 6564 md->hasthen = (re->flags & PCRE_HASTHEN) != 0; 6565 6566 md->lcc = tables + lcc_offset; 6567 md->fcc = tables + fcc_offset; 6568 md->ctypes = tables + ctypes_offset; 6569 6570 /* Handle different \R options. */ 6571 6572 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 6573 { 6574 case 0: 6575 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 6576 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; 6577 else 6578 #ifdef BSR_ANYCRLF 6579 md->bsr_anycrlf = TRUE; 6580 #else 6581 md->bsr_anycrlf = FALSE; 6582 #endif 6583 break; 6584 6585 case PCRE_BSR_ANYCRLF: 6586 md->bsr_anycrlf = TRUE; 6587 break; 6588 6589 case PCRE_BSR_UNICODE: 6590 md->bsr_anycrlf = FALSE; 6591 break; 6592 6593 default: return PCRE_ERROR_BADNEWLINE; 6594 } 6595 6596 /* Handle different types of newline. The three bits give eight cases. If 6597 nothing is set at run time, whatever was used at compile time applies. 
*/ 6598 6599 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : 6600 (pcre_uint32)options) & PCRE_NEWLINE_BITS) 6601 { 6602 case 0: newline = NEWLINE; break; /* Compile-time default */ 6603 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 6604 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 6605 case PCRE_NEWLINE_CR+ 6606 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 6607 case PCRE_NEWLINE_ANY: newline = -1; break; 6608 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 6609 default: return PCRE_ERROR_BADNEWLINE; 6610 } 6611 6612 if (newline == -2) 6613 { 6614 md->nltype = NLTYPE_ANYCRLF; 6615 } 6616 else if (newline < 0) 6617 { 6618 md->nltype = NLTYPE_ANY; 6619 } 6620 else 6621 { 6622 md->nltype = NLTYPE_FIXED; 6623 if (newline > 255) 6624 { 6625 md->nllen = 2; 6626 md->nl[0] = (newline >> 8) & 255; 6627 md->nl[1] = newline & 255; 6628 } 6629 else 6630 { 6631 md->nllen = 1; 6632 md->nl[0] = newline; 6633 } 6634 } 6635 6636 /* Partial matching was originally supported only for a restricted set of 6637 regexes; from release 8.00 there are no restrictions, but the bits are still 6638 defined (though never set). So there's no harm in leaving this code. */ 6639 6640 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) 6641 return PCRE_ERROR_BADPARTIAL; 6642 6643 /* If the expression has got more back references than the offsets supplied can 6644 hold, we get a temporary chunk of working store to use during the matching. 6645 Otherwise, we can use the vector supplied, rounding down its size to a multiple 6646 of 3. 
*/ 6647 6648 ocount = offsetcount - (offsetcount % 3); 6649 arg_offset_max = (2*ocount)/3; 6650 6651 if (re->top_backref > 0 && re->top_backref >= ocount/3) 6652 { 6653 ocount = re->top_backref * 3 + 3; 6654 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int)); 6655 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 6656 using_temporary_offsets = TRUE; 6657 DPRINTF(("Got memory to hold back references\n")); 6658 } 6659 else md->offset_vector = offsets; 6660 md->offset_end = ocount; 6661 md->offset_max = (2*ocount)/3; 6662 md->capture_last = 0; 6663 6664 /* Reset the working variable associated with each extraction. These should 6665 never be used unless previously set, but they get saved and restored, and so we 6666 initialize them to avoid reading uninitialized locations. Also, unset the 6667 offsets for the matched string. This is really just for tidiness with callouts, 6668 in case they inspect these fields. */ 6669 6670 if (md->offset_vector != NULL) 6671 { 6672 register int *iptr = md->offset_vector + ocount; 6673 register int *iend = iptr - re->top_bracket; 6674 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2; 6675 while (--iptr >= iend) *iptr = -1; 6676 md->offset_vector[0] = md->offset_vector[1] = -1; 6677 } 6678 6679 /* Set up the first character to match, if available. The first_char value is 6680 never set for an anchored regular expression, but the anchoring may be forced 6681 at run time, so we have to test for anchoring. The first char may be unset for 6682 an unanchored pattern, of course. If there's no first char and the pattern was 6683 studied, there may be a bitmap of possible first characters. 
*/ 6684 6685 if (!anchored) 6686 { 6687 if ((re->flags & PCRE_FIRSTSET) != 0) 6688 { 6689 has_first_char = TRUE; 6690 first_char = first_char2 = (pcre_uchar)(re->first_char); 6691 if ((re->flags & PCRE_FCH_CASELESS) != 0) 6692 { 6693 first_char2 = TABLE_GET(first_char, md->fcc, first_char); 6694 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6695 if (utf && first_char > 127) 6696 first_char2 = UCD_OTHERCASE(first_char); 6697 #endif 6698 } 6699 } 6700 else 6701 if (!startline && study != NULL && 6702 (study->flags & PCRE_STUDY_MAPPED) != 0) 6703 start_bits = study->start_bits; 6704 } 6705 6706 /* For anchored or unanchored matches, there may be a "last known required 6707 character" set. */ 6708 6709 if ((re->flags & PCRE_REQCHSET) != 0) 6710 { 6711 has_req_char = TRUE; 6712 req_char = req_char2 = (pcre_uchar)(re->req_char); 6713 if ((re->flags & PCRE_RCH_CASELESS) != 0) 6714 { 6715 req_char2 = TABLE_GET(req_char, md->fcc, req_char); 6716 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6717 if (utf && req_char > 127) 6718 req_char2 = UCD_OTHERCASE(req_char); 6719 #endif 6720 } 6721 } 6722 6723 6724 /* ==========================================================================*/ 6725 6726 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 6727 the loop runs just once. */ 6728 6729 for(;;) 6730 { 6731 PCRE_PUCHAR save_end_subject = end_subject; 6732 PCRE_PUCHAR new_start_match; 6733 6734 /* If firstline is TRUE, the start of the match is constrained to the first 6735 line of a multiline string. That is, the match must be before or at the first 6736 newline. Implement this by temporarily adjusting end_subject so that we stop 6737 scanning at a newline. If the match fails at the newline, later code breaks 6738 this loop. 
*/ 6739 6740 if (firstline) 6741 { 6742 PCRE_PUCHAR t = start_match; 6743 #ifdef SUPPORT_UTF 6744 if (utf) 6745 { 6746 while (t < md->end_subject && !IS_NEWLINE(t)) 6747 { 6748 t++; 6749 ACROSSCHAR(t < end_subject, *t, t++); 6750 } 6751 } 6752 else 6753 #endif 6754 while (t < md->end_subject && !IS_NEWLINE(t)) t++; 6755 end_subject = t; 6756 } 6757 6758 /* There are some optimizations that avoid running the match if a known 6759 starting point is not found, or if a known later character is not present. 6760 However, there is an option that disables these, for testing and for ensuring 6761 that all callouts do actually occur. The option can be set in the regex by 6762 (*NO_START_OPT) or passed in match-time options. */ 6763 6764 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) 6765 { 6766 /* Advance to a unique first char if there is one. */ 6767 6768 if (has_first_char) 6769 { 6770 pcre_uchar smc; 6771 6772 if (first_char != first_char2) 6773 while (start_match < end_subject && 6774 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2) 6775 start_match++; 6776 else 6777 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char) 6778 start_match++; 6779 } 6780 6781 /* Or to just after a linebreak for a multiline match */ 6782 6783 else if (startline) 6784 { 6785 if (start_match > md->start_subject + start_offset) 6786 { 6787 #ifdef SUPPORT_UTF 6788 if (utf) 6789 { 6790 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6791 { 6792 start_match++; 6793 ACROSSCHAR(start_match < end_subject, *start_match, 6794 start_match++); 6795 } 6796 } 6797 else 6798 #endif 6799 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6800 start_match++; 6801 6802 /* If we have just passed a CR and the newline option is ANY or ANYCRLF, 6803 and we are now at a LF, advance the match position by one more character. 
6804 */ 6805 6806 if (start_match[-1] == CHAR_CR && 6807 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 6808 start_match < end_subject && 6809 UCHAR21TEST(start_match) == CHAR_NL) 6810 start_match++; 6811 } 6812 } 6813 6814 /* Or to a non-unique first byte after study */ 6815 6816 else if (start_bits != NULL) 6817 { 6818 while (start_match < end_subject) 6819 { 6820 register pcre_uint32 c = UCHAR21TEST(start_match); 6821 #ifndef COMPILE_PCRE8 6822 if (c > 255) c = 255; 6823 #endif 6824 if ((start_bits[c/8] & (1 << (c&7))) != 0) break; 6825 start_match++; 6826 } 6827 } 6828 } /* Starting optimizations */ 6829 6830 /* Restore fudged end_subject */ 6831 6832 end_subject = save_end_subject; 6833 6834 /* The following two optimizations are disabled for partial matching or if 6835 disabling is explicitly requested. */ 6836 6837 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) 6838 { 6839 /* If the pattern was studied, a minimum subject length may be set. This is 6840 a lower bound; no actual string of that length may actually match the 6841 pattern. Although the value is, strictly, in characters, we treat it as 6842 bytes to avoid spending too much time in this optimization. */ 6843 6844 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 6845 (pcre_uint32)(end_subject - start_match) < study->minlength) 6846 { 6847 rc = MATCH_NOMATCH; 6848 break; 6849 } 6850 6851 /* If req_char is set, we know that that character must appear in the 6852 subject for the match to succeed. If the first character is set, req_char 6853 must be later in the subject; otherwise the test starts at the match point. 6854 This optimization can save a huge amount of backtracking in patterns with 6855 nested unlimited repeats that aren't going to match. Writing separate code 6856 for cased/caseless versions makes it go faster, as does using an 6857 autoincrement and backing off on a match. 
6858 6859 HOWEVER: when the subject string is very, very long, searching to its end 6860 can take a long time, and give bad performance on quite ordinary patterns. 6861 This showed up when somebody was matching something like /^\d+C/ on a 6862 32-megabyte string... so we don't do this when the string is sufficiently 6863 long. */ 6864 6865 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX) 6866 { 6867 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0); 6868 6869 /* We don't need to repeat the search if we haven't yet reached the 6870 place we found it at last time. */ 6871 6872 if (p > req_char_ptr) 6873 { 6874 if (req_char != req_char2) 6875 { 6876 while (p < end_subject) 6877 { 6878 register pcre_uint32 pp = UCHAR21INCTEST(p); 6879 if (pp == req_char || pp == req_char2) { p--; break; } 6880 } 6881 } 6882 else 6883 { 6884 while (p < end_subject) 6885 { 6886 if (UCHAR21INCTEST(p) == req_char) { p--; break; } 6887 } 6888 } 6889 6890 /* If we can't find the required character, break the matching loop, 6891 forcing a match failure. */ 6892 6893 if (p >= end_subject) 6894 { 6895 rc = MATCH_NOMATCH; 6896 break; 6897 } 6898 6899 /* If we have found the required character, save the point where we 6900 found it, so that we don't search again next time round the loop if 6901 the start hasn't passed this character yet. */ 6902 6903 req_char_ptr = p; 6904 } 6905 } 6906 } 6907 6908 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */ 6909 printf(">>>> Match against: "); 6910 pchars(start_match, end_subject - start_match, TRUE, md); 6911 printf("\n"); 6912 #endif 6913 6914 /* OK, we can now run the match. If "hitend" is set afterwards, remember the 6915 first starting point for which a partial match was found. 
*/ 6916 6917 md->start_match_ptr = start_match; 6918 md->start_used_ptr = start_match; 6919 md->match_call_count = 0; 6920 md->match_function_type = 0; 6921 md->end_offset_top = 0; 6922 md->skip_arg_count = 0; 6923 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); 6924 if (md->hitend && start_partial == NULL) 6925 { 6926 start_partial = md->start_used_ptr; 6927 match_partial = start_match; 6928 } 6929 6930 switch(rc) 6931 { 6932 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched 6933 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP 6934 entirely. The only way we can do that is to re-do the match at the same 6935 point, with a flag to force SKIP with an argument to be ignored. Just 6936 treating this case as NOMATCH does not work because it does not check other 6937 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ 6938 6939 case MATCH_SKIP_ARG: 6940 new_start_match = start_match; 6941 md->ignore_skip_arg = md->skip_arg_count; 6942 break; 6943 6944 /* SKIP passes back the next starting point explicitly, but if it is no 6945 greater than the match we have just done, treat it as NOMATCH. */ 6946 6947 case MATCH_SKIP: 6948 if (md->start_match_ptr > start_match) 6949 { 6950 new_start_match = md->start_match_ptr; 6951 break; 6952 } 6953 /* Fall through */ 6954 6955 /* NOMATCH and PRUNE advance by one character. THEN at this level acts 6956 exactly like PRUNE. Unset ignore SKIP-with-argument. */ 6957 6958 case MATCH_NOMATCH: 6959 case MATCH_PRUNE: 6960 case MATCH_THEN: 6961 md->ignore_skip_arg = 0; 6962 new_start_match = start_match + 1; 6963 #ifdef SUPPORT_UTF 6964 if (utf) 6965 ACROSSCHAR(new_start_match < end_subject, *new_start_match, 6966 new_start_match++); 6967 #endif 6968 break; 6969 6970 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. 
*/ 6971 6972 case MATCH_COMMIT: 6973 rc = MATCH_NOMATCH; 6974 goto ENDLOOP; 6975 6976 /* Any other return is either a match, or some kind of error. */ 6977 6978 default: 6979 goto ENDLOOP; 6980 } 6981 6982 /* Control reaches here for the various types of "no match at this point" 6983 result. Reset the code to MATCH_NOMATCH for subsequent checking. */ 6984 6985 rc = MATCH_NOMATCH; 6986 6987 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first 6988 newline in the subject (though it may continue over the newline). Therefore, 6989 if we have just failed to match, starting at a newline, do not continue. */ 6990 6991 if (firstline && IS_NEWLINE(start_match)) break; 6992 6993 /* Advance to new matching position */ 6994 6995 start_match = new_start_match; 6996 6997 /* Break the loop if the pattern is anchored or if we have passed the end of 6998 the subject. */ 6999 7000 if (anchored || start_match > end_subject) break; 7001 7002 /* If we have just passed a CR and we are now at a LF, and the pattern does 7003 not contain any explicit matches for \r or \n, and the newline option is CRLF 7004 or ANY or ANYCRLF, advance the match position by one more character. In 7005 normal matching start_match will aways be greater than the first position at 7006 this stage, but a failed *SKIP can cause a return at the same point, which is 7007 why the first test exists. 
*/ 7008 7009 if (start_match > (PCRE_PUCHAR)subject + start_offset && 7010 start_match[-1] == CHAR_CR && 7011 start_match < end_subject && 7012 *start_match == CHAR_NL && 7013 (re->flags & PCRE_HASCRORLF) == 0 && 7014 (md->nltype == NLTYPE_ANY || 7015 md->nltype == NLTYPE_ANYCRLF || 7016 md->nllen == 2)) 7017 start_match++; 7018 7019 md->mark = NULL; /* Reset for start of next match attempt */ 7020 } /* End of for(;;) "bumpalong" loop */ 7021 7022 /* ==========================================================================*/ 7023 7024 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping 7025 conditions is true: 7026 7027 (1) The pattern is anchored or the match was failed by (*COMMIT); 7028 7029 (2) We are past the end of the subject; 7030 7031 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because 7032 this option requests that a match occur at or before the first newline in 7033 the subject. 7034 7035 When we have a match and the offset vector is big enough to deal with any 7036 backreferences, captured substring offsets will already be set up. In the case 7037 where we had to get some local store to hold offsets for backreference 7038 processing, copy those that we can. In this case there need not be overflow if 7039 certain parts of the pattern were not used, even though there are more 7040 capturing parentheses than vector slots. 
*/ 7041 7042 ENDLOOP: 7043 7044 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) 7045 { 7046 if (using_temporary_offsets) 7047 { 7048 if (arg_offset_max >= 4) 7049 { 7050 memcpy(offsets + 2, md->offset_vector + 2, 7051 (arg_offset_max - 2) * sizeof(int)); 7052 DPRINTF(("Copied offsets from temporary memory\n")); 7053 } 7054 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT; 7055 DPRINTF(("Freeing temporary memory\n")); 7056 (PUBL(free))(md->offset_vector); 7057 } 7058 7059 /* Set the return code to the number of captured strings, or 0 if there were 7060 too many to fit into the vector. */ 7061 7062 rc = ((md->capture_last & OVFLBIT) != 0 && 7063 md->end_offset_top >= arg_offset_max)? 7064 0 : md->end_offset_top/2; 7065 7066 /* If there is space in the offset vector, set any unused pairs at the end of 7067 the pattern to -1 for backwards compatibility. It is documented that this 7068 happens. In earlier versions, the whole set of potential capturing offsets 7069 was set to -1 each time round the loop, but this is handled differently now. 7070 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only 7071 those at the end that need unsetting here. We can't just unset them all at 7072 the start of the whole thing because they may get set in one branch that is 7073 not the final matching branch. */ 7074 7075 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL) 7076 { 7077 register int *iptr, *iend; 7078 int resetcount = 2 + re->top_bracket * 2; 7079 if (resetcount > offsetcount) resetcount = offsetcount; 7080 iptr = offsets + md->end_offset_top; 7081 iend = offsets + resetcount; 7082 while (iptr < iend) *iptr++ = -1; 7083 } 7084 7085 /* If there is space, set up the whole thing as substring 0. The value of 7086 md->start_match_ptr might be modified if \K was encountered on the success 7087 matching path. 
*/ 7088 7089 if (offsetcount < 2) rc = 0; else 7090 { 7091 offsets[0] = (int)(md->start_match_ptr - md->start_subject); 7092 offsets[1] = (int)(md->end_match_ptr - md->start_subject); 7093 } 7094 7095 /* Return MARK data if requested */ 7096 7097 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7098 *(extra_data->mark) = (pcre_uchar *)md->mark; 7099 DPRINTF((">>>> returning %d\n", rc)); 7100 #ifdef NO_RECURSE 7101 release_match_heapframes(&frame_zero); 7102 #endif 7103 return rc; 7104 } 7105 7106 /* Control gets here if there has been an error, or if the overall match 7107 attempt has failed at all permitted starting positions. */ 7108 7109 if (using_temporary_offsets) 7110 { 7111 DPRINTF(("Freeing temporary memory\n")); 7112 (PUBL(free))(md->offset_vector); 7113 } 7114 7115 /* For anything other than nomatch or partial match, just return the code. */ 7116 7117 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) 7118 { 7119 DPRINTF((">>>> error: returning %d\n", rc)); 7120 #ifdef NO_RECURSE 7121 release_match_heapframes(&frame_zero); 7122 #endif 7123 return rc; 7124 } 7125 7126 /* Handle partial matches - disable any mark data */ 7127 7128 if (match_partial != NULL) 7129 { 7130 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); 7131 md->mark = NULL; 7132 if (offsetcount > 1) 7133 { 7134 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); 7135 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); 7136 if (offsetcount > 2) 7137 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject); 7138 } 7139 rc = PCRE_ERROR_PARTIAL; 7140 } 7141 7142 /* This is the classic nomatch case */ 7143 7144 else 7145 { 7146 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 7147 rc = PCRE_ERROR_NOMATCH; 7148 } 7149 7150 /* Return the MARK data if it has been requested. 
*/ 7151 7152 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7153 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark; 7154 #ifdef NO_RECURSE 7155 release_match_heapframes(&frame_zero); 7156 #endif 7157 return rc; 7158 } 7159 7160 /* End of pcre_exec.c */ 7161