1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* These three names are defined before pcre2_internal.h is included.
NOTE(review): presumably the internal header's newline/partial-match macros
expand in terms of them - confirm against pcre2_internal.h. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */

#include "pcre2_internal.h"

/* Masks for identifying the public options that are permitted at match time.
The second mask lists the options accepted when matching via JIT; compared
with the first it omits PCRE2_ANCHORED and PCRE2_NO_JIT. */

#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT)

#define PUBLIC_JIT_MATCH_OPTIONS \
   (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
    PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD)

/* The mb->capture_last field uses the lower 16 bits for the last captured
substring (which can never be greater than 65535) and a bit in the top half
to mean "capture vector overflowed". This odd way of doing things was
implemented when it was realized that preserving and restoring the overflow bit
whenever the last capture number was saved/restored made for a neater
interface, and doing it this way saved on (a) another variable, which would
have increased the stack frame size (a big NO-NO in PCRE) and (b) another
separate set of save/restore instructions. The following defines are used in
implementing this. */

#define CAPLMASK    0x0000ffff    /* The bits used for last_capture */
#define OVFLMASK    0xffff0000    /* The bits used for the overflow flag */
#define OVFLBIT     0x00010000    /* The bit that is set for overflow */

/* Bits for setting in mb->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. The two values are distinct single bits,
so they can be set and cleared independently. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE2_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
#define MATCH_ONCE         (-997)
/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. MATCH_BACKTRACK_MIN and
MATCH_BACKTRACK_MAX are the inclusive ends of that range: MIN is the most
negative value (COMMIT, -996) and MAX the least negative (THEN, -992). */
#define MATCH_COMMIT       (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: each has 11 entries, and the same index in both
describes one kind of repeat. NOTE(review): the scheme that maps an opcode to
an index is not visible in this part of the file - presumably an offset from
the first repeat opcode of a family; confirm at the use sites. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };

/* Maximum number of ovector elements that can be saved on the system stack
when processing OP_RECURSE in non-HEAP_MATCH_RECURSE mode. If the ovector is
bigger, malloc() is used. This value should be a multiple of 3, because the
ovector length is always a multiple of 3.
*/ 116 117 #define OP_RECURSE_STACK_SAVE_MAX 45 118 119 120 121 /************************************************* 122 * Match a back-reference * 123 *************************************************/ 124 125 /* This function is called only when it is known that the offset lies within 126 the offsets that have so far been used in the match. Note that in caseless 127 UTF-8 mode, the number of subject bytes matched may be different to the number 128 of reference bytes. (In theory this could also happen in UTF-16 mode, but it 129 seems unlikely.) 130 131 Arguments: 132 offset index into the offset vector 133 offset_top top of the used offset vector 134 eptr pointer into the subject 135 mb points to match block 136 caseless TRUE if caseless 137 lengthptr pointer for returning the length matched 138 139 Returns: = 0 sucessful match; number of code units matched is set 140 < 0 no match 141 > 0 partial match 142 */ 143 144 static int 145 match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr, 146 match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr) 147 { 148 #if defined SUPPORT_UNICODE 149 BOOL utf = (mb->poptions & PCRE2_UTF) != 0; 150 #endif 151 152 register PCRE2_SPTR p; 153 PCRE2_SIZE length; 154 PCRE2_SPTR eptr_start = eptr; 155 156 /* Deal with an unset group. The default is no match, but there is an option to 157 match an empty string. */ 158 159 if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET) 160 { 161 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) 162 { 163 *lengthptr = 0; 164 return 0; /* Match */ 165 } 166 else return -1; /* No match */ 167 } 168 169 /* Separate the caseless and UTF cases for speed. */ 170 171 p = mb->start_subject + mb->ovector[offset]; 172 length = mb->ovector[offset+1] - mb->ovector[offset]; 173 174 if (caseless) 175 { 176 #if defined SUPPORT_UNICODE 177 if (utf) 178 { 179 /* Match characters up to the end of the reference. 
NOTE: the number of 180 code units matched may differ, because in UTF-8 there are some characters 181 whose upper and lower case versions code have different numbers of bytes. 182 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 183 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a 184 sequence of two of the latter. It is important, therefore, to check the 185 length along the reference, not along the subject (earlier code did this 186 wrong). */ 187 188 PCRE2_SPTR endptr = p + length; 189 while (p < endptr) 190 { 191 uint32_t c, d; 192 const ucd_record *ur; 193 if (eptr >= mb->end_subject) return 1; /* Partial match */ 194 GETCHARINC(c, eptr); 195 GETCHARINC(d, p); 196 ur = GET_UCD(d); 197 if (c != d && c != (uint32_t)((int)d + ur->other_case)) 198 { 199 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; 200 for (;;) 201 { 202 if (c < *pp) return -1; /* No match */ 203 if (c == *pp++) break; 204 } 205 } 206 } 207 } 208 else 209 #endif 210 211 /* Not in UTF mode */ 212 213 { 214 for (; length > 0; length--) 215 { 216 uint32_t cc, cp; 217 if (eptr >= mb->end_subject) return 1; /* Partial match */ 218 cc = UCHAR21TEST(eptr); 219 cp = UCHAR21TEST(p); 220 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) 221 return -1; /* No match */ 222 p++; 223 eptr++; 224 } 225 } 226 } 227 228 /* In the caseful case, we can just compare the code units, whether or not we 229 are in UTF mode. 
*/ 230 231 else 232 { 233 for (; length > 0; length--) 234 { 235 if (eptr >= mb->end_subject) return 1; /* Partial match */ 236 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /*No match */ 237 } 238 } 239 240 *lengthptr = eptr - eptr_start; 241 return 0; /* Match */ 242 } 243 244 245 246 /*************************************************************************** 247 **************************************************************************** 248 RECURSION IN THE match() FUNCTION 249 250 The match() function is highly recursive, though not every recursive call 251 increases the recursion depth. Nevertheless, some regular expressions can cause 252 it to recurse to a great depth. I was writing for Unix, so I just let it call 253 itself recursively. This uses the stack for saving everything that has to be 254 saved for a recursive call. On Unix, the stack can be large, and this works 255 fine. 256 257 It turns out that on some non-Unix-like systems there are problems with 258 programs that use a lot of stack. (This despite the fact that every last chip 259 has oodles of memory these days, and techniques for extending the stack have 260 been known for decades.) So.... 261 262 There is a fudge, triggered by defining HEAP_MATCH_RECURSE, which avoids 263 recursive calls by keeping local variables that need to be preserved in blocks 264 of memory on the heap instead instead of on the stack. Macros are used to 265 achieve this so that the actual code doesn't look very different to what it 266 always used to. 267 268 The original heap-recursive code used longjmp(). However, it seems that this 269 can be very slow on some operating systems. Following a suggestion from Stan 270 Switzer, the use of longjmp() has been abolished, at the cost of having to 271 provide a unique number for each call to RMATCH. There is no way of generating 272 a sequence of numbers at compile time in C. I have given them names, to make 273 them stand out more clearly. 
Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
tests. Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/

/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync. The values run consecutively from RM1 (= 1) to
RM68. In the heap-based RMATCH each one is stored in the frame's Xwhere field
and is also pasted into a label name (L_RMnn), so every RMATCH call site needs
its own distinct number. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66, RM67, RM68 };

/* These versions of the macros use the stack, as normal. Note that the "rw"
argument of RMATCH isn't actually used in this definition. RMATCH is a real
recursive call of match() whose result is left in the local variable rrc; the
current mstart is passed through and the depth is rdepth+1. RRETURN is a plain
return statement. */

#ifndef HEAP_MATCH_RECURSE
#define REGISTER register
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#else

/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the mb
argument of match(), which never changes.

RMATCH takes the frame that follows the current one on the Xnextframe chain,
allocating and linking a new one through mb->stack_memctl.malloc only when
there is none; an allocation failure is reported via
RRETURN(PCRE2_ERROR_NOMEMORY). It then records rw in the current frame's Xwhere,
fills the new frame with the "arguments" (ra, rb, the current mstart, rc, re)
and a depth one greater than the current frame's, makes the new frame current,
and jumps to HEAP_RECURSE. The label L_<rw> placed at the end of the macro marks
the point at which execution is to continue afterwards.

RRETURN makes the previous frame current. If there is one, the value is put in
rrc and control goes to HEAP_RETURN (NOTE(review): HEAP_RETURN is outside this
part of the file - presumably it dispatches on Xwhere to the matching L_<rw>
label; confirm there). If there is no previous frame, this is the outermost
level and the value is returned from the function.

Both macros expand to a brace-enclosed block, not a single statement. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(mb->stack_memctl.malloc)\
      (sizeof(heapframe), mb->stack_memctl.memory_data);\
    if (newframe == NULL) RRETURN(PCRE2_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  goto HEAP_RECURSE;\
  L_##rw:;\
  }

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. Arrange it
so as to minimize the number of holes. Each X-prefixed field holds the value of
the correspondingly named argument or local variable of match(). The fields are
grouped by type: pointers first, then PCRE2_SIZE values, 32-bit unsigned
values, ints, BOOLs, and finally the embedded structures and the character
array. */

typedef struct heapframe {
  /* Links of the doubly-linked chain of frames: Xprevframe is followed by
  RRETURN, Xnextframe by RMATCH. */

  struct heapframe *Xprevframe;
  struct heapframe *Xnextframe;

  /* Pointer-valued variables; Xeptr, Xecode and Xmstart are among the
  "arguments" that RMATCH fills in. */

#ifdef SUPPORT_UNICODE
  PCRE2_SPTR Xcharptr;
#endif
  PCRE2_SPTR Xeptr;
  PCRE2_SPTR Xecode;
  PCRE2_SPTR Xmstart;
  PCRE2_SPTR Xcallpat;
  PCRE2_SPTR Xdata;
  PCRE2_SPTR Xnext_ecode;
  PCRE2_SPTR Xpp;
  PCRE2_SPTR Xprev;
  PCRE2_SPTR Xsaved_eptr;

  eptrblock *Xeptrb;            /* Filled in by RMATCH */

  /* Lengths and ovector offsets; Xoffset_top is filled in by RMATCH. */

  PCRE2_SIZE Xlength;
  PCRE2_SIZE Xoffset;
  PCRE2_SIZE Xoffset_top;
  PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3;

  /* 32-bit unsigned values; Xrdepth is set by RMATCH to one more than the
  depth of the calling frame. */

  uint32_t Xfc;
  uint32_t Xnumber;
  uint32_t Xrdepth;
  uint32_t Xop;
  uint32_t Xsave_capture_last;

#ifdef SUPPORT_UNICODE
  uint32_t Xprop_value;
  int Xprop_type;
  int Xprop_fail_result;
  int Xoclength;
#endif

  int Xcodelink;
  int Xctype;
  int Xfi;
  int Xmax;
  int Xmin;
  int Xwhere;   /* Where to jump back to */

  BOOL Xcondition;
  BOOL Xcur_is_word;
  BOOL Xprev_is_word;

  /* Whole structures kept in the frame rather than pointed to. */

  eptrblock Xnewptrb;
  recursion_info Xnew_recursive;

#ifdef SUPPORT_UNICODE
  PCRE2_UCHAR Xocchars[6];
#endif
} heapframe;

#endif


/***************************************************************************
***************************************************************************/


/* When HEAP_MATCH_RECURSE is not defined, the match() function implements
backtrack points by calling itself recursively in all but one case. The one
special case is when processing OP_RECURSE, which specifies recursion in the
pattern. The entire ovector must be saved and restored while processing
OP_RECURSE. If the ovector is small enough, instead of calling match()
directly, op_recurse_ovecsave() is called. This function uses the system stack
to save the ovector while calling match() to process the pattern recursion.
*/ 421 422 #ifndef HEAP_MATCH_RECURSE 423 424 /* We need a prototype for match() because it is mutually recursive with 425 op_recurse_ovecsave(). */ 426 427 static int 428 match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart, 429 PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth); 430 431 432 /************************************************* 433 * Process OP_RECURSE, stacking ovector * 434 *************************************************/ 435 436 /* When this function is called, mb->recursive has already been updated to 437 point to a new recursion data block, and all its fields other than ovec_save 438 have been set. 439 440 This function exists so that the local vector variable ovecsave is no longer 441 defined in the match() function, as it was in PCRE1. It is used only when there 442 is recursion in the pattern, so it wastes a lot of stack to have it defined for 443 every call of match(). We now use this function as an indirect way of calling 444 match() only in the case when ovecsave is needed. (David Wheeler used to say 445 "All problems in computer science can be solved by another level of 446 indirection.") 447 448 HOWEVER: when this file is compiled by gcc in an optimizing mode, because this 449 function is called only once, and only from within match(), gcc will "inline" 450 it - that is, move it inside match() - and this completely negates its reason 451 for existence. Therefore, we mark it as non-inline when gcc is in use. 
452 453 Arguments: 454 eptr pointer to current character in subject 455 callpat the recursion point in the pattern 456 mstart pointer to the current match start position (can be modified 457 by encountering \K) 458 offset_top current top pointer (highest ovector offset used + 1) 459 mb pointer to "static" info block for the match 460 eptrb pointer to chain of blocks containing eptr at start of 461 brackets - for testing for empty matches 462 rdepth the recursion depth 463 464 Returns: a match() return code 465 */ 466 467 static int 468 #if defined(__GNUC__) && !defined(__INTEL_COMPILER) 469 __attribute__ ((noinline)) 470 #endif 471 op_recurse_ovecsave(REGISTER PCRE2_SPTR eptr, PCRE2_SPTR callpat, 472 PCRE2_SPTR mstart, PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, 473 uint32_t rdepth) 474 { 475 register int rrc; 476 BOOL cbegroup = *callpat >= OP_SBRA; 477 recursion_info *new_recursive = mb->recursive; 478 PCRE2_SIZE ovecsave[OP_RECURSE_STACK_SAVE_MAX]; 479 480 /* Save the ovector */ 481 482 new_recursive->ovec_save = ovecsave; 483 memcpy(ovecsave, mb->ovector, mb->offset_end * sizeof(PCRE2_SIZE)); 484 485 /* Do the recursion. After processing each alternative, restore the ovector 486 data and the last captured value. */ 487 488 do 489 { 490 if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP; 491 rrc = match(eptr, callpat + PRIV(OP_lengths)[*callpat], mstart, offset_top, 492 mb, eptrb, rdepth + 1); 493 memcpy(mb->ovector, new_recursive->ovec_save, 494 mb->offset_end * sizeof(PCRE2_SIZE)); 495 mb->capture_last = new_recursive->saved_capture_last; 496 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) return rrc; 497 498 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a 499 recursion; they cause a NOMATCH for the entire recursion. These codes 500 are defined in a range that can be tested for. 
*/ 501 502 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) 503 return MATCH_NOMATCH; 504 505 /* Any return code other than NOMATCH is an error. Otherwise, advance to the 506 next alternative or to the end of the recursing subpattern. If there were 507 nested recursions, mb->recursive might be changed, so reset it before 508 looping. */ 509 510 if (rrc != MATCH_NOMATCH) return rrc; 511 mb->recursive = new_recursive; 512 callpat += GET(callpat, 1); 513 } 514 while (*callpat == OP_ALT); /* Loop for the alternatives */ 515 516 /* None of the alternatives matched. */ 517 518 return MATCH_NOMATCH; 519 } 520 #endif /* HEAP_MATCH_RECURSE */ 521 522 523 524 /************************************************* 525 * Match from current position * 526 *************************************************/ 527 528 /* This function is called recursively in many circumstances. Whenever it 529 returns a negative (error) response, the outer incarnation must also return the 530 same response. */ 531 532 /* These macros pack up tests that are used for partial matching, and which 533 appear several times in the code. We set the "hit end" flag if the pointer is 534 at the end of the subject and also past the earliest inspected character (i.e. 535 something has been matched, even if not part of the actual matched string). For 536 hard partial matching, we then return immediately. The second one is used when 537 we already know we are past the end of the subject. */ 538 539 #define CHECK_PARTIAL()\ 540 if (mb->partial != 0 && eptr >= mb->end_subject && \ 541 eptr > mb->start_used_ptr) \ 542 { \ 543 mb->hitend = TRUE; \ 544 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); \ 545 } 546 547 #define SCHECK_PARTIAL()\ 548 if (mb->partial != 0 && eptr > mb->start_used_ptr) \ 549 { \ 550 mb->hitend = TRUE; \ 551 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); \ 552 } 553 554 555 /* Performance note: It might be tempting to extract commonly used fields from 556 the mb structure (e.g. 
utf, end_subject) into individual variables to improve 557 performance. Tests using gcc on a SPARC disproved this; in the first case, it 558 made performance worse. 559 560 Arguments: 561 eptr pointer to current character in subject 562 ecode pointer to current position in compiled code 563 mstart pointer to the current match start position (can be modified 564 by encountering \K) 565 offset_top current top pointer (highest ovector offset used + 1) 566 mb pointer to "static" info block for the match 567 eptrb pointer to chain of blocks containing eptr at start of 568 brackets - for testing for empty matches 569 rdepth the recursion depth 570 571 Returns: MATCH_MATCH if matched ) these values are >= 0 572 MATCH_NOMATCH if failed to match ) 573 a negative MATCH_xxx value for PRUNE, SKIP, etc 574 a negative PCRE2_ERROR_xxx value if aborted by an error condition 575 (e.g. stopped by repeated call or recursion limit) 576 */ 577 578 static int 579 match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart, 580 PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth) 581 { 582 /* These variables do not need to be preserved over recursion in this function, 583 so they can be ordinary variables in all cases. Mark some of them with 584 "register" because they are used a lot in loops. */ 585 586 register int rrc; /* Returns from recursive calls */ 587 register int i; /* Used for loops not involving calls to RMATCH() */ 588 register uint32_t c; /* Character values not kept over RMATCH() calls */ 589 register BOOL utf; /* Local copy of UTF flag for speed */ 590 591 BOOL minimize, possessive; /* Quantifier options */ 592 BOOL caseless; 593 int condcode; 594 595 /* When recursion is not being used, all "local" variables that have to be 596 preserved over calls to RMATCH() are part of a "frame". We set up the top-level 597 frame on the stack here; subsequent instantiations are obtained from the heap 598 whenever RMATCH() does a "recursion". 
See the macro definitions above. Putting 599 the top-level on the stack rather than malloc-ing them all gives a performance 600 boost in many cases where there is not much "recursion". */ 601 602 #ifdef HEAP_MATCH_RECURSE 603 heapframe *frame = (heapframe *)mb->match_frames_base; 604 605 /* Copy in the original argument variables */ 606 607 frame->Xeptr = eptr; 608 frame->Xecode = ecode; 609 frame->Xmstart = mstart; 610 frame->Xoffset_top = offset_top; 611 frame->Xeptrb = eptrb; 612 frame->Xrdepth = rdepth; 613 614 /* This is where control jumps back to to effect "recursion" */ 615 616 HEAP_RECURSE: 617 618 /* Macros make the argument variables come from the current frame */ 619 620 #define eptr frame->Xeptr 621 #define ecode frame->Xecode 622 #define mstart frame->Xmstart 623 #define offset_top frame->Xoffset_top 624 #define eptrb frame->Xeptrb 625 #define rdepth frame->Xrdepth 626 627 /* Ditto for the local variables */ 628 629 #ifdef SUPPORT_UNICODE 630 #define charptr frame->Xcharptr 631 #define prop_value frame->Xprop_value 632 #define prop_type frame->Xprop_type 633 #define prop_fail_result frame->Xprop_fail_result 634 #define oclength frame->Xoclength 635 #define occhars frame->Xocchars 636 #endif 637 638 639 #define callpat frame->Xcallpat 640 #define codelink frame->Xcodelink 641 #define data frame->Xdata 642 #define next_ecode frame->Xnext_ecode 643 #define pp frame->Xpp 644 #define prev frame->Xprev 645 #define saved_eptr frame->Xsaved_eptr 646 647 #define new_recursive frame->Xnew_recursive 648 649 #define ctype frame->Xctype 650 #define fc frame->Xfc 651 #define fi frame->Xfi 652 #define length frame->Xlength 653 #define max frame->Xmax 654 #define min frame->Xmin 655 #define number frame->Xnumber 656 #define offset frame->Xoffset 657 #define op frame->Xop 658 #define save_capture_last frame->Xsave_capture_last 659 #define save_offset1 frame->Xsave_offset1 660 #define save_offset2 frame->Xsave_offset2 661 #define save_offset3 frame->Xsave_offset3 662 
663 #define condition frame->Xcondition 664 #define cur_is_word frame->Xcur_is_word 665 #define prev_is_word frame->Xprev_is_word 666 667 #define newptrb frame->Xnewptrb 668 669 /* When normal stack-based recursion is being used for match(), local variables 670 are allocated on the stack and get preserved during recursion in the usual way. 671 In this environment, fi and i, and fc and c, can be the same variables. */ 672 673 #else /* HEAP_MATCH_RECURSE not defined */ 674 #define fi i 675 #define fc c 676 677 /* Many of the following variables are used only in small blocks of the code. 678 My normal style of coding would have declared them within each of those blocks. 679 However, in order to accommodate the version of this code that uses an external 680 "stack" implemented on the heap, it is easier to declare them all here, so the 681 declarations can be cut out in a block. The only declarations within blocks 682 below are for variables that do not have to be preserved over a recursive call 683 to RMATCH(). 
*/ 684 685 #ifdef SUPPORT_UNICODE 686 PCRE2_SPTR charptr; 687 #endif 688 PCRE2_SPTR callpat; 689 PCRE2_SPTR data; 690 PCRE2_SPTR next_ecode; 691 PCRE2_SPTR pp; 692 PCRE2_SPTR prev; 693 PCRE2_SPTR saved_eptr; 694 695 PCRE2_SIZE length; 696 PCRE2_SIZE offset; 697 PCRE2_SIZE save_offset1, save_offset2, save_offset3; 698 699 uint32_t number; 700 uint32_t op; 701 uint32_t save_capture_last; 702 703 #ifdef SUPPORT_UNICODE 704 uint32_t prop_value; 705 int prop_type; 706 int prop_fail_result; 707 int oclength; 708 PCRE2_UCHAR occhars[6]; 709 #endif 710 711 int codelink; 712 int ctype; 713 int max; 714 int min; 715 716 BOOL condition; 717 BOOL cur_is_word; 718 BOOL prev_is_word; 719 720 eptrblock newptrb; 721 recursion_info new_recursive; 722 #endif /* HEAP_MATCH_RECURSE not defined */ 723 724 /* To save space on the stack and in the heap frame, I have doubled up on some 725 of the local variables that are used only in localised parts of the code, but 726 still need to be preserved over recursive calls of match(). These macros define 727 the alternative names that are used. */ 728 729 #define allow_zero cur_is_word 730 #define cbegroup condition 731 #define code_offset codelink 732 #define condassert condition 733 #define foc number 734 #define matched_once prev_is_word 735 #define save_mark data 736 737 /* These statements are here to stop the compiler complaining about unitialized 738 variables. */ 739 740 #ifdef SUPPORT_UNICODE 741 prop_value = 0; 742 prop_fail_result = 0; 743 #endif 744 745 746 /* This label is used for tail recursion, which is used in a few cases even 747 when HEAP_MATCH_RECURSE is not defined, in order to reduce the amount of stack 748 that is used. Thanks to Ian Taylor for noticing this possibility and sending 749 the original patch. */ 750 751 TAIL_RECURSE: 752 753 /* OK, now we can get on with the real code of the function. Recursive calls 754 are specified by the macro RMATCH and RRETURN is used to return. 
When 755 HEAP_MATCH_RECURSE is *not* defined, these just turn into a recursive call to 756 match() and a "return", respectively. However, RMATCH isn't like a function 757 call because it's quite a complicated macro. It has to be used in one 758 particular way. This shouldn't, however, impact performance when true recursion 759 is being used. */ 760 761 #ifdef SUPPORT_UNICODE 762 utf = (mb->poptions & PCRE2_UTF) != 0; 763 #else 764 utf = FALSE; 765 #endif 766 767 /* First check that we haven't called match() too many times, or that we 768 haven't exceeded the recursive call limit. */ 769 770 if (mb->match_call_count++ >= mb->match_limit) RRETURN(PCRE2_ERROR_MATCHLIMIT); 771 if (rdepth >= mb->match_limit_recursion) RRETURN(PCRE2_ERROR_RECURSIONLIMIT); 772 773 /* At the start of a group with an unlimited repeat that may match an empty 774 string, the variable mb->match_function_type contains the MATCH_CBEGROUP bit. 775 It is done this way to save having to use another function argument, which 776 would take up space on the stack. See also MATCH_CONDASSERT below. 777 778 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of 779 such remembered pointers, to be checked when we hit the closing ket, in order 780 to break infinite loops that match no characters. When match() is called in 781 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must 782 NOT be used with tail recursion, because the memory block that is used is on 783 the stack, so a new one may be required for each match(). */ 784 785 if ((mb->match_function_type & MATCH_CBEGROUP) != 0) 786 { 787 newptrb.epb_saved_eptr = eptr; 788 newptrb.epb_prev = eptrb; 789 eptrb = &newptrb; 790 mb->match_function_type &= ~MATCH_CBEGROUP; 791 } 792 793 /* Now, at last, we can start processing the opcodes. 
*/ 794 795 for (;;) 796 { 797 minimize = possessive = FALSE; 798 op = *ecode; 799 800 switch(op) 801 { 802 case OP_MARK: 803 mb->nomatch_mark = ecode + 2; 804 mb->mark = NULL; /* In case previously set by assertion */ 805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb, 806 eptrb, RM55); 807 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 808 mb->mark == NULL) mb->mark = ecode + 2; 809 810 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an 811 argument, and we must check whether that argument matches this MARK's 812 argument. It is passed back in mb->start_match_ptr (an overloading of that 813 variable). If it does match, we reset that variable to the current subject 814 position and return MATCH_SKIP. Otherwise, pass back the return code 815 unaltered. */ 816 817 else if (rrc == MATCH_SKIP_ARG && 818 PRIV(strcmp)(ecode + 2, mb->start_match_ptr) == 0) 819 { 820 mb->start_match_ptr = eptr; 821 RRETURN(MATCH_SKIP); 822 } 823 RRETURN(rrc); 824 825 case OP_FAIL: 826 RRETURN(MATCH_NOMATCH); 827 828 case OP_COMMIT: 829 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, 830 eptrb, RM52); 831 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 832 RRETURN(MATCH_COMMIT); 833 834 case OP_PRUNE: 835 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, 836 eptrb, RM51); 837 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 838 RRETURN(MATCH_PRUNE); 839 840 case OP_PRUNE_ARG: 841 mb->nomatch_mark = ecode + 2; 842 mb->mark = NULL; /* In case previously set by assertion */ 843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb, 844 eptrb, RM56); 845 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 846 mb->mark == NULL) mb->mark = ecode + 2; 847 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 848 RRETURN(MATCH_PRUNE); 849 850 case OP_SKIP: 851 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, 852 eptrb, RM53); 853 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 854 mb->start_match_ptr = eptr; /* Pass back 
current position */ 855 RRETURN(MATCH_SKIP); 856 857 /* Note that, for Perl compatibility, SKIP with an argument does NOT set 858 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was 859 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG 860 that failed and any that precede it (either they also failed, or were not 861 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a 862 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg 863 set to the count of the one that failed. */ 864 865 case OP_SKIP_ARG: 866 mb->skip_arg_count++; 867 if (mb->skip_arg_count <= mb->ignore_skip_arg) 868 { 869 ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; 870 break; 871 } 872 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb, 873 eptrb, RM57); 874 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 875 876 /* Pass back the current skip name by overloading mb->start_match_ptr and 877 returning the special MATCH_SKIP_ARG return code. This will either be 878 caught by a matching MARK, or get to the top, where it causes a rematch 879 with mb->ignore_skip_arg set to the value of mb->skip_arg_count. */ 880 881 mb->start_match_ptr = ecode + 2; 882 RRETURN(MATCH_SKIP_ARG); 883 884 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that 885 the branch in which it occurs can be determined. Overload the start of 886 match pointer to do this. 
*/ 887 888 case OP_THEN: 889 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, 890 eptrb, RM54); 891 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 892 mb->start_match_ptr = ecode; 893 RRETURN(MATCH_THEN); 894 895 case OP_THEN_ARG: 896 mb->nomatch_mark = ecode + 2; 897 mb->mark = NULL; /* In case previously set by assertion */ 898 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, 899 mb, eptrb, RM58); 900 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 901 mb->mark == NULL) mb->mark = ecode + 2; 902 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 903 mb->start_match_ptr = ecode; 904 RRETURN(MATCH_THEN); 905 906 /* Handle an atomic group that does not contain any capturing parentheses. 907 This can be handled like an assertion. Prior to 8.13, all atomic groups 908 were handled this way. In 8.13, the code was changed as below for ONCE, so 909 that backups pass through the group and thereby reset captured values. 910 However, this uses a lot more stack, so in 8.20, atomic groups that do not 911 contain any captures generate OP_ONCE_NC, which can be handled in the old, 912 less stack intensive way. 913 914 Check the alternative branches in turn - the matching won't pass the KET 915 for this kind of subpattern. If any one branch matches, we carry on as at 916 the end of a normal bracket, leaving the subject pointer, but resetting 917 the start-of-match value in case it was changed by \K. 
*/ 918 919 case OP_ONCE_NC: 920 prev = ecode; 921 saved_eptr = eptr; 922 save_mark = mb->mark; 923 do 924 { 925 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM64); 926 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ 927 { 928 mstart = mb->start_match_ptr; 929 break; 930 } 931 if (rrc == MATCH_THEN) 932 { 933 next_ecode = ecode + GET(ecode,1); 934 if (mb->start_match_ptr < next_ecode && 935 (*ecode == OP_ALT || *next_ecode == OP_ALT)) 936 rrc = MATCH_NOMATCH; 937 } 938 939 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 940 ecode += GET(ecode,1); 941 mb->mark = save_mark; 942 } 943 while (*ecode == OP_ALT); 944 945 /* If hit the end of the group (which could be repeated), fail */ 946 947 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 948 949 /* Continue as from after the group, updating the offsets high water 950 mark, since extracts may have been taken. */ 951 952 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 953 954 offset_top = mb->end_offset_top; 955 eptr = mb->end_match_ptr; 956 957 /* For a non-repeating ket, just continue at this level. This also 958 happens for a repeating ket if no characters were matched in the group. 959 This is the forcible breaking of infinite loops as implemented in Perl 960 5.005. */ 961 962 if (*ecode == OP_KET || eptr == saved_eptr) 963 { 964 ecode += 1+LINK_SIZE; 965 break; 966 } 967 968 /* The repeating kets try the rest of the pattern or restart from the 969 preceding bracket, in the appropriate order. The second "call" of match() 970 uses tail recursion, to avoid using another stack frame. 
*/ 971 972 if (*ecode == OP_KETRMIN) 973 { 974 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM65); 975 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 976 ecode = prev; 977 goto TAIL_RECURSE; 978 } 979 else /* OP_KETRMAX */ 980 { 981 RMATCH(eptr, prev, offset_top, mb, eptrb, RM66); 982 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 983 ecode += 1 + LINK_SIZE; 984 goto TAIL_RECURSE; 985 } 986 /* Control never gets here */ 987 988 /* Handle a capturing bracket, other than those that are possessive with an 989 unlimited repeat. If there is space in the offset vector, save the current 990 subject position in the working slot at the top of the vector. We mustn't 991 change the current values of the data slot, because they may be set from a 992 previous iteration of this group, and be referred to by a reference inside 993 the group. A failure to match might occur after the group has succeeded, 994 if something later on doesn't match. For this reason, we need to restore 995 the working value and also the values of the final offsets, in case they 996 were set by a previous iteration of the same bracket. 997 998 If there isn't enough space in the offset vector, treat this as if it were 999 a non-capturing bracket. Don't worry about setting the flag for the error 1000 case here; that is handled in the code for KET. 
*/ 1001 1002 case OP_CBRA: 1003 case OP_SCBRA: 1004 number = GET2(ecode, 1+LINK_SIZE); 1005 offset = number << 1; 1006 1007 if (offset < mb->offset_max) 1008 { 1009 save_offset1 = mb->ovector[offset]; 1010 save_offset2 = mb->ovector[offset+1]; 1011 save_offset3 = mb->ovector[mb->offset_end - number]; 1012 save_capture_last = mb->capture_last; 1013 save_mark = mb->mark; 1014 1015 mb->ovector[mb->offset_end - number] = eptr - mb->start_subject; 1016 1017 for (;;) 1018 { 1019 if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP; 1020 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, 1021 eptrb, RM1); 1022 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ 1023 1024 /* If we backed up to a THEN, check whether it is within the current 1025 branch by comparing the address of the THEN that is passed back with 1026 the end of the branch. If it is within the current branch, and the 1027 branch is one of two or more alternatives (it either starts or ends 1028 with OP_ALT), we have reached the limit of THEN's action, so convert 1029 the return code to NOMATCH, which will cause normal backtracking to 1030 happen from now on. Otherwise, THEN is passed back to an outer 1031 alternative. This implements Perl's treatment of parenthesized groups, 1032 where a group not containing | does not affect the current alternative, 1033 that is, (X) is NOT the same as (X|(*F)). */ 1034 1035 if (rrc == MATCH_THEN) 1036 { 1037 next_ecode = ecode + GET(ecode,1); 1038 if (mb->start_match_ptr < next_ecode && 1039 (*ecode == OP_ALT || *next_ecode == OP_ALT)) 1040 rrc = MATCH_NOMATCH; 1041 } 1042 1043 /* Anything other than NOMATCH is passed back. 
*/ 1044 1045 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1046 mb->capture_last = save_capture_last; 1047 ecode += GET(ecode, 1); 1048 mb->mark = save_mark; 1049 if (*ecode != OP_ALT) break; 1050 } 1051 1052 mb->ovector[offset] = save_offset1; 1053 mb->ovector[offset+1] = save_offset2; 1054 mb->ovector[mb->offset_end - number] = save_offset3; 1055 1056 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */ 1057 1058 RRETURN(rrc); 1059 } 1060 1061 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1062 as a non-capturing bracket. */ 1063 1064 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1065 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1066 1067 /* Non-capturing or atomic group, except for possessive with unlimited 1068 repeat and ONCE group with no captures. Loop for all the alternatives. 1069 1070 When we get to the final alternative within the brackets, we used to return 1071 the result of a recursive call to match() whatever happened so it was 1072 possible to reduce stack usage by turning this into a tail recursion, 1073 except in the case of a possibly empty group. However, now that there is 1074 the possibility of (*THEN) occurring in the final alternative, this 1075 optimization is no longer always possible. 1076 1077 We can optimize if we know there are no (*THEN)s in the pattern; at present 1078 this is the best that can be done. 1079 1080 MATCH_ONCE is returned when the end of an atomic group is successfully 1081 reached, but subsequent matching fails. It passes back up the tree (causing 1082 captured values to be reset) until the original atomic group level is 1083 reached. This is tested by comparing mb->once_target with the start of the 1084 group. At this point, the return is converted into MATCH_NOMATCH so that 1085 previous backup points can be taken.
*/ 1086 1087 case OP_ONCE: 1088 case OP_BRA: 1089 case OP_SBRA: 1090 1091 for (;;) 1092 { 1093 if (op >= OP_SBRA || op == OP_ONCE) 1094 mb->match_function_type |= MATCH_CBEGROUP; 1095 1096 /* If this is not a possibly empty group, and there are no (*THEN)s in 1097 the pattern, and this is the final alternative, optimize as described 1098 above. */ 1099 1100 else if (!mb->hasthen && ecode[GET(ecode, 1)] != OP_ALT) 1101 { 1102 ecode += PRIV(OP_lengths)[*ecode]; 1103 goto TAIL_RECURSE; 1104 } 1105 1106 /* In all other cases, we have to make another call to match(). */ 1107 1108 save_mark = mb->mark; 1109 save_capture_last = mb->capture_last; 1110 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, eptrb, 1111 RM2); 1112 1113 /* See comment in the code for capturing groups above about handling 1114 THEN. */ 1115 1116 if (rrc == MATCH_THEN) 1117 { 1118 next_ecode = ecode + GET(ecode,1); 1119 if (mb->start_match_ptr < next_ecode && 1120 (*ecode == OP_ALT || *next_ecode == OP_ALT)) 1121 rrc = MATCH_NOMATCH; 1122 } 1123 1124 if (rrc != MATCH_NOMATCH) 1125 { 1126 if (rrc == MATCH_ONCE) 1127 { 1128 PCRE2_SPTR scode = ecode; 1129 if (*scode != OP_ONCE) /* If not at start, find it */ 1130 { 1131 while (*scode == OP_ALT) scode += GET(scode, 1); 1132 scode -= GET(scode, 1); 1133 } 1134 if (mb->once_target == scode) rrc = MATCH_NOMATCH; 1135 } 1136 RRETURN(rrc); 1137 } 1138 ecode += GET(ecode, 1); 1139 mb->mark = save_mark; 1140 if (*ecode != OP_ALT) break; 1141 mb->capture_last = save_capture_last; 1142 } 1143 1144 RRETURN(MATCH_NOMATCH); 1145 1146 /* Handle possessive capturing brackets with an unlimited repeat. We come 1147 here from BRAZERO with allow_zero set TRUE. The ovector values are 1148 handled similarly to the normal case above. However, the matching is 1149 different. The end of these brackets will always be OP_KETRPOS, which 1150 returns MATCH_KETRPOS without going further in the pattern. 
By this means 1151 we can handle the group by iteration rather than recursion, thereby 1152 reducing the amount of stack needed. If the ovector is too small for 1153 capturing, treat as non-capturing. */ 1154 1155 case OP_CBRAPOS: 1156 case OP_SCBRAPOS: 1157 allow_zero = FALSE; 1158 1159 POSSESSIVE_CAPTURE: 1160 number = GET2(ecode, 1+LINK_SIZE); 1161 offset = number << 1; 1162 if (offset >= mb->offset_max) goto POSSESSIVE_NON_CAPTURE; 1163 1164 matched_once = FALSE; 1165 code_offset = (int)(ecode - mb->start_code); 1166 1167 save_offset1 = mb->ovector[offset]; 1168 save_offset2 = mb->ovector[offset+1]; 1169 save_offset3 = mb->ovector[mb->offset_end - number]; 1170 save_capture_last = mb->capture_last; 1171 1172 /* Each time round the loop, save the current subject position for use 1173 when the group matches. For MATCH_MATCH, the group has matched, so we 1174 restart it with a new subject starting position, remembering that we had 1175 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as 1176 usual. If we haven't matched any alternatives in any iteration, check to 1177 see if a previous iteration matched. If so, the group has matched; 1178 continue from afterwards. Otherwise it has failed; restore the previous 1179 capture values before returning NOMATCH. 
*/ 1180 1181 for (;;) 1182 { 1183 mb->ovector[mb->offset_end - number] = eptr - mb->start_subject; 1184 if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP; 1185 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, 1186 eptrb, RM63); 1187 if (rrc == MATCH_KETRPOS) 1188 { 1189 offset_top = mb->end_offset_top; 1190 ecode = mb->start_code + code_offset; 1191 save_capture_last = mb->capture_last; 1192 matched_once = TRUE; 1193 mstart = mb->start_match_ptr; /* In case \K changed it */ 1194 if (eptr == mb->end_match_ptr) /* Matched an empty string */ 1195 { 1196 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1197 break; 1198 } 1199 eptr = mb->end_match_ptr; 1200 continue; 1201 } 1202 1203 /* See comment in the code for capturing groups above about handling 1204 THEN. */ 1205 1206 if (rrc == MATCH_THEN) 1207 { 1208 next_ecode = ecode + GET(ecode,1); 1209 if (mb->start_match_ptr < next_ecode && 1210 (*ecode == OP_ALT || *next_ecode == OP_ALT)) 1211 rrc = MATCH_NOMATCH; 1212 } 1213 1214 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1215 mb->capture_last = save_capture_last; 1216 ecode += GET(ecode, 1); 1217 if (*ecode != OP_ALT) break; 1218 } 1219 1220 if (!matched_once) 1221 { 1222 mb->ovector[offset] = save_offset1; 1223 mb->ovector[offset+1] = save_offset2; 1224 mb->ovector[mb->offset_end - number] = save_offset3; 1225 } 1226 1227 if (allow_zero || matched_once) 1228 { 1229 ecode += 1 + LINK_SIZE; 1230 break; 1231 } 1232 RRETURN(MATCH_NOMATCH); 1233 1234 /* Non-capturing possessive bracket with unlimited repeat. We come here 1235 from BRAZERO with allow_zero = TRUE. The code is similar to the above, 1236 without the capturing complication. It is written out separately for speed 1237 and cleanliness. 
*/ 1238 1239 case OP_BRAPOS: 1240 case OP_SBRAPOS: 1241 allow_zero = FALSE; 1242 1243 POSSESSIVE_NON_CAPTURE: 1244 matched_once = FALSE; 1245 code_offset = (int)(ecode - mb->start_code); 1246 save_capture_last = mb->capture_last; 1247 1248 for (;;) 1249 { 1250 if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP; 1251 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, 1252 eptrb, RM48); 1253 if (rrc == MATCH_KETRPOS) 1254 { 1255 offset_top = mb->end_offset_top; 1256 ecode = mb->start_code + code_offset; 1257 matched_once = TRUE; 1258 mstart = mb->start_match_ptr; /* In case \K reset it */ 1259 if (eptr == mb->end_match_ptr) /* Matched an empty string */ 1260 { 1261 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1262 break; 1263 } 1264 eptr = mb->end_match_ptr; 1265 continue; 1266 } 1267 1268 /* See comment in the code for capturing groups above about handling 1269 THEN. */ 1270 1271 if (rrc == MATCH_THEN) 1272 { 1273 next_ecode = ecode + GET(ecode,1); 1274 if (mb->start_match_ptr < next_ecode && 1275 (*ecode == OP_ALT || *next_ecode == OP_ALT)) 1276 rrc = MATCH_NOMATCH; 1277 } 1278 1279 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1280 ecode += GET(ecode, 1); 1281 if (*ecode != OP_ALT) break; 1282 mb->capture_last = save_capture_last; 1283 } 1284 1285 if (matched_once || allow_zero) 1286 { 1287 ecode += 1 + LINK_SIZE; 1288 break; 1289 } 1290 RRETURN(MATCH_NOMATCH); 1291 1292 /* Control never reaches here. */ 1293 1294 /* Conditional group: compilation checked that there are no more than two 1295 branches. If the condition is false, skipping the first branch takes us 1296 past the end of the item if there is only one branch, but that's exactly 1297 what we want. */ 1298 1299 case OP_COND: 1300 case OP_SCOND: 1301 1302 /* The variable codelink will be added to ecode when the condition is 1303 false, to get to the second branch. Setting it to the offset to the ALT 1304 or KET, then incrementing ecode achieves this effect. 
We now have ecode 1305 pointing to the condition or callout. */ 1306 1307 codelink = GET(ecode, 1); /* Offset to the second branch */ 1308 ecode += 1 + LINK_SIZE; /* From this opcode */ 1309 1310 /* Because of the way auto-callout works during compile, a callout item is 1311 inserted between OP_COND and an assertion condition. */ 1312 1313 if (*ecode == OP_CALLOUT || *ecode == OP_CALLOUT_STR) 1314 { 1315 unsigned int callout_length = (*ecode == OP_CALLOUT) 1316 ? PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE); 1317 1318 if (mb->callout != NULL) 1319 { 1320 pcre2_callout_block cb; 1321 cb.version = 1; 1322 cb.capture_top = offset_top/2; 1323 cb.capture_last = mb->capture_last & CAPLMASK; 1324 cb.offset_vector = mb->ovector; 1325 cb.mark = mb->nomatch_mark; 1326 cb.subject = mb->start_subject; 1327 cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject); 1328 cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject); 1329 cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject); 1330 cb.pattern_position = GET(ecode, 1); 1331 cb.next_item_length = GET(ecode, 1 + LINK_SIZE); 1332 1333 if (*ecode == OP_CALLOUT) 1334 { 1335 cb.callout_number = ecode[1 + 2*LINK_SIZE]; 1336 cb.callout_string_offset = 0; 1337 cb.callout_string = NULL; 1338 cb.callout_string_length = 0; 1339 } 1340 else 1341 { 1342 cb.callout_number = 0; 1343 cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); 1344 cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; 1345 cb.callout_string_length = 1346 callout_length - (1 + 4*LINK_SIZE) - 2; 1347 } 1348 1349 if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) 1350 RRETURN(MATCH_NOMATCH); 1351 if (rrc < 0) RRETURN(rrc); 1352 } 1353 1354 /* Advance ecode past the callout, so it now points to the condition. We 1355 must adjust codelink so that the value of ecode+codelink is unchanged. 
*/ 1356 1357 ecode += callout_length; 1358 codelink -= callout_length; 1359 } 1360 1361 /* Test the various possible conditions */ 1362 1363 condition = FALSE; 1364 switch(condcode = *ecode) 1365 { 1366 case OP_RREF: /* Numbered group recursion test */ 1367 if (mb->recursive != NULL) /* Not recursing => FALSE */ 1368 { 1369 uint32_t recno = GET2(ecode, 1); /* Recursion group number*/ 1370 condition = (recno == RREF_ANY || recno == mb->recursive->group_num); 1371 } 1372 break; 1373 1374 case OP_DNRREF: /* Duplicate named group recursion test */ 1375 if (mb->recursive != NULL) 1376 { 1377 int count = GET2(ecode, 1 + IMM2_SIZE); 1378 PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size; 1379 while (count-- > 0) 1380 { 1381 uint32_t recno = GET2(slot, 0); 1382 condition = recno == mb->recursive->group_num; 1383 if (condition) break; 1384 slot += mb->name_entry_size; 1385 } 1386 } 1387 break; 1388 1389 case OP_CREF: /* Numbered group used test */ 1390 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 1391 condition = offset < offset_top && 1392 mb->ovector[offset] != PCRE2_UNSET; 1393 break; 1394 1395 case OP_DNCREF: /* Duplicate named group used test */ 1396 { 1397 int count = GET2(ecode, 1 + IMM2_SIZE); 1398 PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size; 1399 while (count-- > 0) 1400 { 1401 offset = GET2(slot, 0) << 1; 1402 condition = offset < offset_top && 1403 mb->ovector[offset] != PCRE2_UNSET; 1404 if (condition) break; 1405 slot += mb->name_entry_size; 1406 } 1407 } 1408 break; 1409 1410 case OP_FALSE: 1411 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ 1412 break; 1413 1414 case OP_TRUE: 1415 condition = TRUE; 1416 break; 1417 1418 /* The condition is an assertion. Call match() to evaluate it - setting 1419 the MATCH_CONDASSERT bit in mb->match_function_type causes it to stop at 1420 the end of an assertion. 
*/ 1421 1422 default: 1423 mb->match_function_type |= MATCH_CONDASSERT; 1424 RMATCH(eptr, ecode, offset_top, mb, NULL, RM3); 1425 if (rrc == MATCH_MATCH) 1426 { 1427 if (mb->end_offset_top > offset_top) 1428 offset_top = mb->end_offset_top; /* Captures may have happened */ 1429 condition = TRUE; 1430 1431 /* Advance ecode past the assertion to the start of the first branch, 1432 but adjust it so that the general choosing code below works. If the 1433 assertion has a quantifier that allows zero repeats we must skip over 1434 the BRAZERO. This is a lunatic thing to do, but somebody did! */ 1435 1436 if (*ecode == OP_BRAZERO) ecode++; 1437 ecode += GET(ecode, 1); 1438 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 1439 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; 1440 } 1441 1442 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an 1443 assertion; it is therefore treated as NOMATCH. Any other return is an 1444 error. */ 1445 1446 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) 1447 { 1448 RRETURN(rrc); /* Need braces because of following else */ 1449 } 1450 break; 1451 } 1452 1453 /* Choose branch according to the condition */ 1454 1455 ecode += condition? PRIV(OP_lengths)[condcode] : codelink; 1456 1457 /* We are now at the branch that is to be obeyed. As there is only one, we 1458 can use tail recursion to avoid using another stack frame, except when 1459 there is unlimited repeat of a possibly empty group. In the latter case, a 1460 recursive call to match() is always required, unless the second alternative 1461 doesn't exist, in which case we can just plough on. Note that, for 1462 compatibility with Perl, the | in a conditional group is NOT treated as 1463 creating two alternatives. If a THEN is encountered in the branch, it 1464 propagates out to the enclosing alternative (unless nested in a deeper set 1465 of alternatives, of course). 
*/ 1466 1467 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) 1468 { 1469 if (op != OP_SCOND) 1470 { 1471 goto TAIL_RECURSE; 1472 } 1473 1474 mb->match_function_type |= MATCH_CBEGROUP; 1475 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM49); 1476 RRETURN(rrc); 1477 } 1478 1479 /* Condition false & no alternative; continue after the group. */ 1480 1481 else 1482 { 1483 } 1484 break; 1485 1486 1487 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, 1488 to close any currently open capturing brackets. */ 1489 1490 case OP_CLOSE: 1491 number = GET2(ecode, 1); /* Must be less than 65536 */ 1492 offset = number << 1; 1493 mb->capture_last = (mb->capture_last & OVFLMASK) | number; 1494 if (offset >= mb->offset_max) mb->capture_last |= OVFLBIT; else 1495 { 1496 mb->ovector[offset] = 1497 mb->ovector[mb->offset_end - number]; 1498 mb->ovector[offset+1] = eptr - mb->start_subject; 1499 1500 /* If this group is at or above the current highwater mark, ensure that 1501 any groups between the current high water mark and this group are marked 1502 unset and then update the high water mark. */ 1503 1504 if (offset >= offset_top) 1505 { 1506 register PCRE2_SIZE *iptr = mb->ovector + offset_top; 1507 register PCRE2_SIZE *iend = mb->ovector + offset; 1508 while (iptr < iend) *iptr++ = PCRE2_UNSET; 1509 offset_top = offset + 2; 1510 } 1511 } 1512 ecode += 1 + IMM2_SIZE; 1513 break; 1514 1515 1516 /* End of the pattern, either real or forced. In an assertion ACCEPT, 1517 update the last used pointer. */ 1518 1519 case OP_ASSERT_ACCEPT: 1520 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr; 1521 1522 case OP_ACCEPT: 1523 case OP_END: 1524 1525 /* If we have matched an empty string, fail if not in an assertion and not 1526 in a recursion if either PCRE2_NOTEMPTY is set, or if PCRE2_NOTEMPTY_ATSTART 1527 is set and we have matched at the start of the subject. In both cases, 1528 backtracking will then try other alternatives, if any. 
*/ 1529 1530 if (eptr == mstart && op != OP_ASSERT_ACCEPT && 1531 mb->recursive == NULL && 1532 ((mb->moptions & PCRE2_NOTEMPTY) != 0 || 1533 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && 1534 mstart == mb->start_subject + mb->start_offset))) 1535 RRETURN(MATCH_NOMATCH); 1536 1537 /* Otherwise, we have a match. */ 1538 1539 mb->end_match_ptr = eptr; /* Record where we ended */ 1540 mb->end_offset_top = offset_top; /* and how many extracts were taken */ 1541 mb->start_match_ptr = mstart; /* and the start (\K can modify) */ 1542 1543 /* For some reason, the macros don't work properly if an expression is 1544 given as the argument to RRETURN when the heap is in use. */ 1545 1546 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; 1547 RRETURN(rrc); 1548 1549 /* Assertion brackets. Check the alternative branches in turn - the 1550 matching won't pass the KET for an assertion. If any one branch matches, 1551 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 1552 start of each branch to move the current point backwards, so the code at 1553 this level is identical to the lookahead case. When the assertion is part 1554 of a condition, we want to return immediately afterwards. The caller of 1555 this incarnation of the match() function will have set MATCH_CONDASSERT in 1556 mb->match_function type, and one of these opcodes will be the first opcode 1557 that is processed. We use a local variable that is preserved over calls to 1558 match() to remember this case. 
*/ 1559 1560 case OP_ASSERT: 1561 case OP_ASSERTBACK: 1562 save_mark = mb->mark; 1563 if ((mb->match_function_type & MATCH_CONDASSERT) != 0) 1564 { 1565 condassert = TRUE; 1566 mb->match_function_type &= ~MATCH_CONDASSERT; 1567 } 1568 else condassert = FALSE; 1569 1570 /* Loop for each branch */ 1571 1572 do 1573 { 1574 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, NULL, RM4); 1575 1576 /* A match means that the assertion is true; break out of the loop 1577 that matches its alternatives. */ 1578 1579 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1580 { 1581 mstart = mb->start_match_ptr; /* In case \K reset it */ 1582 break; 1583 } 1584 1585 /* If not matched, restore the previous mark setting. */ 1586 1587 mb->mark = save_mark; 1588 1589 /* See comment in the code for capturing groups above about handling 1590 THEN. */ 1591 1592 if (rrc == MATCH_THEN) 1593 { 1594 next_ecode = ecode + GET(ecode,1); 1595 if (mb->start_match_ptr < next_ecode && 1596 (*ecode == OP_ALT || *next_ecode == OP_ALT)) 1597 rrc = MATCH_NOMATCH; 1598 } 1599 1600 /* Anything other than NOMATCH causes the entire assertion to fail, 1601 passing back the return code. This includes COMMIT, SKIP, PRUNE and an 1602 uncaptured THEN, which means they take their normal effect. This 1603 consistent approach does not always have exactly the same effect as in 1604 Perl. */ 1605 1606 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1607 ecode += GET(ecode, 1); 1608 } 1609 while (*ecode == OP_ALT); /* Continue for next alternative */ 1610 1611 /* If we have tried all the alternative branches, the assertion has 1612 failed. If not, we broke out after a match. */ 1613 1614 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 1615 1616 /* If checking an assertion for a condition, return MATCH_MATCH. */ 1617 1618 if (condassert) RRETURN(MATCH_MATCH); 1619 1620 /* Continue from after a successful assertion, updating the offsets high 1621 water mark, since extracts may have been taken during the assertion. 
*/ 1622 1623 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1624 ecode += 1 + LINK_SIZE; 1625 offset_top = mb->end_offset_top; 1626 continue; 1627 1628 /* Negative assertion: all branches must fail to match for the assertion to 1629 succeed. */ 1630 1631 case OP_ASSERT_NOT: 1632 case OP_ASSERTBACK_NOT: 1633 save_mark = mb->mark; 1634 if ((mb->match_function_type & MATCH_CONDASSERT) != 0) 1635 { 1636 condassert = TRUE; 1637 mb->match_function_type &= ~MATCH_CONDASSERT; 1638 } 1639 else condassert = FALSE; 1640 1641 /* Loop for each alternative branch. */ 1642 1643 do 1644 { 1645 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, NULL, RM5); 1646 mb->mark = save_mark; /* Always restore the mark setting */ 1647 1648 switch(rrc) 1649 { 1650 case MATCH_MATCH: /* A successful match means */ 1651 case MATCH_ACCEPT: /* the assertion has failed. */ 1652 RRETURN(MATCH_NOMATCH); 1653 1654 case MATCH_NOMATCH: /* Carry on with next branch */ 1655 break; 1656 1657 /* See comment in the code for capturing groups above about handling 1658 THEN. */ 1659 1660 case MATCH_THEN: 1661 next_ecode = ecode + GET(ecode,1); 1662 if (mb->start_match_ptr < next_ecode && 1663 (*ecode == OP_ALT || *next_ecode == OP_ALT)) 1664 { 1665 rrc = MATCH_NOMATCH; 1666 break; 1667 } 1668 /* Otherwise fall through. */ 1669 1670 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole 1671 assertion to fail to match, without considering any more alternatives. 1672 Failing to match means the assertion is true. This is a consistent 1673 approach, but does not always have the same effect as in Perl. 
*/ 1674 1675 case MATCH_COMMIT: 1676 case MATCH_SKIP: 1677 case MATCH_SKIP_ARG: 1678 case MATCH_PRUNE: 1679 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1680 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ 1681 1682 /* Anything else is an error */ 1683 1684 default: 1685 RRETURN(rrc); 1686 } 1687 1688 /* Continue with next branch */ 1689 1690 ecode += GET(ecode,1); 1691 } 1692 while (*ecode == OP_ALT); 1693 1694 /* All branches in the assertion failed to match. */ 1695 1696 NEG_ASSERT_TRUE: 1697 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ 1698 ecode += 1 + LINK_SIZE; /* Continue with current branch */ 1699 continue; 1700 1701 /* Move the subject pointer back. This occurs only at the start of 1702 each branch of a lookbehind assertion. If we are too close to the start to 1703 move back, this match function fails. When working with UTF-8 we move 1704 back a number of characters, not bytes. */ 1705 1706 case OP_REVERSE: 1707 i = GET(ecode, 1); 1708 #ifdef SUPPORT_UNICODE 1709 if (utf) 1710 { 1711 while (i-- > 0) 1712 { 1713 if (eptr <= mb->start_subject) RRETURN(MATCH_NOMATCH); 1714 eptr--; 1715 BACKCHAR(eptr); 1716 } 1717 } 1718 else 1719 #endif 1720 1721 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 1722 1723 { 1724 if (i > eptr - mb->start_subject) RRETURN(MATCH_NOMATCH); 1725 eptr -= i; 1726 } 1727 1728 /* Save the earliest consulted character, then skip to next op code */ 1729 1730 if (eptr < mb->start_used_ptr) mb->start_used_ptr = eptr; 1731 ecode += 1 + LINK_SIZE; 1732 break; 1733 1734 /* The callout item calls an external function, if one is provided, passing 1735 details of the match so far. This is mainly for debugging, though the 1736 function is able to force a failure. */ 1737 1738 case OP_CALLOUT: 1739 case OP_CALLOUT_STR: 1740 { 1741 unsigned int callout_length = (*ecode == OP_CALLOUT) 1742 ? 
PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE); 1743 1744 if (mb->callout != NULL) 1745 { 1746 pcre2_callout_block cb; 1747 cb.version = 1; 1748 cb.callout_number = ecode[LINK_SIZE + 1]; 1749 cb.capture_top = offset_top/2; 1750 cb.capture_last = mb->capture_last & CAPLMASK; 1751 cb.offset_vector = mb->ovector; 1752 cb.mark = mb->nomatch_mark; 1753 cb.subject = mb->start_subject; 1754 cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject); 1755 cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject); 1756 cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject); 1757 cb.pattern_position = GET(ecode, 1); 1758 cb.next_item_length = GET(ecode, 1 + LINK_SIZE); 1759 1760 if (*ecode == OP_CALLOUT) 1761 { 1762 cb.callout_number = ecode[1 + 2*LINK_SIZE]; 1763 cb.callout_string_offset = 0; 1764 cb.callout_string = NULL; 1765 cb.callout_string_length = 0; 1766 } 1767 else 1768 { 1769 cb.callout_number = 0; 1770 cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); 1771 cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; 1772 cb.callout_string_length = 1773 callout_length - (1 + 4*LINK_SIZE) - 2; 1774 } 1775 1776 if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) 1777 RRETURN(MATCH_NOMATCH); 1778 if (rrc < 0) RRETURN(rrc); 1779 } 1780 ecode += callout_length; 1781 } 1782 break; 1783 1784 /* Recursion either matches the current regex, or some subexpression. The 1785 offset data is the offset to the starting bracket from the start of the 1786 whole pattern. (This is so that it works from duplicated subpatterns.) 1787 1788 The state of the capturing groups is preserved over recursion, and 1789 re-instated afterwards. We don't know how many are started and not yet 1790 finished (offset_top records the completed total) so we just have to save 1791 all the potential data. There may be up to 65535 such values, which is too 1792 large to put on the stack, but using malloc for small numbers seems 1793 expensive. 
As a compromise, the stack is used when there are no more than 1794 OP_RECURSE_STACK_SAVE_MAX values to store; otherwise malloc is used. 1795 1796 There are also other values that have to be saved. We use a chained 1797 sequence of blocks that actually live on the stack. Thanks to Robin Houston 1798 for the original version of this logic. It has, however, been hacked around 1799 a lot, so he is not to blame for the current way it works. */ 1800 1801 case OP_RECURSE: 1802 { 1803 ovecsave_frame *fr; 1804 recursion_info *ri; 1805 uint32_t recno; 1806 1807 callpat = mb->start_code + GET(ecode, 1); 1808 recno = (callpat == mb->start_code)? 0 : GET2(callpat, 1 + LINK_SIZE); 1809 1810 /* Check for repeating a pattern recursion without advancing the subject 1811 pointer. This should catch convoluted mutual recursions. (Some simple 1812 cases are caught at compile time.) */ 1813 1814 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec) 1815 if (recno == ri->group_num && eptr == ri->subject_position) 1816 RRETURN(PCRE2_ERROR_RECURSELOOP); 1817 1818 /* Add to "recursing stack" */ 1819 1820 new_recursive.group_num = recno; 1821 new_recursive.saved_capture_last = mb->capture_last; 1822 new_recursive.subject_position = eptr; 1823 new_recursive.prevrec = mb->recursive; 1824 mb->recursive = &new_recursive; 1825 1826 /* Where to continue from afterwards */ 1827 1828 ecode += 1 + LINK_SIZE; 1829 1830 /* When we are using the system stack for match() recursion we can call a 1831 function that uses the system stack for preserving the ovector while 1832 processing the pattern recursion, but only if the ovector is small 1833 enough. 
*/ 1834 1835 #ifndef HEAP_MATCH_RECURSE 1836 if (mb->offset_end <= OP_RECURSE_STACK_SAVE_MAX) 1837 { 1838 rrc = op_recurse_ovecsave(eptr, callpat, mstart, offset_top, mb, 1839 eptrb, rdepth); 1840 mb->recursive = new_recursive.prevrec; 1841 if (rrc != MATCH_MATCH && rrc != MATCH_ACCEPT) RRETURN(rrc); 1842 1843 /* Set where we got to in the subject, and reset the start, in case 1844 it was changed by \K. This *is* propagated back out of a recursion, 1845 for Perl compatibility. */ 1846 1847 eptr = mb->end_match_ptr; 1848 mstart = mb->start_match_ptr; 1849 break; /* End of processing OP_RECURSE */ 1850 } 1851 #endif 1852 /* If the ovector is too big, or if we are using the heap for match() 1853 recursion, we have to use the heap for saving the ovector. Used ovecsave 1854 frames are kept on a chain and re-used. This makes a small improvement in 1855 execution time on Linux. */ 1856 1857 if (mb->ovecsave_chain != NULL) 1858 { 1859 new_recursive.ovec_save = mb->ovecsave_chain->saved_ovec; 1860 mb->ovecsave_chain = mb->ovecsave_chain->next; 1861 } 1862 else 1863 { 1864 fr = (ovecsave_frame *)(mb->memctl.malloc(sizeof(ovecsave_frame *) + 1865 mb->offset_end * sizeof(PCRE2_SIZE), mb->memctl.memory_data)); 1866 if (fr == NULL) RRETURN(PCRE2_ERROR_NOMEMORY); 1867 new_recursive.ovec_save = fr->saved_ovec; 1868 } 1869 1870 memcpy(new_recursive.ovec_save, mb->ovector, 1871 mb->offset_end * sizeof(PCRE2_SIZE)); 1872 1873 /* Do the recursion. After processing each alternative, restore the 1874 ovector data and the last captured value. This code has the same overall 1875 logic as the code in the op_recurse_ovecsave() function, but is adapted 1876 to use RMATCH/RRETURN and to release the heap block containing the saved 1877 ovector. 
*/ 1878 1879 cbegroup = (*callpat >= OP_SBRA); 1880 do 1881 { 1882 if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP; 1883 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, 1884 mb, eptrb, RM6); 1885 memcpy(mb->ovector, new_recursive.ovec_save, 1886 mb->offset_end * sizeof(PCRE2_SIZE)); 1887 mb->capture_last = new_recursive.saved_capture_last; 1888 mb->recursive = new_recursive.prevrec; 1889 1890 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1891 { 1892 fr = (ovecsave_frame *) 1893 ((uint8_t *)new_recursive.ovec_save - sizeof(ovecsave_frame *)); 1894 fr->next = mb->ovecsave_chain; 1895 mb->ovecsave_chain = fr; 1896 1897 /* Set where we got to in the subject, and reset the start, in case 1898 it was changed by \K. This *is* propagated back out of a recursion, 1899 for Perl compatibility. */ 1900 1901 eptr = mb->end_match_ptr; 1902 mstart = mb->start_match_ptr; 1903 goto RECURSION_MATCHED; /* Exit loop; end processing */ 1904 } 1905 1906 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a 1907 recursion; they cause a NOMATCH for the entire recursion. These codes 1908 are defined in a range that can be tested for. */ 1909 1910 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) 1911 { 1912 rrc = MATCH_NOMATCH; 1913 goto RECURSION_RETURN; 1914 } 1915 1916 /* Any return code other than NOMATCH is an error. */ 1917 1918 if (rrc != MATCH_NOMATCH) goto RECURSION_RETURN; 1919 mb->recursive = &new_recursive; 1920 callpat += GET(callpat, 1); 1921 } 1922 while (*callpat == OP_ALT); 1923 1924 RECURSION_RETURN: 1925 mb->recursive = new_recursive.prevrec; 1926 fr = (ovecsave_frame *) 1927 ((uint8_t *)new_recursive.ovec_save - sizeof(ovecsave_frame *)); 1928 fr->next = mb->ovecsave_chain; 1929 mb->ovecsave_chain = fr; 1930 RRETURN(rrc); 1931 } 1932 1933 RECURSION_MATCHED: 1934 break; 1935 1936 /* An alternation is the end of a branch; scan along to find the end of the 1937 bracketed group and go to there. 
*/ 1938 1939 case OP_ALT: 1940 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1941 break; 1942 1943 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, 1944 indicating that it may occur zero times. It may repeat infinitely, or not 1945 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets 1946 with fixed upper repeat limits are compiled as a number of copies, with the 1947 optional ones preceded by BRAZERO or BRAMINZERO. */ 1948 1949 case OP_BRAZERO: 1950 next_ecode = ecode + 1; 1951 RMATCH(eptr, next_ecode, offset_top, mb, eptrb, RM10); 1952 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1953 do next_ecode += GET(next_ecode, 1); while (*next_ecode == OP_ALT); 1954 ecode = next_ecode + 1 + LINK_SIZE; 1955 break; 1956 1957 case OP_BRAMINZERO: 1958 next_ecode = ecode + 1; 1959 do next_ecode += GET(next_ecode, 1); while (*next_ecode == OP_ALT); 1960 RMATCH(eptr, next_ecode + 1+LINK_SIZE, offset_top, mb, eptrb, RM11); 1961 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1962 ecode++; 1963 break; 1964 1965 case OP_SKIPZERO: 1966 next_ecode = ecode+1; 1967 do next_ecode += GET(next_ecode,1); while (*next_ecode == OP_ALT); 1968 ecode = next_ecode + 1 + LINK_SIZE; 1969 break; 1970 1971 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything 1972 here; just jump to the group, with allow_zero set TRUE. */ 1973 1974 case OP_BRAPOSZERO: 1975 op = *(++ecode); 1976 allow_zero = TRUE; 1977 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE; 1978 goto POSSESSIVE_NON_CAPTURE; 1979 1980 /* End of a group, repeated or non-repeating. */ 1981 1982 case OP_KET: 1983 case OP_KETRMIN: 1984 case OP_KETRMAX: 1985 case OP_KETRPOS: 1986 prev = ecode - GET(ecode, 1); 1987 1988 /* If this was a group that remembered the subject start, in order to break 1989 infinite repeats of empty string matches, retrieve the subject start from 1990 the chain. Otherwise, set it NULL. 
*/ 1991 1992 if (*prev >= OP_SBRA || *prev == OP_ONCE) 1993 { 1994 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ 1995 eptrb = eptrb->epb_prev; /* Backup to previous group */ 1996 } 1997 else saved_eptr = NULL; 1998 1999 /* If we are at the end of an assertion group or a non-capturing atomic 2000 group, stop matching and return MATCH_MATCH, but record the current high 2001 water mark for use by positive assertions. We also need to record the match 2002 start in case it was changed by \K. */ 2003 2004 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) || 2005 *prev == OP_ONCE_NC) 2006 { 2007 mb->end_match_ptr = eptr; /* For ONCE_NC */ 2008 mb->end_offset_top = offset_top; 2009 mb->start_match_ptr = mstart; 2010 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr; 2011 RRETURN(MATCH_MATCH); /* Sets mb->mark */ 2012 } 2013 2014 /* For capturing groups we have to check the group number back at the start 2015 and if necessary complete handling an extraction by setting the offsets and 2016 bumping the high water mark. Whole-pattern recursion is coded as a recurse 2017 into group 0, so it won't be picked up here. Instead, we catch it when the 2018 OP_END is reached. Other recursion is handled here. We just have to record 2019 the current subject position and start match pointer and give a MATCH 2020 return. */ 2021 2022 if (*prev == OP_CBRA || *prev == OP_SCBRA || 2023 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS) 2024 { 2025 number = GET2(prev, 1+LINK_SIZE); 2026 offset = number << 1; 2027 2028 /* Handle a recursively called group. 
*/ 2029 2030 if (mb->recursive != NULL && mb->recursive->group_num == number) 2031 { 2032 mb->end_match_ptr = eptr; 2033 mb->start_match_ptr = mstart; 2034 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr; 2035 RRETURN(MATCH_MATCH); 2036 } 2037 2038 /* Deal with capturing */ 2039 2040 mb->capture_last = (mb->capture_last & OVFLMASK) | number; 2041 if (offset >= mb->offset_max) mb->capture_last |= OVFLBIT; else 2042 { 2043 /* If offset is greater than offset_top, it means that we are 2044 "skipping" a capturing group, and that group's offsets must be marked 2045 unset. In earlier versions of PCRE, all the offsets were unset at the 2046 start of matching, but this doesn't work because atomic groups and 2047 assertions can cause a value to be set that should later be unset. 2048 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as 2049 part of the atomic group, but this is not on the final matching path, 2050 so must be unset when 2 is set. (If there is no group 2, there is no 2051 problem, because offset_top will then be 2, indicating no capture.) */ 2052 2053 if (offset > offset_top) 2054 { 2055 register PCRE2_SIZE *iptr = mb->ovector + offset_top; 2056 register PCRE2_SIZE *iend = mb->ovector + offset; 2057 while (iptr < iend) *iptr++ = PCRE2_UNSET; 2058 } 2059 2060 /* Now make the extraction */ 2061 2062 mb->ovector[offset] = mb->ovector[mb->offset_end - number]; 2063 mb->ovector[offset+1] = eptr - mb->start_subject; 2064 if (offset_top <= offset) offset_top = offset + 2; 2065 } 2066 } 2067 2068 /* OP_KETRPOS is a possessive repeating ket. Remember the current position, 2069 and return the MATCH_KETRPOS. This makes it possible to do the repeats one 2070 at a time from the outer level, thus saving stack. This must precede the 2071 empty string test - in this case that test is done at the outer level. 
*/ 2072 2073 if (*ecode == OP_KETRPOS) 2074 { 2075 mb->start_match_ptr = mstart; /* In case \K reset it */ 2076 mb->end_match_ptr = eptr; 2077 mb->end_offset_top = offset_top; 2078 if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr; 2079 RRETURN(MATCH_KETRPOS); 2080 } 2081 2082 /* For an ordinary non-repeating ket, just continue at this level. This 2083 also happens for a repeating ket if no characters were matched in the 2084 group. This is the forcible breaking of infinite loops as implemented in 2085 Perl 5.005. For a non-repeating atomic group that includes captures, 2086 establish a backup point by processing the rest of the pattern at a lower 2087 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the 2088 original OP_ONCE level, thereby bypassing intermediate backup points, but 2089 resetting any captures that happened along the way. */ 2090 2091 if (*ecode == OP_KET || eptr == saved_eptr) 2092 { 2093 if (*prev == OP_ONCE) 2094 { 2095 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM12); 2096 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2097 mb->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2098 RRETURN(MATCH_ONCE); 2099 } 2100 ecode += 1 + LINK_SIZE; /* Carry on at this level */ 2101 break; 2102 } 2103 2104 /* The normal repeating kets try the rest of the pattern or restart from 2105 the preceding bracket, in the appropriate order. In the second case, we can 2106 use tail recursion to avoid using another stack frame, unless we have 2107 an atomic group or an unlimited repeat of a group that can match an empty 2108 string. 
*/ 2109 2110 if (*ecode == OP_KETRMIN) 2111 { 2112 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM7); 2113 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2114 if (*prev == OP_ONCE) 2115 { 2116 RMATCH(eptr, prev, offset_top, mb, eptrb, RM8); 2117 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2118 mb->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2119 RRETURN(MATCH_ONCE); 2120 } 2121 if (*prev >= OP_SBRA) /* Could match an empty string */ 2122 { 2123 RMATCH(eptr, prev, offset_top, mb, eptrb, RM50); 2124 RRETURN(rrc); 2125 } 2126 ecode = prev; 2127 goto TAIL_RECURSE; 2128 } 2129 else /* OP_KETRMAX */ 2130 { 2131 RMATCH(eptr, prev, offset_top, mb, eptrb, RM13); 2132 if (rrc == MATCH_ONCE && mb->once_target == prev) rrc = MATCH_NOMATCH; 2133 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2134 if (*prev == OP_ONCE) 2135 { 2136 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM9); 2137 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2138 mb->once_target = prev; 2139 RRETURN(MATCH_ONCE); 2140 } 2141 ecode += 1 + LINK_SIZE; 2142 goto TAIL_RECURSE; 2143 } 2144 /* Control never gets here */ 2145 2146 /* Not multiline mode: start of subject assertion, unless notbol. */ 2147 2148 case OP_CIRC: 2149 if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject) 2150 RRETURN(MATCH_NOMATCH); 2151 2152 /* Start of subject assertion */ 2153 2154 case OP_SOD: 2155 if (eptr != mb->start_subject) RRETURN(MATCH_NOMATCH); 2156 ecode++; 2157 break; 2158 2159 /* Multiline mode: start of subject unless notbol, or after any newline 2160 except for one at the very end, unless PCRE2_ALT_CIRCUMFLEX is set. 
*/ 2161 2162 case OP_CIRCM: 2163 if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject) 2164 RRETURN(MATCH_NOMATCH); 2165 if (eptr != mb->start_subject && 2166 ((eptr == mb->end_subject && 2167 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || 2168 !WAS_NEWLINE(eptr))) 2169 RRETURN(MATCH_NOMATCH); 2170 ecode++; 2171 break; 2172 2173 /* Start of match assertion */ 2174 2175 case OP_SOM: 2176 if (eptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); 2177 ecode++; 2178 break; 2179 2180 /* Reset the start of match point */ 2181 2182 case OP_SET_SOM: 2183 mstart = eptr; 2184 ecode++; 2185 break; 2186 2187 /* Multiline mode: assert before any newline, or before end of subject 2188 unless noteol is set. */ 2189 2190 case OP_DOLLM: 2191 if (eptr < mb->end_subject) 2192 { 2193 if (!IS_NEWLINE(eptr)) 2194 { 2195 if (mb->partial != 0 && 2196 eptr + 1 >= mb->end_subject && 2197 NLBLOCK->nltype == NLTYPE_FIXED && 2198 NLBLOCK->nllen == 2 && 2199 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2200 { 2201 mb->hitend = TRUE; 2202 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 2203 } 2204 RRETURN(MATCH_NOMATCH); 2205 } 2206 } 2207 else 2208 { 2209 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); 2210 SCHECK_PARTIAL(); 2211 } 2212 ecode++; 2213 break; 2214 2215 /* Not multiline mode: assert before a terminating newline or before end of 2216 subject unless noteol is set. */ 2217 2218 case OP_DOLL: 2219 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); 2220 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; 2221 2222 /* ... 
else fall through for endonly */ 2223 2224 /* End of subject assertion (\z) */ 2225 2226 case OP_EOD: 2227 if (eptr < mb->end_subject) RRETURN(MATCH_NOMATCH); 2228 SCHECK_PARTIAL(); 2229 ecode++; 2230 break; 2231 2232 /* End of subject or ending \n assertion (\Z) */ 2233 2234 case OP_EODN: 2235 ASSERT_NL_OR_EOS: 2236 if (eptr < mb->end_subject && 2237 (!IS_NEWLINE(eptr) || eptr != mb->end_subject - mb->nllen)) 2238 { 2239 if (mb->partial != 0 && 2240 eptr + 1 >= mb->end_subject && 2241 NLBLOCK->nltype == NLTYPE_FIXED && 2242 NLBLOCK->nllen == 2 && 2243 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2244 { 2245 mb->hitend = TRUE; 2246 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 2247 } 2248 RRETURN(MATCH_NOMATCH); 2249 } 2250 2251 /* Either at end of string or \n before end. */ 2252 2253 SCHECK_PARTIAL(); 2254 ecode++; 2255 break; 2256 2257 /* Word boundary assertions */ 2258 2259 case OP_NOT_WORD_BOUNDARY: 2260 case OP_WORD_BOUNDARY: 2261 { 2262 2263 /* Find out if the previous and current characters are "word" characters. 2264 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 2265 be "non-word" characters. Remember the earliest consulted character for 2266 partial matching. 
*/ 2267 2268 #ifdef SUPPORT_UNICODE 2269 if (utf) 2270 { 2271 /* Get status of previous character */ 2272 2273 if (eptr == mb->start_subject) prev_is_word = FALSE; else 2274 { 2275 PCRE2_SPTR lastptr = eptr - 1; 2276 BACKCHAR(lastptr); 2277 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; 2278 GETCHAR(c, lastptr); 2279 if ((mb->poptions & PCRE2_UCP) != 0) 2280 { 2281 if (c == '_') prev_is_word = TRUE; else 2282 { 2283 int cat = UCD_CATEGORY(c); 2284 prev_is_word = (cat == ucp_L || cat == ucp_N); 2285 } 2286 } 2287 else 2288 prev_is_word = c < 256 && (mb->ctypes[c] & ctype_word) != 0; 2289 } 2290 2291 /* Get status of next character */ 2292 2293 if (eptr >= mb->end_subject) 2294 { 2295 SCHECK_PARTIAL(); 2296 cur_is_word = FALSE; 2297 } 2298 else 2299 { 2300 PCRE2_SPTR nextptr = eptr + 1; 2301 FORWARDCHARTEST(nextptr, mb->end_subject); 2302 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; 2303 GETCHAR(c, eptr); 2304 if ((mb->poptions & PCRE2_UCP) != 0) 2305 { 2306 if (c == '_') cur_is_word = TRUE; else 2307 { 2308 int cat = UCD_CATEGORY(c); 2309 cur_is_word = (cat == ucp_L || cat == ucp_N); 2310 } 2311 } 2312 else 2313 cur_is_word = c < 256 && (mb->ctypes[c] & ctype_word) != 0; 2314 } 2315 } 2316 else 2317 #endif /* SUPPORT UTF */ 2318 2319 /* Not in UTF-8 mode, but we may still have PCRE2_UCP set, and for 2320 consistency with the behaviour of \w we do use it in this case. 
*/ 2321 2322 { 2323 /* Get status of previous character */ 2324 2325 if (eptr == mb->start_subject) prev_is_word = FALSE; else 2326 { 2327 if (eptr <= mb->start_used_ptr) mb->start_used_ptr = eptr - 1; 2328 #ifdef SUPPORT_UNICODE 2329 if ((mb->poptions & PCRE2_UCP) != 0) 2330 { 2331 c = eptr[-1]; 2332 if (c == '_') prev_is_word = TRUE; else 2333 { 2334 int cat = UCD_CATEGORY(c); 2335 prev_is_word = (cat == ucp_L || cat == ucp_N); 2336 } 2337 } 2338 else 2339 #endif 2340 prev_is_word = MAX_255(eptr[-1]) 2341 && ((mb->ctypes[eptr[-1]] & ctype_word) != 0); 2342 } 2343 2344 /* Get status of next character */ 2345 2346 if (eptr >= mb->end_subject) 2347 { 2348 SCHECK_PARTIAL(); 2349 cur_is_word = FALSE; 2350 } 2351 else 2352 { 2353 if (eptr >= mb->last_used_ptr) mb->last_used_ptr = eptr + 1; 2354 #ifdef SUPPORT_UNICODE 2355 if ((mb->poptions & PCRE2_UCP) != 0) 2356 { 2357 c = *eptr; 2358 if (c == '_') cur_is_word = TRUE; else 2359 { 2360 int cat = UCD_CATEGORY(c); 2361 cur_is_word = (cat == ucp_L || cat == ucp_N); 2362 } 2363 } 2364 else 2365 #endif 2366 cur_is_word = MAX_255(*eptr) 2367 && ((mb->ctypes[*eptr] & ctype_word) != 0); 2368 } 2369 } 2370 2371 /* Now see if the situation is what we want */ 2372 2373 if ((*ecode++ == OP_WORD_BOUNDARY)? 2374 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 2375 RRETURN(MATCH_NOMATCH); 2376 } 2377 break; 2378 2379 /* Match any single character type except newline; have to take care with 2380 CRLF newlines and partial matching. */ 2381 2382 case OP_ANY: 2383 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 2384 if (mb->partial != 0 && 2385 eptr + 1 >= mb->end_subject && 2386 NLBLOCK->nltype == NLTYPE_FIXED && 2387 NLBLOCK->nllen == 2 && 2388 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) 2389 { 2390 mb->hitend = TRUE; 2391 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 2392 } 2393 2394 /* Fall through */ 2395 2396 /* Match any single character whatsoever. 
*/ 2397 2398 case OP_ALLANY: 2399 if (eptr >= mb->end_subject) /* DO NOT merge the eptr++ here; it must */ 2400 { /* not be updated before SCHECK_PARTIAL. */ 2401 SCHECK_PARTIAL(); 2402 RRETURN(MATCH_NOMATCH); 2403 } 2404 eptr++; 2405 #ifdef SUPPORT_UNICODE 2406 if (utf) ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++); 2407 #endif 2408 ecode++; 2409 break; 2410 2411 /* Match a single code unit, even in UTF-8 mode. This opcode really does 2412 match any code unit, even newline. (It really should be called ANYCODEUNIT, 2413 of course - the byte name is from pre-16 bit days.) */ 2414 2415 case OP_ANYBYTE: 2416 if (eptr >= mb->end_subject) /* DO NOT merge the eptr++ here; it must */ 2417 { /* not be updated before SCHECK_PARTIAL. */ 2418 SCHECK_PARTIAL(); 2419 RRETURN(MATCH_NOMATCH); 2420 } 2421 eptr++; 2422 ecode++; 2423 break; 2424 2425 case OP_NOT_DIGIT: 2426 if (eptr >= mb->end_subject) 2427 { 2428 SCHECK_PARTIAL(); 2429 RRETURN(MATCH_NOMATCH); 2430 } 2431 GETCHARINCTEST(c, eptr); 2432 if ( 2433 #ifdef SUPPORT_WIDE_CHARS 2434 c < 256 && 2435 #endif 2436 (mb->ctypes[c] & ctype_digit) != 0 2437 ) 2438 RRETURN(MATCH_NOMATCH); 2439 ecode++; 2440 break; 2441 2442 case OP_DIGIT: 2443 if (eptr >= mb->end_subject) 2444 { 2445 SCHECK_PARTIAL(); 2446 RRETURN(MATCH_NOMATCH); 2447 } 2448 GETCHARINCTEST(c, eptr); 2449 if ( 2450 #ifdef SUPPORT_WIDE_CHARS 2451 c > 255 || 2452 #endif 2453 (mb->ctypes[c] & ctype_digit) == 0 2454 ) 2455 RRETURN(MATCH_NOMATCH); 2456 ecode++; 2457 break; 2458 2459 case OP_NOT_WHITESPACE: 2460 if (eptr >= mb->end_subject) 2461 { 2462 SCHECK_PARTIAL(); 2463 RRETURN(MATCH_NOMATCH); 2464 } 2465 GETCHARINCTEST(c, eptr); 2466 if ( 2467 #ifdef SUPPORT_WIDE_CHARS 2468 c < 256 && 2469 #endif 2470 (mb->ctypes[c] & ctype_space) != 0 2471 ) 2472 RRETURN(MATCH_NOMATCH); 2473 ecode++; 2474 break; 2475 2476 case OP_WHITESPACE: 2477 if (eptr >= mb->end_subject) 2478 { 2479 SCHECK_PARTIAL(); 2480 RRETURN(MATCH_NOMATCH); 2481 } 2482 GETCHARINCTEST(c, eptr); 2483 if ( 
2484 #ifdef SUPPORT_WIDE_CHARS 2485 c > 255 || 2486 #endif 2487 (mb->ctypes[c] & ctype_space) == 0 2488 ) 2489 RRETURN(MATCH_NOMATCH); 2490 ecode++; 2491 break; 2492 2493 case OP_NOT_WORDCHAR: 2494 if (eptr >= mb->end_subject) 2495 { 2496 SCHECK_PARTIAL(); 2497 RRETURN(MATCH_NOMATCH); 2498 } 2499 GETCHARINCTEST(c, eptr); 2500 if ( 2501 #ifdef SUPPORT_WIDE_CHARS 2502 c < 256 && 2503 #endif 2504 (mb->ctypes[c] & ctype_word) != 0 2505 ) 2506 RRETURN(MATCH_NOMATCH); 2507 ecode++; 2508 break; 2509 2510 case OP_WORDCHAR: 2511 if (eptr >= mb->end_subject) 2512 { 2513 SCHECK_PARTIAL(); 2514 RRETURN(MATCH_NOMATCH); 2515 } 2516 GETCHARINCTEST(c, eptr); 2517 if ( 2518 #ifdef SUPPORT_WIDE_CHARS 2519 c > 255 || 2520 #endif 2521 (mb->ctypes[c] & ctype_word) == 0 2522 ) 2523 RRETURN(MATCH_NOMATCH); 2524 ecode++; 2525 break; 2526 2527 case OP_ANYNL: 2528 if (eptr >= mb->end_subject) 2529 { 2530 SCHECK_PARTIAL(); 2531 RRETURN(MATCH_NOMATCH); 2532 } 2533 GETCHARINCTEST(c, eptr); 2534 switch(c) 2535 { 2536 default: RRETURN(MATCH_NOMATCH); 2537 2538 case CHAR_CR: 2539 if (eptr >= mb->end_subject) 2540 { 2541 SCHECK_PARTIAL(); 2542 } 2543 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++; 2544 break; 2545 2546 case CHAR_LF: 2547 break; 2548 2549 case CHAR_VT: 2550 case CHAR_FF: 2551 case CHAR_NEL: 2552 #ifndef EBCDIC 2553 case 0x2028: 2554 case 0x2029: 2555 #endif /* Not EBCDIC */ 2556 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 2557 break; 2558 } 2559 ecode++; 2560 break; 2561 2562 case OP_NOT_HSPACE: 2563 if (eptr >= mb->end_subject) 2564 { 2565 SCHECK_PARTIAL(); 2566 RRETURN(MATCH_NOMATCH); 2567 } 2568 GETCHARINCTEST(c, eptr); 2569 switch(c) 2570 { 2571 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 2572 default: break; 2573 } 2574 ecode++; 2575 break; 2576 2577 case OP_HSPACE: 2578 if (eptr >= mb->end_subject) 2579 { 2580 SCHECK_PARTIAL(); 2581 RRETURN(MATCH_NOMATCH); 2582 } 2583 GETCHARINCTEST(c, eptr); 2584 switch(c) 2585 { 2586 
HSPACE_CASES: break; /* Byte and multibyte cases */ 2587 default: RRETURN(MATCH_NOMATCH); 2588 } 2589 ecode++; 2590 break; 2591 2592 case OP_NOT_VSPACE: 2593 if (eptr >= mb->end_subject) 2594 { 2595 SCHECK_PARTIAL(); 2596 RRETURN(MATCH_NOMATCH); 2597 } 2598 GETCHARINCTEST(c, eptr); 2599 switch(c) 2600 { 2601 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 2602 default: break; 2603 } 2604 ecode++; 2605 break; 2606 2607 case OP_VSPACE: 2608 if (eptr >= mb->end_subject) 2609 { 2610 SCHECK_PARTIAL(); 2611 RRETURN(MATCH_NOMATCH); 2612 } 2613 GETCHARINCTEST(c, eptr); 2614 switch(c) 2615 { 2616 VSPACE_CASES: break; 2617 default: RRETURN(MATCH_NOMATCH); 2618 } 2619 ecode++; 2620 break; 2621 2622 #ifdef SUPPORT_UNICODE 2623 /* Check the next character by Unicode property. We will get here only 2624 if the support is in the binary; otherwise a compile-time error occurs. */ 2625 2626 case OP_PROP: 2627 case OP_NOTPROP: 2628 if (eptr >= mb->end_subject) 2629 { 2630 SCHECK_PARTIAL(); 2631 RRETURN(MATCH_NOMATCH); 2632 } 2633 GETCHARINCTEST(c, eptr); 2634 { 2635 const uint32_t *cp; 2636 const ucd_record *prop = GET_UCD(c); 2637 2638 switch(ecode[1]) 2639 { 2640 case PT_ANY: 2641 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2642 break; 2643 2644 case PT_LAMP: 2645 if ((prop->chartype == ucp_Lu || 2646 prop->chartype == ucp_Ll || 2647 prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) 2648 RRETURN(MATCH_NOMATCH); 2649 break; 2650 2651 case PT_GC: 2652 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) 2653 RRETURN(MATCH_NOMATCH); 2654 break; 2655 2656 case PT_PC: 2657 if ((ecode[2] != prop->chartype) == (op == OP_PROP)) 2658 RRETURN(MATCH_NOMATCH); 2659 break; 2660 2661 case PT_SC: 2662 if ((ecode[2] != prop->script) == (op == OP_PROP)) 2663 RRETURN(MATCH_NOMATCH); 2664 break; 2665 2666 /* These are specials */ 2667 2668 case PT_ALNUM: 2669 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2670 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) 2671 
RRETURN(MATCH_NOMATCH); 2672 break; 2673 2674 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 2675 which means that Perl space and POSIX space are now identical. PCRE 2676 was changed at release 8.34. */ 2677 2678 case PT_SPACE: /* Perl space */ 2679 case PT_PXSPACE: /* POSIX space */ 2680 switch(c) 2681 { 2682 HSPACE_CASES: 2683 VSPACE_CASES: 2684 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2685 break; 2686 2687 default: 2688 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == 2689 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); 2690 break; 2691 } 2692 break; 2693 2694 case PT_WORD: 2695 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2696 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 2697 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) 2698 RRETURN(MATCH_NOMATCH); 2699 break; 2700 2701 case PT_CLIST: 2702 cp = PRIV(ucd_caseless_sets) + ecode[2]; 2703 for (;;) 2704 { 2705 if (c < *cp) 2706 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } 2707 if (c == *cp++) 2708 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } 2709 } 2710 break; 2711 2712 case PT_UCNC: 2713 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 2714 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 2715 c >= 0xe000) == (op == OP_NOTPROP)) 2716 RRETURN(MATCH_NOMATCH); 2717 break; 2718 2719 /* This should never occur */ 2720 2721 default: 2722 RRETURN(PCRE2_ERROR_INTERNAL); 2723 } 2724 2725 ecode += 3; 2726 } 2727 break; 2728 2729 /* Match an extended Unicode sequence. We will get here only if the support 2730 is in the binary; otherwise a compile-time error occurs. 
*/ 2731 2732 case OP_EXTUNI: 2733 if (eptr >= mb->end_subject) 2734 { 2735 SCHECK_PARTIAL(); 2736 RRETURN(MATCH_NOMATCH); 2737 } 2738 else 2739 { 2740 int lgb, rgb; 2741 GETCHARINCTEST(c, eptr); 2742 lgb = UCD_GRAPHBREAK(c); 2743 while (eptr < mb->end_subject) 2744 { 2745 int len = 1; 2746 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 2747 rgb = UCD_GRAPHBREAK(c); 2748 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 2749 lgb = rgb; 2750 eptr += len; 2751 } 2752 } 2753 CHECK_PARTIAL(); 2754 ecode++; 2755 break; 2756 #endif /* SUPPORT_UNICODE */ 2757 2758 2759 /* Match a back reference, possibly repeatedly. Look past the end of the 2760 item to see if there is repeat information following. 2761 2762 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group 2763 or to a non-duplicated named group. For a duplicated named group, OP_DNREF 2764 and OP_DNREFI are used. In this case we must scan the list of groups to 2765 which the name refers, and use the first one that is set. */ 2766 2767 case OP_DNREF: 2768 case OP_DNREFI: 2769 caseless = op == OP_DNREFI; 2770 { 2771 int count = GET2(ecode, 1+IMM2_SIZE); 2772 PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size; 2773 ecode += 1 + 2*IMM2_SIZE; 2774 2775 /* Initializing 'offset' avoids a compiler warning in the REF_REPEAT 2776 code. 
*/ 2777 2778 offset = 0; 2779 while (count-- > 0) 2780 { 2781 offset = GET2(slot, 0) << 1; 2782 if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET) break; 2783 slot += mb->name_entry_size; 2784 } 2785 } 2786 goto REF_REPEAT; 2787 2788 case OP_REF: 2789 case OP_REFI: 2790 caseless = op == OP_REFI; 2791 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 2792 ecode += 1 + IMM2_SIZE; 2793 2794 /* Set up for repetition, or handle the non-repeated case */ 2795 2796 REF_REPEAT: 2797 switch (*ecode) 2798 { 2799 case OP_CRSTAR: 2800 case OP_CRMINSTAR: 2801 case OP_CRPLUS: 2802 case OP_CRMINPLUS: 2803 case OP_CRQUERY: 2804 case OP_CRMINQUERY: 2805 c = *ecode++ - OP_CRSTAR; 2806 minimize = (c & 1) != 0; 2807 min = rep_min[c]; /* Pick up values from tables; */ 2808 max = rep_max[c]; /* zero for max => infinity */ 2809 if (max == 0) max = INT_MAX; 2810 break; 2811 2812 case OP_CRRANGE: 2813 case OP_CRMINRANGE: 2814 minimize = (*ecode == OP_CRMINRANGE); 2815 min = GET2(ecode, 1); 2816 max = GET2(ecode, 1 + IMM2_SIZE); 2817 if (max == 0) max = INT_MAX; 2818 ecode += 1 + 2 * IMM2_SIZE; 2819 break; 2820 2821 default: /* No repeat follows */ 2822 { 2823 int rc = match_ref(offset, offset_top, eptr, mb, caseless, &length); 2824 if (rc != 0) 2825 { 2826 if (rc > 0) eptr = mb->end_subject; /* Partial match */ 2827 CHECK_PARTIAL(); 2828 RRETURN(MATCH_NOMATCH); 2829 } 2830 } 2831 eptr += length; 2832 continue; /* With the main loop */ 2833 } 2834 2835 /* Handle repeated back references. If a set group has length zero, just 2836 continue with the main loop, because it matches however many times. For an 2837 unset reference, if the minimum is zero, we can also just continue. We can 2838 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset 2839 group behave as a zero-length group. For any other unset cases, carrying 2840 on will result in NOMATCH. 
*/ 2841 2842 if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET) 2843 { 2844 if (mb->ovector[offset] == mb->ovector[offset + 1]) continue; 2845 } 2846 else /* Group is not set */ 2847 { 2848 if (min == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) 2849 continue; 2850 } 2851 2852 /* First, ensure the minimum number of matches are present. We get back 2853 the length of the reference string explicitly rather than passing the 2854 address of eptr, so that eptr can be a register variable. */ 2855 2856 for (i = 1; i <= min; i++) 2857 { 2858 PCRE2_SIZE slength; 2859 int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength); 2860 if (rc != 0) 2861 { 2862 if (rc > 0) eptr = mb->end_subject; /* Partial match */ 2863 CHECK_PARTIAL(); 2864 RRETURN(MATCH_NOMATCH); 2865 } 2866 eptr += slength; 2867 } 2868 2869 /* If min = max, continue at the same level without recursion. 2870 They are not both allowed to be zero. */ 2871 2872 if (min == max) continue; 2873 2874 /* If minimizing, keep trying and advancing the pointer */ 2875 2876 if (minimize) 2877 { 2878 for (fi = min;; fi++) 2879 { 2880 int rc; 2881 PCRE2_SIZE slength; 2882 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM14); 2883 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2884 if (fi >= max) RRETURN(MATCH_NOMATCH); 2885 rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength); 2886 if (rc != 0) 2887 { 2888 if (rc > 0) eptr = mb->end_subject; /* Partial match */ 2889 CHECK_PARTIAL(); 2890 RRETURN(MATCH_NOMATCH); 2891 } 2892 eptr += slength; 2893 } 2894 /* Control never gets here */ 2895 } 2896 2897 /* If maximizing, find the longest string and work backwards, as long as 2898 the matched lengths for each iteration are the same. 
*/ 2899 2900 else 2901 { 2902 BOOL samelengths = TRUE; 2903 pp = eptr; 2904 length = mb->ovector[offset+1] - mb->ovector[offset]; 2905 2906 for (i = min; i < max; i++) 2907 { 2908 PCRE2_SIZE slength; 2909 int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength); 2910 2911 if (rc != 0) 2912 { 2913 /* Can't use CHECK_PARTIAL because we don't want to update eptr in 2914 the soft partial matching case. */ 2915 2916 if (rc > 0 && mb->partial != 0 && 2917 mb->end_subject > mb->start_used_ptr) 2918 { 2919 mb->hitend = TRUE; 2920 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 2921 } 2922 break; 2923 } 2924 2925 if (slength != length) samelengths = FALSE; 2926 eptr += slength; 2927 } 2928 2929 /* If the length matched for each repetition is the same as the length of 2930 the captured group, we can easily work backwards. This is the normal 2931 case. However, in caseless UTF-8 mode there are pairs of case-equivalent 2932 characters whose lengths (in terms of code units) differ. However, this 2933 is very rare, so we handle it by re-matching fewer and fewer times. */ 2934 2935 if (samelengths) 2936 { 2937 while (eptr >= pp) 2938 { 2939 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15); 2940 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2941 eptr -= length; 2942 } 2943 } 2944 2945 /* The rare case of non-matching lengths. Re-scan the repetition for each 2946 iteration. We know that match_ref() will succeed every time. 
*/ 2947 2948 else 2949 { 2950 max = i; 2951 for (;;) 2952 { 2953 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM68); 2954 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2955 if (eptr == pp) break; /* Failed after minimal repetition */ 2956 eptr = pp; 2957 max--; 2958 for (i = min; i < max; i++) 2959 { 2960 PCRE2_SIZE slength; 2961 (void)match_ref(offset, offset_top, eptr, mb, caseless, &slength); 2962 eptr += slength; 2963 } 2964 } 2965 } 2966 2967 RRETURN(MATCH_NOMATCH); 2968 } 2969 /* Control never gets here */ 2970 2971 /* Match a bit-mapped character class, possibly repeatedly. This op code is 2972 used when all the characters in the class have values in the range 0-255, 2973 and either the matching is caseful, or the characters are in the range 2974 0-127 when UTF-8 processing is enabled. The only difference between 2975 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 2976 encountered. 2977 2978 First, look past the end of the item to see if there is repeat information 2979 following. Then obey similar code to character type repeats - written out 2980 again for speed. */ 2981 2982 case OP_NCLASS: 2983 case OP_CLASS: 2984 { 2985 /* The data variable is saved across frames, so the byte map needs to 2986 be stored there. 
*/ 2987 #define BYTE_MAP ((uint8_t *)data) 2988 data = ecode + 1; /* Save for matching */ 2989 ecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */ 2990 2991 switch (*ecode) 2992 { 2993 case OP_CRSTAR: 2994 case OP_CRMINSTAR: 2995 case OP_CRPLUS: 2996 case OP_CRMINPLUS: 2997 case OP_CRQUERY: 2998 case OP_CRMINQUERY: 2999 case OP_CRPOSSTAR: 3000 case OP_CRPOSPLUS: 3001 case OP_CRPOSQUERY: 3002 c = *ecode++ - OP_CRSTAR; 3003 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 3004 else possessive = TRUE; 3005 min = rep_min[c]; /* Pick up values from tables; */ 3006 max = rep_max[c]; /* zero for max => infinity */ 3007 if (max == 0) max = INT_MAX; 3008 break; 3009 3010 case OP_CRRANGE: 3011 case OP_CRMINRANGE: 3012 case OP_CRPOSRANGE: 3013 minimize = (*ecode == OP_CRMINRANGE); 3014 possessive = (*ecode == OP_CRPOSRANGE); 3015 min = GET2(ecode, 1); 3016 max = GET2(ecode, 1 + IMM2_SIZE); 3017 if (max == 0) max = INT_MAX; 3018 ecode += 1 + 2 * IMM2_SIZE; 3019 break; 3020 3021 default: /* No repeat follows */ 3022 min = max = 1; 3023 break; 3024 } 3025 3026 /* First, ensure the minimum number of matches are present. 
*/ 3027 3028 #ifdef SUPPORT_UNICODE 3029 if (utf) 3030 { 3031 for (i = 1; i <= min; i++) 3032 { 3033 if (eptr >= mb->end_subject) 3034 { 3035 SCHECK_PARTIAL(); 3036 RRETURN(MATCH_NOMATCH); 3037 } 3038 GETCHARINC(c, eptr); 3039 if (c > 255) 3040 { 3041 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 3042 } 3043 else 3044 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 3045 } 3046 } 3047 else 3048 #endif 3049 /* Not UTF mode */ 3050 { 3051 for (i = 1; i <= min; i++) 3052 { 3053 if (eptr >= mb->end_subject) 3054 { 3055 SCHECK_PARTIAL(); 3056 RRETURN(MATCH_NOMATCH); 3057 } 3058 c = *eptr++; 3059 #if PCRE2_CODE_UNIT_WIDTH != 8 3060 if (c > 255) 3061 { 3062 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 3063 } 3064 else 3065 #endif 3066 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 3067 } 3068 } 3069 3070 /* If max == min we can continue with the main loop without the 3071 need to recurse. */ 3072 3073 if (min == max) continue; 3074 3075 /* If minimizing, keep testing the rest of the expression and advancing 3076 the pointer while it matches the class. 
*/ 3077 3078 if (minimize) 3079 { 3080 #ifdef SUPPORT_UNICODE 3081 if (utf) 3082 { 3083 for (fi = min;; fi++) 3084 { 3085 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM16); 3086 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3087 if (fi >= max) RRETURN(MATCH_NOMATCH); 3088 if (eptr >= mb->end_subject) 3089 { 3090 SCHECK_PARTIAL(); 3091 RRETURN(MATCH_NOMATCH); 3092 } 3093 GETCHARINC(c, eptr); 3094 if (c > 255) 3095 { 3096 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 3097 } 3098 else 3099 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 3100 } 3101 } 3102 else 3103 #endif 3104 /* Not UTF mode */ 3105 { 3106 for (fi = min;; fi++) 3107 { 3108 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM17); 3109 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3110 if (fi >= max) RRETURN(MATCH_NOMATCH); 3111 if (eptr >= mb->end_subject) 3112 { 3113 SCHECK_PARTIAL(); 3114 RRETURN(MATCH_NOMATCH); 3115 } 3116 c = *eptr++; 3117 #if PCRE2_CODE_UNIT_WIDTH != 8 3118 if (c > 255) 3119 { 3120 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 3121 } 3122 else 3123 #endif 3124 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 3125 } 3126 } 3127 /* Control never gets here */ 3128 } 3129 3130 /* If maximizing, find the longest possible run, then work backwards. 
*/ 3131 3132 else 3133 { 3134 pp = eptr; 3135 3136 #ifdef SUPPORT_UNICODE 3137 if (utf) 3138 { 3139 for (i = min; i < max; i++) 3140 { 3141 int len = 1; 3142 if (eptr >= mb->end_subject) 3143 { 3144 SCHECK_PARTIAL(); 3145 break; 3146 } 3147 GETCHARLEN(c, eptr, len); 3148 if (c > 255) 3149 { 3150 if (op == OP_CLASS) break; 3151 } 3152 else 3153 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3154 eptr += len; 3155 } 3156 3157 if (possessive) continue; /* No backtracking */ 3158 3159 for (;;) 3160 { 3161 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM18); 3162 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3163 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3164 BACKCHAR(eptr); 3165 } 3166 } 3167 else 3168 #endif 3169 /* Not UTF mode */ 3170 { 3171 for (i = min; i < max; i++) 3172 { 3173 if (eptr >= mb->end_subject) 3174 { 3175 SCHECK_PARTIAL(); 3176 break; 3177 } 3178 c = *eptr; 3179 #if PCRE2_CODE_UNIT_WIDTH != 8 3180 if (c > 255) 3181 { 3182 if (op == OP_CLASS) break; 3183 } 3184 else 3185 #endif 3186 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3187 eptr++; 3188 } 3189 3190 if (possessive) continue; /* No backtracking */ 3191 3192 while (eptr >= pp) 3193 { 3194 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM19); 3195 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3196 eptr--; 3197 } 3198 } 3199 3200 RRETURN(MATCH_NOMATCH); 3201 } 3202 #undef BYTE_MAP 3203 } 3204 /* Control never gets here */ 3205 3206 3207 /* Match an extended character class. In the 8-bit library, this opcode is 3208 encountered only when UTF-8 mode mode is supported. In the 16-bit and 3209 32-bit libraries, codepoints greater than 255 may be encountered even when 3210 UTF is not supported. 
*/ 3211 3212 #ifdef SUPPORT_WIDE_CHARS 3213 case OP_XCLASS: 3214 { 3215 data = ecode + 1 + LINK_SIZE; /* Save for matching */ 3216 ecode += GET(ecode, 1); /* Advance past the item */ 3217 3218 switch (*ecode) 3219 { 3220 case OP_CRSTAR: 3221 case OP_CRMINSTAR: 3222 case OP_CRPLUS: 3223 case OP_CRMINPLUS: 3224 case OP_CRQUERY: 3225 case OP_CRMINQUERY: 3226 case OP_CRPOSSTAR: 3227 case OP_CRPOSPLUS: 3228 case OP_CRPOSQUERY: 3229 c = *ecode++ - OP_CRSTAR; 3230 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; 3231 else possessive = TRUE; 3232 min = rep_min[c]; /* Pick up values from tables; */ 3233 max = rep_max[c]; /* zero for max => infinity */ 3234 if (max == 0) max = INT_MAX; 3235 break; 3236 3237 case OP_CRRANGE: 3238 case OP_CRMINRANGE: 3239 case OP_CRPOSRANGE: 3240 minimize = (*ecode == OP_CRMINRANGE); 3241 possessive = (*ecode == OP_CRPOSRANGE); 3242 min = GET2(ecode, 1); 3243 max = GET2(ecode, 1 + IMM2_SIZE); 3244 if (max == 0) max = INT_MAX; 3245 ecode += 1 + 2 * IMM2_SIZE; 3246 break; 3247 3248 default: /* No repeat follows */ 3249 min = max = 1; 3250 break; 3251 } 3252 3253 /* First, ensure the minimum number of matches are present. */ 3254 3255 for (i = 1; i <= min; i++) 3256 { 3257 if (eptr >= mb->end_subject) 3258 { 3259 SCHECK_PARTIAL(); 3260 RRETURN(MATCH_NOMATCH); 3261 } 3262 GETCHARINCTEST(c, eptr); 3263 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3264 } 3265 3266 /* If max == min we can continue with the main loop without the 3267 need to recurse. */ 3268 3269 if (min == max) continue; 3270 3271 /* If minimizing, keep testing the rest of the expression and advancing 3272 the pointer while it matches the class. 
*/ 3273 3274 if (minimize) 3275 { 3276 for (fi = min;; fi++) 3277 { 3278 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM20); 3279 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3280 if (fi >= max) RRETURN(MATCH_NOMATCH); 3281 if (eptr >= mb->end_subject) 3282 { 3283 SCHECK_PARTIAL(); 3284 RRETURN(MATCH_NOMATCH); 3285 } 3286 GETCHARINCTEST(c, eptr); 3287 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3288 } 3289 /* Control never gets here */ 3290 } 3291 3292 /* If maximizing, find the longest possible run, then work backwards. */ 3293 3294 else 3295 { 3296 pp = eptr; 3297 for (i = min; i < max; i++) 3298 { 3299 int len = 1; 3300 if (eptr >= mb->end_subject) 3301 { 3302 SCHECK_PARTIAL(); 3303 break; 3304 } 3305 #ifdef SUPPORT_UNICODE 3306 GETCHARLENTEST(c, eptr, len); 3307 #else 3308 c = *eptr; 3309 #endif 3310 if (!PRIV(xclass)(c, data, utf)) break; 3311 eptr += len; 3312 } 3313 3314 if (possessive) continue; /* No backtracking */ 3315 3316 for(;;) 3317 { 3318 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM21); 3319 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3320 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3321 #ifdef SUPPORT_UNICODE 3322 if (utf) BACKCHAR(eptr); 3323 #endif 3324 } 3325 RRETURN(MATCH_NOMATCH); 3326 } 3327 3328 /* Control never gets here */ 3329 } 3330 #endif /* End of XCLASS */ 3331 3332 /* Match a single character, casefully */ 3333 3334 case OP_CHAR: 3335 #ifdef SUPPORT_UNICODE 3336 if (utf) 3337 { 3338 length = 1; 3339 ecode++; 3340 GETCHARLEN(fc, ecode, length); 3341 if (length > (PCRE2_SIZE)(mb->end_subject - eptr)) 3342 { 3343 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 3344 RRETURN(MATCH_NOMATCH); 3345 } 3346 for (; length > 0; length--) 3347 { 3348 if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); 3349 } 3350 } 3351 else 3352 #endif 3353 /* Not UTF mode */ 3354 { 3355 if (mb->end_subject - eptr < 1) 3356 { 3357 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 3358 RRETURN(MATCH_NOMATCH); 3359 } 3360 if (ecode[1] 
!= *eptr++) RRETURN(MATCH_NOMATCH); 3361 ecode += 2; 3362 } 3363 break; 3364 3365 /* Match a single character, caselessly. If we are at the end of the 3366 subject, give up immediately. */ 3367 3368 case OP_CHARI: 3369 if (eptr >= mb->end_subject) 3370 { 3371 SCHECK_PARTIAL(); 3372 RRETURN(MATCH_NOMATCH); 3373 } 3374 3375 #ifdef SUPPORT_UNICODE 3376 if (utf) 3377 { 3378 length = 1; 3379 ecode++; 3380 GETCHARLEN(fc, ecode, length); 3381 3382 /* If the pattern character's value is < 128, we have only one byte, and 3383 we know that its other case must also be one byte long, so we can use the 3384 fast lookup table. We know that there is at least one byte left in the 3385 subject. */ 3386 3387 if (fc < 128) 3388 { 3389 uint32_t cc = UCHAR21(eptr); 3390 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); 3391 ecode++; 3392 eptr++; 3393 } 3394 3395 /* Otherwise we must pick up the subject character. Note that we cannot 3396 use the value of "length" to check for sufficient bytes left, because the 3397 other case of the character may have more or fewer bytes. */ 3398 3399 else 3400 { 3401 uint32_t dc; 3402 GETCHARINC(dc, eptr); 3403 ecode += length; 3404 3405 /* If we have Unicode property support, we can use it to test the other 3406 case of the character, if there is one. */ 3407 3408 if (fc != dc) 3409 { 3410 #ifdef SUPPORT_UNICODE 3411 if (dc != UCD_OTHERCASE(fc)) 3412 #endif 3413 RRETURN(MATCH_NOMATCH); 3414 } 3415 } 3416 } 3417 else 3418 #endif /* SUPPORT_UNICODE */ 3419 3420 /* Not UTF mode */ 3421 { 3422 if (TABLE_GET(ecode[1], mb->lcc, ecode[1]) 3423 != TABLE_GET(*eptr, mb->lcc, *eptr)) RRETURN(MATCH_NOMATCH); 3424 eptr++; 3425 ecode += 2; 3426 } 3427 break; 3428 3429 /* Match a single character repeatedly. 
*/ 3430 3431 case OP_EXACT: 3432 case OP_EXACTI: 3433 min = max = GET2(ecode, 1); 3434 ecode += 1 + IMM2_SIZE; 3435 goto REPEATCHAR; 3436 3437 case OP_POSUPTO: 3438 case OP_POSUPTOI: 3439 possessive = TRUE; 3440 /* Fall through */ 3441 3442 case OP_UPTO: 3443 case OP_UPTOI: 3444 case OP_MINUPTO: 3445 case OP_MINUPTOI: 3446 min = 0; 3447 max = GET2(ecode, 1); 3448 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI; 3449 ecode += 1 + IMM2_SIZE; 3450 goto REPEATCHAR; 3451 3452 case OP_POSSTAR: 3453 case OP_POSSTARI: 3454 possessive = TRUE; 3455 min = 0; 3456 max = INT_MAX; 3457 ecode++; 3458 goto REPEATCHAR; 3459 3460 case OP_POSPLUS: 3461 case OP_POSPLUSI: 3462 possessive = TRUE; 3463 min = 1; 3464 max = INT_MAX; 3465 ecode++; 3466 goto REPEATCHAR; 3467 3468 case OP_POSQUERY: 3469 case OP_POSQUERYI: 3470 possessive = TRUE; 3471 min = 0; 3472 max = 1; 3473 ecode++; 3474 goto REPEATCHAR; 3475 3476 case OP_STAR: 3477 case OP_STARI: 3478 case OP_MINSTAR: 3479 case OP_MINSTARI: 3480 case OP_PLUS: 3481 case OP_PLUSI: 3482 case OP_MINPLUS: 3483 case OP_MINPLUSI: 3484 case OP_QUERY: 3485 case OP_QUERYI: 3486 case OP_MINQUERY: 3487 case OP_MINQUERYI: 3488 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI); 3489 minimize = (c & 1) != 0; 3490 min = rep_min[c]; /* Pick up values from tables; */ 3491 max = rep_max[c]; /* zero for max => infinity */ 3492 if (max == 0) max = INT_MAX; 3493 3494 /* Common code for all repeated single-character matches. We first check 3495 for the minimum number of characters. If the minimum equals the maximum, we 3496 are done. Otherwise, if minimizing, check the rest of the pattern for a 3497 match; if there isn't one, advance up to the maximum, one character at a 3498 time. 3499 3500 If maximizing, advance up to the maximum number of matching characters, 3501 until eptr is past the end of the maximum run. If possessive, we are 3502 then done (no backing up). 
Otherwise, match at this position; anything 3503 other than no match is immediately returned. For nomatch, back up one 3504 character, unless we are matching \R and the last thing matched was 3505 \r\n, in which case, back up two bytes. When we reach the first optional 3506 character position, we can save stack by doing a tail recurse. 3507 3508 The various UTF/non-UTF and caseful/caseless cases are handled separately, 3509 for speed. */ 3510 3511 REPEATCHAR: 3512 #ifdef SUPPORT_UNICODE 3513 if (utf) 3514 { 3515 length = 1; 3516 charptr = ecode; 3517 GETCHARLEN(fc, ecode, length); 3518 ecode += length; 3519 3520 /* Handle multibyte character matching specially here. There is 3521 support for caseless matching if UCP support is present. */ 3522 3523 if (length > 1) 3524 { 3525 uint32_t othercase; 3526 if (op >= OP_STARI && /* Caseless */ 3527 (othercase = UCD_OTHERCASE(fc)) != fc) 3528 oclength = PRIV(ord2utf)(othercase, occhars); 3529 else oclength = 0; 3530 3531 for (i = 1; i <= min; i++) 3532 { 3533 if (eptr <= mb->end_subject - length && 3534 memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length; 3535 else if (oclength > 0 && 3536 eptr <= mb->end_subject - oclength && 3537 memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength; 3538 else 3539 { 3540 CHECK_PARTIAL(); 3541 RRETURN(MATCH_NOMATCH); 3542 } 3543 } 3544 3545 if (min == max) continue; 3546 3547 if (minimize) 3548 { 3549 for (fi = min;; fi++) 3550 { 3551 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM22); 3552 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3553 if (fi >= max) RRETURN(MATCH_NOMATCH); 3554 if (eptr <= mb->end_subject - length && 3555 memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length; 3556 else if (oclength > 0 && 3557 eptr <= mb->end_subject - oclength && 3558 memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength; 3559 else 3560 { 3561 CHECK_PARTIAL(); 3562 RRETURN(MATCH_NOMATCH); 3563 } 3564 } 3565 /* Control never gets here */ 3566 } 3567 3568 else /* 
Maximize */ 3569 { 3570 pp = eptr; 3571 for (i = min; i < max; i++) 3572 { 3573 if (eptr <= mb->end_subject - length && 3574 memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length; 3575 else if (oclength > 0 && 3576 eptr <= mb->end_subject - oclength && 3577 memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength; 3578 else 3579 { 3580 CHECK_PARTIAL(); 3581 break; 3582 } 3583 } 3584 3585 if (possessive) continue; /* No backtracking */ 3586 3587 /* After \C in UTF mode, pp might be in the middle of a Unicode 3588 character. Use <= pp to ensure backtracking doesn't go too far. */ 3589 3590 for(;;) 3591 { 3592 if (eptr <= pp) goto TAIL_RECURSE; 3593 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM23); 3594 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3595 eptr--; 3596 BACKCHAR(eptr); 3597 } 3598 } 3599 /* Control never gets here */ 3600 } 3601 3602 /* If the length of a UTF-8 character is 1, we fall through here, and 3603 obey the code as for non-UTF-8 characters below, though in this case the 3604 value of fc will always be < 128. */ 3605 } 3606 else 3607 #endif /* SUPPORT_UNICODE */ 3608 3609 /* When not in UTF-8 mode, load a single-byte character. */ 3610 fc = *ecode++; 3611 3612 /* The value of fc at this point is always one character, though we may 3613 or may not be in UTF mode. The code is duplicated for the caseless and 3614 caseful cases, for speed, since matching characters is likely to be quite 3615 common. First, ensure the minimum number of matches are present. If min = 3616 max, continue at the same level without recursing. Otherwise, if 3617 minimizing, keep trying the rest of the expression and advancing one 3618 matching character if failing, up to the maximum. Alternatively, if 3619 maximizing, find the maximum number of characters and work backwards. */ 3620 3621 if (op >= OP_STARI) /* Caseless */ 3622 { 3623 #if PCRE2_CODE_UNIT_WIDTH == 8 3624 /* fc must be < 128 if UTF is enabled. 
*/ 3625 foc = mb->fcc[fc]; 3626 #else 3627 #ifdef SUPPORT_UNICODE 3628 if (utf && fc > 127) 3629 foc = UCD_OTHERCASE(fc); 3630 else 3631 #endif /* SUPPORT_UNICODE */ 3632 foc = TABLE_GET(fc, mb->fcc, fc); 3633 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ 3634 3635 for (i = 1; i <= min; i++) 3636 { 3637 uint32_t cc; /* Faster than PCRE2_UCHAR */ 3638 if (eptr >= mb->end_subject) 3639 { 3640 SCHECK_PARTIAL(); 3641 RRETURN(MATCH_NOMATCH); 3642 } 3643 cc = UCHAR21TEST(eptr); 3644 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3645 eptr++; 3646 } 3647 if (min == max) continue; 3648 if (minimize) 3649 { 3650 for (fi = min;; fi++) 3651 { 3652 uint32_t cc; /* Faster than PCRE2_UCHAR */ 3653 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM24); 3654 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3655 if (fi >= max) RRETURN(MATCH_NOMATCH); 3656 if (eptr >= mb->end_subject) 3657 { 3658 SCHECK_PARTIAL(); 3659 RRETURN(MATCH_NOMATCH); 3660 } 3661 cc = UCHAR21TEST(eptr); 3662 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); 3663 eptr++; 3664 } 3665 /* Control never gets here */ 3666 } 3667 else /* Maximize */ 3668 { 3669 pp = eptr; 3670 for (i = min; i < max; i++) 3671 { 3672 uint32_t cc; /* Faster than PCRE2_UCHAR */ 3673 if (eptr >= mb->end_subject) 3674 { 3675 SCHECK_PARTIAL(); 3676 break; 3677 } 3678 cc = UCHAR21TEST(eptr); 3679 if (fc != cc && foc != cc) break; 3680 eptr++; 3681 } 3682 if (possessive) continue; /* No backtracking */ 3683 for (;;) 3684 { 3685 if (eptr == pp) goto TAIL_RECURSE; 3686 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM25); 3687 eptr--; 3688 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3689 } 3690 /* Control never gets here */ 3691 } 3692 } 3693 3694 /* Caseful comparisons (includes all multi-byte characters) */ 3695 3696 else 3697 { 3698 for (i = 1; i <= min; i++) 3699 { 3700 if (eptr >= mb->end_subject) 3701 { 3702 SCHECK_PARTIAL(); 3703 RRETURN(MATCH_NOMATCH); 3704 } 3705 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3706 } 3707 3708 if (min == max) 
continue; 3709 3710 if (minimize) 3711 { 3712 for (fi = min;; fi++) 3713 { 3714 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM26); 3715 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3716 if (fi >= max) RRETURN(MATCH_NOMATCH); 3717 if (eptr >= mb->end_subject) 3718 { 3719 SCHECK_PARTIAL(); 3720 RRETURN(MATCH_NOMATCH); 3721 } 3722 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); 3723 } 3724 /* Control never gets here */ 3725 } 3726 else /* Maximize */ 3727 { 3728 pp = eptr; 3729 for (i = min; i < max; i++) 3730 { 3731 if (eptr >= mb->end_subject) 3732 { 3733 SCHECK_PARTIAL(); 3734 break; 3735 } 3736 if (fc != UCHAR21TEST(eptr)) break; 3737 eptr++; 3738 } 3739 if (possessive) continue; /* No backtracking */ 3740 for (;;) 3741 { 3742 if (eptr == pp) goto TAIL_RECURSE; 3743 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM27); 3744 eptr--; 3745 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3746 } 3747 /* Control never gets here */ 3748 } 3749 } 3750 /* Control never gets here */ 3751 3752 /* Match a negated single one-byte character. The character we are 3753 checking can be multibyte. */ 3754 3755 case OP_NOT: 3756 case OP_NOTI: 3757 if (eptr >= mb->end_subject) 3758 { 3759 SCHECK_PARTIAL(); 3760 RRETURN(MATCH_NOMATCH); 3761 } 3762 #ifdef SUPPORT_UNICODE 3763 if (utf) 3764 { 3765 register uint32_t ch, och; 3766 3767 ecode++; 3768 GETCHARINC(ch, ecode); 3769 GETCHARINC(c, eptr); 3770 3771 if (op == OP_NOT) 3772 { 3773 if (ch == c) RRETURN(MATCH_NOMATCH); 3774 } 3775 else 3776 { 3777 if (ch > 127) 3778 och = UCD_OTHERCASE(ch); 3779 else 3780 och = TABLE_GET(ch, mb->fcc, ch); 3781 if (ch == c || och == c) RRETURN(MATCH_NOMATCH); 3782 } 3783 } 3784 else 3785 #endif /* SUPPORT_UNICODE */ 3786 { 3787 register uint32_t ch = ecode[1]; 3788 c = *eptr++; 3789 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == c)) 3790 RRETURN(MATCH_NOMATCH); 3791 ecode += 2; 3792 } 3793 break; 3794 3795 /* Match a negated single one-byte character repeatedly. 
This is almost a 3796 repeat of the code for a repeated single character, but I haven't found a 3797 nice way of commoning these up that doesn't require a test of the 3798 positive/negative option for each character match. Maybe that wouldn't add 3799 very much to the time taken, but character matching *is* what this is all 3800 about... */ 3801 3802 case OP_NOTEXACT: 3803 case OP_NOTEXACTI: 3804 min = max = GET2(ecode, 1); 3805 ecode += 1 + IMM2_SIZE; 3806 goto REPEATNOTCHAR; 3807 3808 case OP_NOTUPTO: 3809 case OP_NOTUPTOI: 3810 case OP_NOTMINUPTO: 3811 case OP_NOTMINUPTOI: 3812 min = 0; 3813 max = GET2(ecode, 1); 3814 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI; 3815 ecode += 1 + IMM2_SIZE; 3816 goto REPEATNOTCHAR; 3817 3818 case OP_NOTPOSSTAR: 3819 case OP_NOTPOSSTARI: 3820 possessive = TRUE; 3821 min = 0; 3822 max = INT_MAX; 3823 ecode++; 3824 goto REPEATNOTCHAR; 3825 3826 case OP_NOTPOSPLUS: 3827 case OP_NOTPOSPLUSI: 3828 possessive = TRUE; 3829 min = 1; 3830 max = INT_MAX; 3831 ecode++; 3832 goto REPEATNOTCHAR; 3833 3834 case OP_NOTPOSQUERY: 3835 case OP_NOTPOSQUERYI: 3836 possessive = TRUE; 3837 min = 0; 3838 max = 1; 3839 ecode++; 3840 goto REPEATNOTCHAR; 3841 3842 case OP_NOTPOSUPTO: 3843 case OP_NOTPOSUPTOI: 3844 possessive = TRUE; 3845 min = 0; 3846 max = GET2(ecode, 1); 3847 ecode += 1 + IMM2_SIZE; 3848 goto REPEATNOTCHAR; 3849 3850 case OP_NOTSTAR: 3851 case OP_NOTSTARI: 3852 case OP_NOTMINSTAR: 3853 case OP_NOTMINSTARI: 3854 case OP_NOTPLUS: 3855 case OP_NOTPLUSI: 3856 case OP_NOTMINPLUS: 3857 case OP_NOTMINPLUSI: 3858 case OP_NOTQUERY: 3859 case OP_NOTQUERYI: 3860 case OP_NOTMINQUERY: 3861 case OP_NOTMINQUERYI: 3862 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); 3863 minimize = (c & 1) != 0; 3864 min = rep_min[c]; /* Pick up values from tables; */ 3865 max = rep_max[c]; /* zero for max => infinity */ 3866 if (max == 0) max = INT_MAX; 3867 3868 /* Common code for all repeated single-byte matches. 
*/ 3869 3870 REPEATNOTCHAR: 3871 GETCHARINCTEST(fc, ecode); 3872 3873 /* The code is duplicated for the caseless and caseful cases, for speed, 3874 since matching characters is likely to be quite common. First, ensure the 3875 minimum number of matches are present. If min = max, continue at the same 3876 level without recursing. Otherwise, if minimizing, keep trying the rest of 3877 the expression and advancing one matching character if failing, up to the 3878 maximum. Alternatively, if maximizing, find the maximum number of 3879 characters and work backwards. */ 3880 3881 if (op >= OP_NOTSTARI) /* Caseless */ 3882 { 3883 #ifdef SUPPORT_UNICODE 3884 if (utf && fc > 127) 3885 foc = UCD_OTHERCASE(fc); 3886 else 3887 #endif /* SUPPORT_UNICODE */ 3888 foc = TABLE_GET(fc, mb->fcc, fc); 3889 3890 #ifdef SUPPORT_UNICODE 3891 if (utf) 3892 { 3893 register uint32_t d; 3894 for (i = 1; i <= min; i++) 3895 { 3896 if (eptr >= mb->end_subject) 3897 { 3898 SCHECK_PARTIAL(); 3899 RRETURN(MATCH_NOMATCH); 3900 } 3901 GETCHARINC(d, eptr); 3902 if (fc == d || (uint32_t)foc == d) RRETURN(MATCH_NOMATCH); 3903 } 3904 } 3905 else 3906 #endif /* SUPPORT_UNICODE */ 3907 /* Not UTF mode */ 3908 { 3909 for (i = 1; i <= min; i++) 3910 { 3911 if (eptr >= mb->end_subject) 3912 { 3913 SCHECK_PARTIAL(); 3914 RRETURN(MATCH_NOMATCH); 3915 } 3916 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3917 eptr++; 3918 } 3919 } 3920 3921 if (min == max) continue; 3922 3923 if (minimize) 3924 { 3925 #ifdef SUPPORT_UNICODE 3926 if (utf) 3927 { 3928 register uint32_t d; 3929 for (fi = min;; fi++) 3930 { 3931 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM28); 3932 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3933 if (fi >= max) RRETURN(MATCH_NOMATCH); 3934 if (eptr >= mb->end_subject) 3935 { 3936 SCHECK_PARTIAL(); 3937 RRETURN(MATCH_NOMATCH); 3938 } 3939 GETCHARINC(d, eptr); 3940 if (fc == d || (uint32_t)foc == d) RRETURN(MATCH_NOMATCH); 3941 } 3942 } 3943 else 3944 #endif /*SUPPORT_UNICODE */ 3945 /* Not 
UTF mode */ 3946 { 3947 for (fi = min;; fi++) 3948 { 3949 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM29); 3950 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3951 if (fi >= max) RRETURN(MATCH_NOMATCH); 3952 if (eptr >= mb->end_subject) 3953 { 3954 SCHECK_PARTIAL(); 3955 RRETURN(MATCH_NOMATCH); 3956 } 3957 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3958 eptr++; 3959 } 3960 } 3961 /* Control never gets here */ 3962 } 3963 3964 /* Maximize case */ 3965 3966 else 3967 { 3968 pp = eptr; 3969 3970 #ifdef SUPPORT_UNICODE 3971 if (utf) 3972 { 3973 register uint32_t d; 3974 for (i = min; i < max; i++) 3975 { 3976 int len = 1; 3977 if (eptr >= mb->end_subject) 3978 { 3979 SCHECK_PARTIAL(); 3980 break; 3981 } 3982 GETCHARLEN(d, eptr, len); 3983 if (fc == d || (uint32_t)foc == d) break; 3984 eptr += len; 3985 } 3986 if (possessive) continue; /* No backtracking */ 3987 3988 /* After \C in UTF mode, pp might be in the middle of a Unicode 3989 character. Use <= pp to ensure backtracking doesn't go too far. 
*/ 3990 3991 for(;;) 3992 { 3993 if (eptr <= pp) goto TAIL_RECURSE; 3994 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM30); 3995 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3996 eptr--; 3997 BACKCHAR(eptr); 3998 } 3999 } 4000 else 4001 #endif /* SUPPORT_UNICODE */ 4002 /* Not UTF mode */ 4003 { 4004 for (i = min; i < max; i++) 4005 { 4006 if (eptr >= mb->end_subject) 4007 { 4008 SCHECK_PARTIAL(); 4009 break; 4010 } 4011 if (fc == *eptr || foc == *eptr) break; 4012 eptr++; 4013 } 4014 if (possessive) continue; /* No backtracking */ 4015 for (;;) 4016 { 4017 if (eptr == pp) goto TAIL_RECURSE; 4018 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM31); 4019 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4020 eptr--; 4021 } 4022 } 4023 /* Control never gets here */ 4024 } 4025 } 4026 4027 /* Caseful comparisons */ 4028 4029 else 4030 { 4031 #ifdef SUPPORT_UNICODE 4032 if (utf) 4033 { 4034 register uint32_t d; 4035 for (i = 1; i <= min; i++) 4036 { 4037 if (eptr >= mb->end_subject) 4038 { 4039 SCHECK_PARTIAL(); 4040 RRETURN(MATCH_NOMATCH); 4041 } 4042 GETCHARINC(d, eptr); 4043 if (fc == d) RRETURN(MATCH_NOMATCH); 4044 } 4045 } 4046 else 4047 #endif 4048 /* Not UTF mode */ 4049 { 4050 for (i = 1; i <= min; i++) 4051 { 4052 if (eptr >= mb->end_subject) 4053 { 4054 SCHECK_PARTIAL(); 4055 RRETURN(MATCH_NOMATCH); 4056 } 4057 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 4058 } 4059 } 4060 4061 if (min == max) continue; 4062 4063 if (minimize) 4064 { 4065 #ifdef SUPPORT_UNICODE 4066 if (utf) 4067 { 4068 register uint32_t d; 4069 for (fi = min;; fi++) 4070 { 4071 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM32); 4072 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4073 if (fi >= max) RRETURN(MATCH_NOMATCH); 4074 if (eptr >= mb->end_subject) 4075 { 4076 SCHECK_PARTIAL(); 4077 RRETURN(MATCH_NOMATCH); 4078 } 4079 GETCHARINC(d, eptr); 4080 if (fc == d) RRETURN(MATCH_NOMATCH); 4081 } 4082 } 4083 else 4084 #endif 4085 /* Not UTF mode */ 4086 { 4087 for (fi = min;; fi++) 4088 { 4089 RMATCH(eptr, ecode, 
offset_top, mb, eptrb, RM33); 4090 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4091 if (fi >= max) RRETURN(MATCH_NOMATCH); 4092 if (eptr >= mb->end_subject) 4093 { 4094 SCHECK_PARTIAL(); 4095 RRETURN(MATCH_NOMATCH); 4096 } 4097 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 4098 } 4099 } 4100 /* Control never gets here */ 4101 } 4102 4103 /* Maximize case */ 4104 4105 else 4106 { 4107 pp = eptr; 4108 4109 #ifdef SUPPORT_UNICODE 4110 if (utf) 4111 { 4112 register uint32_t d; 4113 for (i = min; i < max; i++) 4114 { 4115 int len = 1; 4116 if (eptr >= mb->end_subject) 4117 { 4118 SCHECK_PARTIAL(); 4119 break; 4120 } 4121 GETCHARLEN(d, eptr, len); 4122 if (fc == d) break; 4123 eptr += len; 4124 } 4125 if (possessive) continue; /* No backtracking */ 4126 4127 /* After \C in UTF mode, pp might be in the middle of a Unicode 4128 character. Use <= pp to ensure backtracking doesn't go too far. */ 4129 4130 for(;;) 4131 { 4132 if (eptr <= pp) goto TAIL_RECURSE; 4133 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM34); 4134 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4135 eptr--; 4136 BACKCHAR(eptr); 4137 } 4138 } 4139 else 4140 #endif 4141 /* Not UTF mode */ 4142 { 4143 for (i = min; i < max; i++) 4144 { 4145 if (eptr >= mb->end_subject) 4146 { 4147 SCHECK_PARTIAL(); 4148 break; 4149 } 4150 if (fc == *eptr) break; 4151 eptr++; 4152 } 4153 if (possessive) continue; /* No backtracking */ 4154 for (;;) 4155 { 4156 if (eptr == pp) goto TAIL_RECURSE; 4157 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM35); 4158 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4159 eptr--; 4160 } 4161 } 4162 /* Control never gets here */ 4163 } 4164 } 4165 /* Control never gets here */ 4166 4167 /* Match a single character type repeatedly; several different opcodes 4168 share code. This is very similar to the code for single characters, but we 4169 repeat it in the interests of efficiency. 
*/ 4170 4171 case OP_TYPEEXACT: 4172 min = max = GET2(ecode, 1); 4173 minimize = TRUE; 4174 ecode += 1 + IMM2_SIZE; 4175 goto REPEATTYPE; 4176 4177 case OP_TYPEUPTO: 4178 case OP_TYPEMINUPTO: 4179 min = 0; 4180 max = GET2(ecode, 1); 4181 minimize = *ecode == OP_TYPEMINUPTO; 4182 ecode += 1 + IMM2_SIZE; 4183 goto REPEATTYPE; 4184 4185 case OP_TYPEPOSSTAR: 4186 possessive = TRUE; 4187 min = 0; 4188 max = INT_MAX; 4189 ecode++; 4190 goto REPEATTYPE; 4191 4192 case OP_TYPEPOSPLUS: 4193 possessive = TRUE; 4194 min = 1; 4195 max = INT_MAX; 4196 ecode++; 4197 goto REPEATTYPE; 4198 4199 case OP_TYPEPOSQUERY: 4200 possessive = TRUE; 4201 min = 0; 4202 max = 1; 4203 ecode++; 4204 goto REPEATTYPE; 4205 4206 case OP_TYPEPOSUPTO: 4207 possessive = TRUE; 4208 min = 0; 4209 max = GET2(ecode, 1); 4210 ecode += 1 + IMM2_SIZE; 4211 goto REPEATTYPE; 4212 4213 case OP_TYPESTAR: 4214 case OP_TYPEMINSTAR: 4215 case OP_TYPEPLUS: 4216 case OP_TYPEMINPLUS: 4217 case OP_TYPEQUERY: 4218 case OP_TYPEMINQUERY: 4219 c = *ecode++ - OP_TYPESTAR; 4220 minimize = (c & 1) != 0; 4221 min = rep_min[c]; /* Pick up values from tables; */ 4222 max = rep_max[c]; /* zero for max => infinity */ 4223 if (max == 0) max = INT_MAX; 4224 4225 /* Common code for all repeated single character type matches. Note that 4226 in UTF-8 mode, '.' matches a character of any length, but for the other 4227 character types, the valid characters are all one-byte long. */ 4228 4229 REPEATTYPE: 4230 ctype = *ecode++; /* Code for the character type */ 4231 4232 #ifdef SUPPORT_UNICODE 4233 if (ctype == OP_PROP || ctype == OP_NOTPROP) 4234 { 4235 prop_fail_result = ctype == OP_NOTPROP; 4236 prop_type = *ecode++; 4237 prop_value = *ecode++; 4238 } 4239 else prop_type = -1; 4240 #endif 4241 4242 /* First, ensure the minimum number of matches are present. Use inline 4243 code for maximizing the speed, and do the type test once at the start 4244 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that 4245 is tidier. 
Also separate the UCP code, which can be the same for both UTF-8 4246 and single-bytes. */ 4247 4248 if (min > 0) 4249 { 4250 #ifdef SUPPORT_UNICODE 4251 if (prop_type >= 0) 4252 { 4253 switch(prop_type) 4254 { 4255 case PT_ANY: 4256 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4257 for (i = 1; i <= min; i++) 4258 { 4259 if (eptr >= mb->end_subject) 4260 { 4261 SCHECK_PARTIAL(); 4262 RRETURN(MATCH_NOMATCH); 4263 } 4264 GETCHARINCTEST(c, eptr); 4265 } 4266 break; 4267 4268 case PT_LAMP: 4269 for (i = 1; i <= min; i++) 4270 { 4271 int chartype; 4272 if (eptr >= mb->end_subject) 4273 { 4274 SCHECK_PARTIAL(); 4275 RRETURN(MATCH_NOMATCH); 4276 } 4277 GETCHARINCTEST(c, eptr); 4278 chartype = UCD_CHARTYPE(c); 4279 if ((chartype == ucp_Lu || 4280 chartype == ucp_Ll || 4281 chartype == ucp_Lt) == prop_fail_result) 4282 RRETURN(MATCH_NOMATCH); 4283 } 4284 break; 4285 4286 case PT_GC: 4287 for (i = 1; i <= min; i++) 4288 { 4289 if (eptr >= mb->end_subject) 4290 { 4291 SCHECK_PARTIAL(); 4292 RRETURN(MATCH_NOMATCH); 4293 } 4294 GETCHARINCTEST(c, eptr); 4295 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4296 RRETURN(MATCH_NOMATCH); 4297 } 4298 break; 4299 4300 case PT_PC: 4301 for (i = 1; i <= min; i++) 4302 { 4303 if (eptr >= mb->end_subject) 4304 { 4305 SCHECK_PARTIAL(); 4306 RRETURN(MATCH_NOMATCH); 4307 } 4308 GETCHARINCTEST(c, eptr); 4309 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4310 RRETURN(MATCH_NOMATCH); 4311 } 4312 break; 4313 4314 case PT_SC: 4315 for (i = 1; i <= min; i++) 4316 { 4317 if (eptr >= mb->end_subject) 4318 { 4319 SCHECK_PARTIAL(); 4320 RRETURN(MATCH_NOMATCH); 4321 } 4322 GETCHARINCTEST(c, eptr); 4323 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4324 RRETURN(MATCH_NOMATCH); 4325 } 4326 break; 4327 4328 case PT_ALNUM: 4329 for (i = 1; i <= min; i++) 4330 { 4331 int category; 4332 if (eptr >= mb->end_subject) 4333 { 4334 SCHECK_PARTIAL(); 4335 RRETURN(MATCH_NOMATCH); 4336 } 4337 GETCHARINCTEST(c, eptr); 4338 category = 
UCD_CATEGORY(c); 4339 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4340 RRETURN(MATCH_NOMATCH); 4341 } 4342 break; 4343 4344 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 4345 which means that Perl space and POSIX space are now identical. PCRE 4346 was changed at release 8.34. */ 4347 4348 case PT_SPACE: /* Perl space */ 4349 case PT_PXSPACE: /* POSIX space */ 4350 for (i = 1; i <= min; i++) 4351 { 4352 if (eptr >= mb->end_subject) 4353 { 4354 SCHECK_PARTIAL(); 4355 RRETURN(MATCH_NOMATCH); 4356 } 4357 GETCHARINCTEST(c, eptr); 4358 switch(c) 4359 { 4360 HSPACE_CASES: 4361 VSPACE_CASES: 4362 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4363 break; 4364 4365 default: 4366 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 4367 RRETURN(MATCH_NOMATCH); 4368 break; 4369 } 4370 } 4371 break; 4372 4373 case PT_WORD: 4374 for (i = 1; i <= min; i++) 4375 { 4376 int category; 4377 if (eptr >= mb->end_subject) 4378 { 4379 SCHECK_PARTIAL(); 4380 RRETURN(MATCH_NOMATCH); 4381 } 4382 GETCHARINCTEST(c, eptr); 4383 category = UCD_CATEGORY(c); 4384 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) 4385 == prop_fail_result) 4386 RRETURN(MATCH_NOMATCH); 4387 } 4388 break; 4389 4390 case PT_CLIST: 4391 for (i = 1; i <= min; i++) 4392 { 4393 const uint32_t *cp; 4394 if (eptr >= mb->end_subject) 4395 { 4396 SCHECK_PARTIAL(); 4397 RRETURN(MATCH_NOMATCH); 4398 } 4399 GETCHARINCTEST(c, eptr); 4400 cp = PRIV(ucd_caseless_sets) + prop_value; 4401 for (;;) 4402 { 4403 if (c < *cp) 4404 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 4405 if (c == *cp++) 4406 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 4407 } 4408 } 4409 break; 4410 4411 case PT_UCNC: 4412 for (i = 1; i <= min; i++) 4413 { 4414 if (eptr >= mb->end_subject) 4415 { 4416 SCHECK_PARTIAL(); 4417 RRETURN(MATCH_NOMATCH); 4418 } 4419 GETCHARINCTEST(c, eptr); 4420 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 4421 c == 
CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 4422 c >= 0xe000) == prop_fail_result) 4423 RRETURN(MATCH_NOMATCH); 4424 } 4425 break; 4426 4427 /* This should not occur */ 4428 4429 default: 4430 RRETURN(PCRE2_ERROR_INTERNAL); 4431 } 4432 } 4433 4434 /* Match extended Unicode sequences. We will get here only if the 4435 support is in the binary; otherwise a compile-time error occurs. */ 4436 4437 else if (ctype == OP_EXTUNI) 4438 { 4439 for (i = 1; i <= min; i++) 4440 { 4441 if (eptr >= mb->end_subject) 4442 { 4443 SCHECK_PARTIAL(); 4444 RRETURN(MATCH_NOMATCH); 4445 } 4446 else 4447 { 4448 int lgb, rgb; 4449 GETCHARINCTEST(c, eptr); 4450 lgb = UCD_GRAPHBREAK(c); 4451 while (eptr < mb->end_subject) 4452 { 4453 int len = 1; 4454 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 4455 rgb = UCD_GRAPHBREAK(c); 4456 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 4457 lgb = rgb; 4458 eptr += len; 4459 } 4460 } 4461 CHECK_PARTIAL(); 4462 } 4463 } 4464 4465 else 4466 #endif /* SUPPORT_UNICODE */ 4467 4468 /* Handle all other cases when the coding is UTF-8 */ 4469 4470 #ifdef SUPPORT_UNICODE 4471 if (utf) switch(ctype) 4472 { 4473 case OP_ANY: 4474 for (i = 1; i <= min; i++) 4475 { 4476 if (eptr >= mb->end_subject) 4477 { 4478 SCHECK_PARTIAL(); 4479 RRETURN(MATCH_NOMATCH); 4480 } 4481 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4482 if (mb->partial != 0 && 4483 eptr + 1 >= mb->end_subject && 4484 NLBLOCK->nltype == NLTYPE_FIXED && 4485 NLBLOCK->nllen == 2 && 4486 UCHAR21(eptr) == NLBLOCK->nl[0]) 4487 { 4488 mb->hitend = TRUE; 4489 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 4490 } 4491 eptr++; 4492 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++); 4493 } 4494 break; 4495 4496 case OP_ALLANY: 4497 for (i = 1; i <= min; i++) 4498 { 4499 if (eptr >= mb->end_subject) 4500 { 4501 SCHECK_PARTIAL(); 4502 RRETURN(MATCH_NOMATCH); 4503 } 4504 eptr++; 4505 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++); 4506 } 4507 break; 4508 4509 case OP_ANYBYTE: 4510 
if (eptr > mb->end_subject - min) RRETURN(MATCH_NOMATCH); 4511 eptr += min; 4512 break; 4513 4514 case OP_ANYNL: 4515 for (i = 1; i <= min; i++) 4516 { 4517 if (eptr >= mb->end_subject) 4518 { 4519 SCHECK_PARTIAL(); 4520 RRETURN(MATCH_NOMATCH); 4521 } 4522 GETCHARINC(c, eptr); 4523 switch(c) 4524 { 4525 default: RRETURN(MATCH_NOMATCH); 4526 4527 case CHAR_CR: 4528 if (eptr < mb->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; 4529 break; 4530 4531 case CHAR_LF: 4532 break; 4533 4534 case CHAR_VT: 4535 case CHAR_FF: 4536 case CHAR_NEL: 4537 #ifndef EBCDIC 4538 case 0x2028: 4539 case 0x2029: 4540 #endif /* Not EBCDIC */ 4541 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 4542 break; 4543 } 4544 } 4545 break; 4546 4547 case OP_NOT_HSPACE: 4548 for (i = 1; i <= min; i++) 4549 { 4550 if (eptr >= mb->end_subject) 4551 { 4552 SCHECK_PARTIAL(); 4553 RRETURN(MATCH_NOMATCH); 4554 } 4555 GETCHARINC(c, eptr); 4556 switch(c) 4557 { 4558 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 4559 default: break; 4560 } 4561 } 4562 break; 4563 4564 case OP_HSPACE: 4565 for (i = 1; i <= min; i++) 4566 { 4567 if (eptr >= mb->end_subject) 4568 { 4569 SCHECK_PARTIAL(); 4570 RRETURN(MATCH_NOMATCH); 4571 } 4572 GETCHARINC(c, eptr); 4573 switch(c) 4574 { 4575 HSPACE_CASES: break; /* Byte and multibyte cases */ 4576 default: RRETURN(MATCH_NOMATCH); 4577 } 4578 } 4579 break; 4580 4581 case OP_NOT_VSPACE: 4582 for (i = 1; i <= min; i++) 4583 { 4584 if (eptr >= mb->end_subject) 4585 { 4586 SCHECK_PARTIAL(); 4587 RRETURN(MATCH_NOMATCH); 4588 } 4589 GETCHARINC(c, eptr); 4590 switch(c) 4591 { 4592 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 4593 default: break; 4594 } 4595 } 4596 break; 4597 4598 case OP_VSPACE: 4599 for (i = 1; i <= min; i++) 4600 { 4601 if (eptr >= mb->end_subject) 4602 { 4603 SCHECK_PARTIAL(); 4604 RRETURN(MATCH_NOMATCH); 4605 } 4606 GETCHARINC(c, eptr); 4607 switch(c) 4608 { 4609 VSPACE_CASES: break; 4610 default: RRETURN(MATCH_NOMATCH); 4611 
} 4612 } 4613 break; 4614 4615 case OP_NOT_DIGIT: 4616 for (i = 1; i <= min; i++) 4617 { 4618 if (eptr >= mb->end_subject) 4619 { 4620 SCHECK_PARTIAL(); 4621 RRETURN(MATCH_NOMATCH); 4622 } 4623 GETCHARINC(c, eptr); 4624 if (c < 128 && (mb->ctypes[c] & ctype_digit) != 0) 4625 RRETURN(MATCH_NOMATCH); 4626 } 4627 break; 4628 4629 case OP_DIGIT: 4630 for (i = 1; i <= min; i++) 4631 { 4632 uint32_t cc; 4633 if (eptr >= mb->end_subject) 4634 { 4635 SCHECK_PARTIAL(); 4636 RRETURN(MATCH_NOMATCH); 4637 } 4638 cc = UCHAR21(eptr); 4639 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) 4640 RRETURN(MATCH_NOMATCH); 4641 eptr++; 4642 /* No need to skip more bytes - we know it's a 1-byte character */ 4643 } 4644 break; 4645 4646 case OP_NOT_WHITESPACE: 4647 for (i = 1; i <= min; i++) 4648 { 4649 uint32_t cc; 4650 if (eptr >= mb->end_subject) 4651 { 4652 SCHECK_PARTIAL(); 4653 RRETURN(MATCH_NOMATCH); 4654 } 4655 cc = UCHAR21(eptr); 4656 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) 4657 RRETURN(MATCH_NOMATCH); 4658 eptr++; 4659 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++); 4660 } 4661 break; 4662 4663 case OP_WHITESPACE: 4664 for (i = 1; i <= min; i++) 4665 { 4666 uint32_t cc; 4667 if (eptr >= mb->end_subject) 4668 { 4669 SCHECK_PARTIAL(); 4670 RRETURN(MATCH_NOMATCH); 4671 } 4672 cc = UCHAR21(eptr); 4673 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) 4674 RRETURN(MATCH_NOMATCH); 4675 eptr++; 4676 /* No need to skip more bytes - we know it's a 1-byte character */ 4677 } 4678 break; 4679 4680 case OP_NOT_WORDCHAR: 4681 for (i = 1; i <= min; i++) 4682 { 4683 uint32_t cc; 4684 if (eptr >= mb->end_subject) 4685 { 4686 SCHECK_PARTIAL(); 4687 RRETURN(MATCH_NOMATCH); 4688 } 4689 cc = UCHAR21(eptr); 4690 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) 4691 RRETURN(MATCH_NOMATCH); 4692 eptr++; 4693 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++); 4694 } 4695 break; 4696 4697 case OP_WORDCHAR: 4698 for (i = 1; i <= min; i++) 4699 { 4700 uint32_t cc; 4701 if (eptr 
>= mb->end_subject) 4702 { 4703 SCHECK_PARTIAL(); 4704 RRETURN(MATCH_NOMATCH); 4705 } 4706 cc = UCHAR21(eptr); 4707 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) 4708 RRETURN(MATCH_NOMATCH); 4709 eptr++; 4710 /* No need to skip more bytes - we know it's a 1-byte character */ 4711 } 4712 break; 4713 4714 default: 4715 RRETURN(PCRE2_ERROR_INTERNAL); 4716 } /* End switch(ctype) */ 4717 4718 else 4719 #endif /* SUPPORT_UNICODE */ 4720 4721 /* Code for the non-UTF-8 case for minimum matching of operators other 4722 than OP_PROP and OP_NOTPROP. */ 4723 4724 switch(ctype) 4725 { 4726 case OP_ANY: 4727 for (i = 1; i <= min; i++) 4728 { 4729 if (eptr >= mb->end_subject) 4730 { 4731 SCHECK_PARTIAL(); 4732 RRETURN(MATCH_NOMATCH); 4733 } 4734 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4735 if (mb->partial != 0 && 4736 eptr + 1 >= mb->end_subject && 4737 NLBLOCK->nltype == NLTYPE_FIXED && 4738 NLBLOCK->nllen == 2 && 4739 *eptr == NLBLOCK->nl[0]) 4740 { 4741 mb->hitend = TRUE; 4742 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 4743 } 4744 eptr++; 4745 } 4746 break; 4747 4748 case OP_ALLANY: 4749 if (eptr > mb->end_subject - min) 4750 { 4751 SCHECK_PARTIAL(); 4752 RRETURN(MATCH_NOMATCH); 4753 } 4754 eptr += min; 4755 break; 4756 4757 case OP_ANYBYTE: 4758 if (eptr > mb->end_subject - min) 4759 { 4760 SCHECK_PARTIAL(); 4761 RRETURN(MATCH_NOMATCH); 4762 } 4763 eptr += min; 4764 break; 4765 4766 case OP_ANYNL: 4767 for (i = 1; i <= min; i++) 4768 { 4769 if (eptr >= mb->end_subject) 4770 { 4771 SCHECK_PARTIAL(); 4772 RRETURN(MATCH_NOMATCH); 4773 } 4774 switch(*eptr++) 4775 { 4776 default: RRETURN(MATCH_NOMATCH); 4777 4778 case CHAR_CR: 4779 if (eptr < mb->end_subject && *eptr == CHAR_LF) eptr++; 4780 break; 4781 4782 case CHAR_LF: 4783 break; 4784 4785 case CHAR_VT: 4786 case CHAR_FF: 4787 case CHAR_NEL: 4788 #if PCRE2_CODE_UNIT_WIDTH != 8 4789 case 0x2028: 4790 case 0x2029: 4791 #endif 4792 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 4793 
break; 4794 } 4795 } 4796 break; 4797 4798 case OP_NOT_HSPACE: 4799 for (i = 1; i <= min; i++) 4800 { 4801 if (eptr >= mb->end_subject) 4802 { 4803 SCHECK_PARTIAL(); 4804 RRETURN(MATCH_NOMATCH); 4805 } 4806 switch(*eptr++) 4807 { 4808 default: break; 4809 HSPACE_BYTE_CASES: 4810 #if PCRE2_CODE_UNIT_WIDTH != 8 4811 HSPACE_MULTIBYTE_CASES: 4812 #endif 4813 RRETURN(MATCH_NOMATCH); 4814 } 4815 } 4816 break; 4817 4818 case OP_HSPACE: 4819 for (i = 1; i <= min; i++) 4820 { 4821 if (eptr >= mb->end_subject) 4822 { 4823 SCHECK_PARTIAL(); 4824 RRETURN(MATCH_NOMATCH); 4825 } 4826 switch(*eptr++) 4827 { 4828 default: RRETURN(MATCH_NOMATCH); 4829 HSPACE_BYTE_CASES: 4830 #if PCRE2_CODE_UNIT_WIDTH != 8 4831 HSPACE_MULTIBYTE_CASES: 4832 #endif 4833 break; 4834 } 4835 } 4836 break; 4837 4838 case OP_NOT_VSPACE: 4839 for (i = 1; i <= min; i++) 4840 { 4841 if (eptr >= mb->end_subject) 4842 { 4843 SCHECK_PARTIAL(); 4844 RRETURN(MATCH_NOMATCH); 4845 } 4846 switch(*eptr++) 4847 { 4848 VSPACE_BYTE_CASES: 4849 #if PCRE2_CODE_UNIT_WIDTH != 8 4850 VSPACE_MULTIBYTE_CASES: 4851 #endif 4852 RRETURN(MATCH_NOMATCH); 4853 default: break; 4854 } 4855 } 4856 break; 4857 4858 case OP_VSPACE: 4859 for (i = 1; i <= min; i++) 4860 { 4861 if (eptr >= mb->end_subject) 4862 { 4863 SCHECK_PARTIAL(); 4864 RRETURN(MATCH_NOMATCH); 4865 } 4866 switch(*eptr++) 4867 { 4868 default: RRETURN(MATCH_NOMATCH); 4869 VSPACE_BYTE_CASES: 4870 #if PCRE2_CODE_UNIT_WIDTH != 8 4871 VSPACE_MULTIBYTE_CASES: 4872 #endif 4873 break; 4874 } 4875 } 4876 break; 4877 4878 case OP_NOT_DIGIT: 4879 for (i = 1; i <= min; i++) 4880 { 4881 if (eptr >= mb->end_subject) 4882 { 4883 SCHECK_PARTIAL(); 4884 RRETURN(MATCH_NOMATCH); 4885 } 4886 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_digit) != 0) 4887 RRETURN(MATCH_NOMATCH); 4888 eptr++; 4889 } 4890 break; 4891 4892 case OP_DIGIT: 4893 for (i = 1; i <= min; i++) 4894 { 4895 if (eptr >= mb->end_subject) 4896 { 4897 SCHECK_PARTIAL(); 4898 RRETURN(MATCH_NOMATCH); 4899 } 4900 if 
(!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_digit) == 0) 4901 RRETURN(MATCH_NOMATCH); 4902 eptr++; 4903 } 4904 break; 4905 4906 case OP_NOT_WHITESPACE: 4907 for (i = 1; i <= min; i++) 4908 { 4909 if (eptr >= mb->end_subject) 4910 { 4911 SCHECK_PARTIAL(); 4912 RRETURN(MATCH_NOMATCH); 4913 } 4914 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_space) != 0) 4915 RRETURN(MATCH_NOMATCH); 4916 eptr++; 4917 } 4918 break; 4919 4920 case OP_WHITESPACE: 4921 for (i = 1; i <= min; i++) 4922 { 4923 if (eptr >= mb->end_subject) 4924 { 4925 SCHECK_PARTIAL(); 4926 RRETURN(MATCH_NOMATCH); 4927 } 4928 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_space) == 0) 4929 RRETURN(MATCH_NOMATCH); 4930 eptr++; 4931 } 4932 break; 4933 4934 case OP_NOT_WORDCHAR: 4935 for (i = 1; i <= min; i++) 4936 { 4937 if (eptr >= mb->end_subject) 4938 { 4939 SCHECK_PARTIAL(); 4940 RRETURN(MATCH_NOMATCH); 4941 } 4942 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_word) != 0) 4943 RRETURN(MATCH_NOMATCH); 4944 eptr++; 4945 } 4946 break; 4947 4948 case OP_WORDCHAR: 4949 for (i = 1; i <= min; i++) 4950 { 4951 if (eptr >= mb->end_subject) 4952 { 4953 SCHECK_PARTIAL(); 4954 RRETURN(MATCH_NOMATCH); 4955 } 4956 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_word) == 0) 4957 RRETURN(MATCH_NOMATCH); 4958 eptr++; 4959 } 4960 break; 4961 4962 default: 4963 RRETURN(PCRE2_ERROR_INTERNAL); 4964 } 4965 } 4966 4967 /* If min = max, continue at the same level without recursing */ 4968 4969 if (min == max) continue; 4970 4971 /* If minimizing, we have to test the rest of the pattern before each 4972 subsequent match. Again, separate the UTF-8 case for speed, and also 4973 separate the UCP cases. 
*/ 4974 4975 if (minimize) 4976 { 4977 #ifdef SUPPORT_UNICODE 4978 if (prop_type >= 0) 4979 { 4980 switch(prop_type) 4981 { 4982 case PT_ANY: 4983 for (fi = min;; fi++) 4984 { 4985 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM36); 4986 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4987 if (fi >= max) RRETURN(MATCH_NOMATCH); 4988 if (eptr >= mb->end_subject) 4989 { 4990 SCHECK_PARTIAL(); 4991 RRETURN(MATCH_NOMATCH); 4992 } 4993 GETCHARINCTEST(c, eptr); 4994 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4995 } 4996 /* Control never gets here */ 4997 4998 case PT_LAMP: 4999 for (fi = min;; fi++) 5000 { 5001 int chartype; 5002 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM37); 5003 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5004 if (fi >= max) RRETURN(MATCH_NOMATCH); 5005 if (eptr >= mb->end_subject) 5006 { 5007 SCHECK_PARTIAL(); 5008 RRETURN(MATCH_NOMATCH); 5009 } 5010 GETCHARINCTEST(c, eptr); 5011 chartype = UCD_CHARTYPE(c); 5012 if ((chartype == ucp_Lu || 5013 chartype == ucp_Ll || 5014 chartype == ucp_Lt) == prop_fail_result) 5015 RRETURN(MATCH_NOMATCH); 5016 } 5017 /* Control never gets here */ 5018 5019 case PT_GC: 5020 for (fi = min;; fi++) 5021 { 5022 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM38); 5023 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5024 if (fi >= max) RRETURN(MATCH_NOMATCH); 5025 if (eptr >= mb->end_subject) 5026 { 5027 SCHECK_PARTIAL(); 5028 RRETURN(MATCH_NOMATCH); 5029 } 5030 GETCHARINCTEST(c, eptr); 5031 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 5032 RRETURN(MATCH_NOMATCH); 5033 } 5034 /* Control never gets here */ 5035 5036 case PT_PC: 5037 for (fi = min;; fi++) 5038 { 5039 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM39); 5040 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5041 if (fi >= max) RRETURN(MATCH_NOMATCH); 5042 if (eptr >= mb->end_subject) 5043 { 5044 SCHECK_PARTIAL(); 5045 RRETURN(MATCH_NOMATCH); 5046 } 5047 GETCHARINCTEST(c, eptr); 5048 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 5049 RRETURN(MATCH_NOMATCH); 5050 } 5051 
/* Control never gets here */ 5052 5053 case PT_SC: 5054 for (fi = min;; fi++) 5055 { 5056 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM40); 5057 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5058 if (fi >= max) RRETURN(MATCH_NOMATCH); 5059 if (eptr >= mb->end_subject) 5060 { 5061 SCHECK_PARTIAL(); 5062 RRETURN(MATCH_NOMATCH); 5063 } 5064 GETCHARINCTEST(c, eptr); 5065 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 5066 RRETURN(MATCH_NOMATCH); 5067 } 5068 /* Control never gets here */ 5069 5070 case PT_ALNUM: 5071 for (fi = min;; fi++) 5072 { 5073 int category; 5074 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM59); 5075 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5076 if (fi >= max) RRETURN(MATCH_NOMATCH); 5077 if (eptr >= mb->end_subject) 5078 { 5079 SCHECK_PARTIAL(); 5080 RRETURN(MATCH_NOMATCH); 5081 } 5082 GETCHARINCTEST(c, eptr); 5083 category = UCD_CATEGORY(c); 5084 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 5085 RRETURN(MATCH_NOMATCH); 5086 } 5087 /* Control never gets here */ 5088 5089 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 5090 which means that Perl space and POSIX space are now identical. PCRE 5091 was changed at release 8.34. 
*/ 5092 5093 case PT_SPACE: /* Perl space */ 5094 case PT_PXSPACE: /* POSIX space */ 5095 for (fi = min;; fi++) 5096 { 5097 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM61); 5098 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5099 if (fi >= max) RRETURN(MATCH_NOMATCH); 5100 if (eptr >= mb->end_subject) 5101 { 5102 SCHECK_PARTIAL(); 5103 RRETURN(MATCH_NOMATCH); 5104 } 5105 GETCHARINCTEST(c, eptr); 5106 switch(c) 5107 { 5108 HSPACE_CASES: 5109 VSPACE_CASES: 5110 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 5111 break; 5112 5113 default: 5114 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5115 RRETURN(MATCH_NOMATCH); 5116 break; 5117 } 5118 } 5119 /* Control never gets here */ 5120 5121 case PT_WORD: 5122 for (fi = min;; fi++) 5123 { 5124 int category; 5125 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM62); 5126 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5127 if (fi >= max) RRETURN(MATCH_NOMATCH); 5128 if (eptr >= mb->end_subject) 5129 { 5130 SCHECK_PARTIAL(); 5131 RRETURN(MATCH_NOMATCH); 5132 } 5133 GETCHARINCTEST(c, eptr); 5134 category = UCD_CATEGORY(c); 5135 if ((category == ucp_L || 5136 category == ucp_N || 5137 c == CHAR_UNDERSCORE) 5138 == prop_fail_result) 5139 RRETURN(MATCH_NOMATCH); 5140 } 5141 /* Control never gets here */ 5142 5143 case PT_CLIST: 5144 for (fi = min;; fi++) 5145 { 5146 const uint32_t *cp; 5147 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM67); 5148 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5149 if (fi >= max) RRETURN(MATCH_NOMATCH); 5150 if (eptr >= mb->end_subject) 5151 { 5152 SCHECK_PARTIAL(); 5153 RRETURN(MATCH_NOMATCH); 5154 } 5155 GETCHARINCTEST(c, eptr); 5156 cp = PRIV(ucd_caseless_sets) + prop_value; 5157 for (;;) 5158 { 5159 if (c < *cp) 5160 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } } 5161 if (c == *cp++) 5162 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; } 5163 } 5164 } 5165 /* Control never gets here */ 5166 5167 case PT_UCNC: 5168 for (fi = min;; fi++) 5169 { 5170 RMATCH(eptr, ecode, offset_top, 
mb, eptrb, RM60); 5171 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5172 if (fi >= max) RRETURN(MATCH_NOMATCH); 5173 if (eptr >= mb->end_subject) 5174 { 5175 SCHECK_PARTIAL(); 5176 RRETURN(MATCH_NOMATCH); 5177 } 5178 GETCHARINCTEST(c, eptr); 5179 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5180 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5181 c >= 0xe000) == prop_fail_result) 5182 RRETURN(MATCH_NOMATCH); 5183 } 5184 /* Control never gets here */ 5185 5186 /* This should never occur */ 5187 default: 5188 RRETURN(PCRE2_ERROR_INTERNAL); 5189 } 5190 } 5191 5192 /* Match extended Unicode sequences. We will get here only if the 5193 support is in the binary; otherwise a compile-time error occurs. */ 5194 5195 else if (ctype == OP_EXTUNI) 5196 { 5197 for (fi = min;; fi++) 5198 { 5199 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM41); 5200 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5201 if (fi >= max) RRETURN(MATCH_NOMATCH); 5202 if (eptr >= mb->end_subject) 5203 { 5204 SCHECK_PARTIAL(); 5205 RRETURN(MATCH_NOMATCH); 5206 } 5207 else 5208 { 5209 int lgb, rgb; 5210 GETCHARINCTEST(c, eptr); 5211 lgb = UCD_GRAPHBREAK(c); 5212 while (eptr < mb->end_subject) 5213 { 5214 int len = 1; 5215 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5216 rgb = UCD_GRAPHBREAK(c); 5217 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5218 lgb = rgb; 5219 eptr += len; 5220 } 5221 } 5222 CHECK_PARTIAL(); 5223 } 5224 } 5225 else 5226 #endif /* SUPPORT_UNICODE */ 5227 5228 #ifdef SUPPORT_UNICODE 5229 if (utf) 5230 { 5231 for (fi = min;; fi++) 5232 { 5233 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM42); 5234 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5235 if (fi >= max) RRETURN(MATCH_NOMATCH); 5236 if (eptr >= mb->end_subject) 5237 { 5238 SCHECK_PARTIAL(); 5239 RRETURN(MATCH_NOMATCH); 5240 } 5241 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5242 RRETURN(MATCH_NOMATCH); 5243 GETCHARINC(c, eptr); 5244 switch(ctype) 5245 { 5246 case OP_ANY: /* This is the non-NL case */ 5247 if 
(mb->partial != 0 && /* Take care with CRLF partial */ 5248 eptr >= mb->end_subject && 5249 NLBLOCK->nltype == NLTYPE_FIXED && 5250 NLBLOCK->nllen == 2 && 5251 c == NLBLOCK->nl[0]) 5252 { 5253 mb->hitend = TRUE; 5254 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 5255 } 5256 break; 5257 5258 case OP_ALLANY: 5259 case OP_ANYBYTE: 5260 break; 5261 5262 case OP_ANYNL: 5263 switch(c) 5264 { 5265 default: RRETURN(MATCH_NOMATCH); 5266 case CHAR_CR: 5267 if (eptr < mb->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; 5268 break; 5269 5270 case CHAR_LF: 5271 break; 5272 5273 case CHAR_VT: 5274 case CHAR_FF: 5275 case CHAR_NEL: 5276 #ifndef EBCDIC 5277 case 0x2028: 5278 case 0x2029: 5279 #endif /* Not EBCDIC */ 5280 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 5281 break; 5282 } 5283 break; 5284 5285 case OP_NOT_HSPACE: 5286 switch(c) 5287 { 5288 HSPACE_CASES: RRETURN(MATCH_NOMATCH); 5289 default: break; 5290 } 5291 break; 5292 5293 case OP_HSPACE: 5294 switch(c) 5295 { 5296 HSPACE_CASES: break; 5297 default: RRETURN(MATCH_NOMATCH); 5298 } 5299 break; 5300 5301 case OP_NOT_VSPACE: 5302 switch(c) 5303 { 5304 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 5305 default: break; 5306 } 5307 break; 5308 5309 case OP_VSPACE: 5310 switch(c) 5311 { 5312 VSPACE_CASES: break; 5313 default: RRETURN(MATCH_NOMATCH); 5314 } 5315 break; 5316 5317 case OP_NOT_DIGIT: 5318 if (c < 256 && (mb->ctypes[c] & ctype_digit) != 0) 5319 RRETURN(MATCH_NOMATCH); 5320 break; 5321 5322 case OP_DIGIT: 5323 if (c >= 256 || (mb->ctypes[c] & ctype_digit) == 0) 5324 RRETURN(MATCH_NOMATCH); 5325 break; 5326 5327 case OP_NOT_WHITESPACE: 5328 if (c < 256 && (mb->ctypes[c] & ctype_space) != 0) 5329 RRETURN(MATCH_NOMATCH); 5330 break; 5331 5332 case OP_WHITESPACE: 5333 if (c >= 256 || (mb->ctypes[c] & ctype_space) == 0) 5334 RRETURN(MATCH_NOMATCH); 5335 break; 5336 5337 case OP_NOT_WORDCHAR: 5338 if (c < 256 && (mb->ctypes[c] & ctype_word) != 0) 5339 RRETURN(MATCH_NOMATCH); 5340 break; 5341 5342 
case OP_WORDCHAR: 5343 if (c >= 256 || (mb->ctypes[c] & ctype_word) == 0) 5344 RRETURN(MATCH_NOMATCH); 5345 break; 5346 5347 default: 5348 RRETURN(PCRE2_ERROR_INTERNAL); 5349 } 5350 } 5351 } 5352 else 5353 #endif 5354 /* Not UTF mode */ 5355 { 5356 for (fi = min;; fi++) 5357 { 5358 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM43); 5359 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5360 if (fi >= max) RRETURN(MATCH_NOMATCH); 5361 if (eptr >= mb->end_subject) 5362 { 5363 SCHECK_PARTIAL(); 5364 RRETURN(MATCH_NOMATCH); 5365 } 5366 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5367 RRETURN(MATCH_NOMATCH); 5368 c = *eptr++; 5369 switch(ctype) 5370 { 5371 case OP_ANY: /* This is the non-NL case */ 5372 if (mb->partial != 0 && /* Take care with CRLF partial */ 5373 eptr >= mb->end_subject && 5374 NLBLOCK->nltype == NLTYPE_FIXED && 5375 NLBLOCK->nllen == 2 && 5376 c == NLBLOCK->nl[0]) 5377 { 5378 mb->hitend = TRUE; 5379 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 5380 } 5381 break; 5382 5383 case OP_ALLANY: 5384 case OP_ANYBYTE: 5385 break; 5386 5387 case OP_ANYNL: 5388 switch(c) 5389 { 5390 default: RRETURN(MATCH_NOMATCH); 5391 case CHAR_CR: 5392 if (eptr < mb->end_subject && *eptr == CHAR_LF) eptr++; 5393 break; 5394 5395 case CHAR_LF: 5396 break; 5397 5398 case CHAR_VT: 5399 case CHAR_FF: 5400 case CHAR_NEL: 5401 #if PCRE2_CODE_UNIT_WIDTH != 8 5402 case 0x2028: 5403 case 0x2029: 5404 #endif 5405 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 5406 break; 5407 } 5408 break; 5409 5410 case OP_NOT_HSPACE: 5411 switch(c) 5412 { 5413 default: break; 5414 HSPACE_BYTE_CASES: 5415 #if PCRE2_CODE_UNIT_WIDTH != 8 5416 HSPACE_MULTIBYTE_CASES: 5417 #endif 5418 RRETURN(MATCH_NOMATCH); 5419 } 5420 break; 5421 5422 case OP_HSPACE: 5423 switch(c) 5424 { 5425 default: RRETURN(MATCH_NOMATCH); 5426 HSPACE_BYTE_CASES: 5427 #if PCRE2_CODE_UNIT_WIDTH != 8 5428 HSPACE_MULTIBYTE_CASES: 5429 #endif 5430 break; 5431 } 5432 break; 5433 5434 case OP_NOT_VSPACE: 5435 switch(c) 5436 
{ 5437 default: break; 5438 VSPACE_BYTE_CASES: 5439 #if PCRE2_CODE_UNIT_WIDTH != 8 5440 VSPACE_MULTIBYTE_CASES: 5441 #endif 5442 RRETURN(MATCH_NOMATCH); 5443 } 5444 break; 5445 5446 case OP_VSPACE: 5447 switch(c) 5448 { 5449 default: RRETURN(MATCH_NOMATCH); 5450 VSPACE_BYTE_CASES: 5451 #if PCRE2_CODE_UNIT_WIDTH != 8 5452 VSPACE_MULTIBYTE_CASES: 5453 #endif 5454 break; 5455 } 5456 break; 5457 5458 case OP_NOT_DIGIT: 5459 if (MAX_255(c) && (mb->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 5460 break; 5461 5462 case OP_DIGIT: 5463 if (!MAX_255(c) || (mb->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 5464 break; 5465 5466 case OP_NOT_WHITESPACE: 5467 if (MAX_255(c) && (mb->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 5468 break; 5469 5470 case OP_WHITESPACE: 5471 if (!MAX_255(c) || (mb->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 5472 break; 5473 5474 case OP_NOT_WORDCHAR: 5475 if (MAX_255(c) && (mb->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 5476 break; 5477 5478 case OP_WORDCHAR: 5479 if (!MAX_255(c) || (mb->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 5480 break; 5481 5482 default: 5483 RRETURN(PCRE2_ERROR_INTERNAL); 5484 } 5485 } 5486 } 5487 /* Control never gets here */ 5488 } 5489 5490 /* If maximizing, it is worth using inline code for speed, doing the type 5491 test once at the start (i.e. keep it out of the loop). Again, keep the 5492 UTF-8 and UCP stuff separate. 
*/ 5493 5494 else 5495 { 5496 pp = eptr; /* Remember where we started */ 5497 5498 #ifdef SUPPORT_UNICODE 5499 if (prop_type >= 0) 5500 { 5501 switch(prop_type) 5502 { 5503 case PT_ANY: 5504 for (i = min; i < max; i++) 5505 { 5506 int len = 1; 5507 if (eptr >= mb->end_subject) 5508 { 5509 SCHECK_PARTIAL(); 5510 break; 5511 } 5512 GETCHARLENTEST(c, eptr, len); 5513 if (prop_fail_result) break; 5514 eptr+= len; 5515 } 5516 break; 5517 5518 case PT_LAMP: 5519 for (i = min; i < max; i++) 5520 { 5521 int chartype; 5522 int len = 1; 5523 if (eptr >= mb->end_subject) 5524 { 5525 SCHECK_PARTIAL(); 5526 break; 5527 } 5528 GETCHARLENTEST(c, eptr, len); 5529 chartype = UCD_CHARTYPE(c); 5530 if ((chartype == ucp_Lu || 5531 chartype == ucp_Ll || 5532 chartype == ucp_Lt) == prop_fail_result) 5533 break; 5534 eptr+= len; 5535 } 5536 break; 5537 5538 case PT_GC: 5539 for (i = min; i < max; i++) 5540 { 5541 int len = 1; 5542 if (eptr >= mb->end_subject) 5543 { 5544 SCHECK_PARTIAL(); 5545 break; 5546 } 5547 GETCHARLENTEST(c, eptr, len); 5548 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break; 5549 eptr+= len; 5550 } 5551 break; 5552 5553 case PT_PC: 5554 for (i = min; i < max; i++) 5555 { 5556 int len = 1; 5557 if (eptr >= mb->end_subject) 5558 { 5559 SCHECK_PARTIAL(); 5560 break; 5561 } 5562 GETCHARLENTEST(c, eptr, len); 5563 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break; 5564 eptr+= len; 5565 } 5566 break; 5567 5568 case PT_SC: 5569 for (i = min; i < max; i++) 5570 { 5571 int len = 1; 5572 if (eptr >= mb->end_subject) 5573 { 5574 SCHECK_PARTIAL(); 5575 break; 5576 } 5577 GETCHARLENTEST(c, eptr, len); 5578 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break; 5579 eptr+= len; 5580 } 5581 break; 5582 5583 case PT_ALNUM: 5584 for (i = min; i < max; i++) 5585 { 5586 int category; 5587 int len = 1; 5588 if (eptr >= mb->end_subject) 5589 { 5590 SCHECK_PARTIAL(); 5591 break; 5592 } 5593 GETCHARLENTEST(c, eptr, len); 5594 category = UCD_CATEGORY(c); 
5595 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 5596 break; 5597 eptr+= len; 5598 } 5599 break; 5600 5601 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 5602 which means that Perl space and POSIX space are now identical. PCRE 5603 was changed at release 8.34. */ 5604 5605 case PT_SPACE: /* Perl space */ 5606 case PT_PXSPACE: /* POSIX space */ 5607 for (i = min; i < max; i++) 5608 { 5609 int len = 1; 5610 if (eptr >= mb->end_subject) 5611 { 5612 SCHECK_PARTIAL(); 5613 break; 5614 } 5615 GETCHARLENTEST(c, eptr, len); 5616 switch(c) 5617 { 5618 HSPACE_CASES: 5619 VSPACE_CASES: 5620 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ 5621 break; 5622 5623 default: 5624 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) 5625 goto ENDLOOP99; /* Break the loop */ 5626 break; 5627 } 5628 eptr+= len; 5629 } 5630 ENDLOOP99: 5631 break; 5632 5633 case PT_WORD: 5634 for (i = min; i < max; i++) 5635 { 5636 int category; 5637 int len = 1; 5638 if (eptr >= mb->end_subject) 5639 { 5640 SCHECK_PARTIAL(); 5641 break; 5642 } 5643 GETCHARLENTEST(c, eptr, len); 5644 category = UCD_CATEGORY(c); 5645 if ((category == ucp_L || category == ucp_N || 5646 c == CHAR_UNDERSCORE) == prop_fail_result) 5647 break; 5648 eptr+= len; 5649 } 5650 break; 5651 5652 case PT_CLIST: 5653 for (i = min; i < max; i++) 5654 { 5655 const uint32_t *cp; 5656 int len = 1; 5657 if (eptr >= mb->end_subject) 5658 { 5659 SCHECK_PARTIAL(); 5660 break; 5661 } 5662 GETCHARLENTEST(c, eptr, len); 5663 cp = PRIV(ucd_caseless_sets) + prop_value; 5664 for (;;) 5665 { 5666 if (c < *cp) 5667 { if (prop_fail_result) break; else goto GOT_MAX; } 5668 if (c == *cp++) 5669 { if (prop_fail_result) goto GOT_MAX; else break; } 5670 } 5671 eptr += len; 5672 } 5673 GOT_MAX: 5674 break; 5675 5676 case PT_UCNC: 5677 for (i = min; i < max; i++) 5678 { 5679 int len = 1; 5680 if (eptr >= mb->end_subject) 5681 { 5682 SCHECK_PARTIAL(); 5683 break; 5684 } 5685 GETCHARLENTEST(c, eptr, len); 
5686 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 5687 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 5688 c >= 0xe000) == prop_fail_result) 5689 break; 5690 eptr += len; 5691 } 5692 break; 5693 5694 default: 5695 RRETURN(PCRE2_ERROR_INTERNAL); 5696 } 5697 5698 /* eptr is now past the end of the maximum run */ 5699 5700 if (possessive) continue; /* No backtracking */ 5701 5702 /* After \C in UTF mode, pp might be in the middle of a Unicode 5703 character. Use <= pp to ensure backtracking doesn't go too far. */ 5704 5705 for(;;) 5706 { 5707 if (eptr <= pp) goto TAIL_RECURSE; 5708 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM44); 5709 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5710 eptr--; 5711 if (utf) BACKCHAR(eptr); 5712 } 5713 } 5714 5715 /* Match extended Unicode grapheme clusters. We will get here only if the 5716 support is in the binary; otherwise a compile-time error occurs. */ 5717 5718 else if (ctype == OP_EXTUNI) 5719 { 5720 for (i = min; i < max; i++) 5721 { 5722 if (eptr >= mb->end_subject) 5723 { 5724 SCHECK_PARTIAL(); 5725 break; 5726 } 5727 else 5728 { 5729 int lgb, rgb; 5730 GETCHARINCTEST(c, eptr); 5731 lgb = UCD_GRAPHBREAK(c); 5732 while (eptr < mb->end_subject) 5733 { 5734 int len = 1; 5735 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5736 rgb = UCD_GRAPHBREAK(c); 5737 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5738 lgb = rgb; 5739 eptr += len; 5740 } 5741 } 5742 CHECK_PARTIAL(); 5743 } 5744 5745 /* eptr is now past the end of the maximum run */ 5746 5747 if (possessive) continue; /* No backtracking */ 5748 5749 /* We use <= pp rather than == pp to detect the start of the run while 5750 backtracking because the use of \C in UTF mode can cause BACKCHAR to 5751 move back past pp. This is just palliative; the use of \C in UTF mode 5752 is fraught with danger. 
*/ 5753 5754 for(;;) 5755 { 5756 int lgb, rgb; 5757 PCRE2_SPTR fptr; 5758 5759 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ 5760 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45); 5761 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5762 5763 /* Backtracking over an extended grapheme cluster involves inspecting 5764 the previous two characters (if present) to see if a break is 5765 permitted between them. */ 5766 5767 eptr--; 5768 if (!utf) c = *eptr; else 5769 { 5770 BACKCHAR(eptr); 5771 GETCHAR(c, eptr); 5772 } 5773 rgb = UCD_GRAPHBREAK(c); 5774 5775 for (;;) 5776 { 5777 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ 5778 fptr = eptr - 1; 5779 if (!utf) c = *fptr; else 5780 { 5781 BACKCHAR(fptr); 5782 GETCHAR(c, fptr); 5783 } 5784 lgb = UCD_GRAPHBREAK(c); 5785 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 5786 eptr = fptr; 5787 rgb = lgb; 5788 } 5789 } 5790 } 5791 5792 else 5793 #endif /* SUPPORT_UNICODE */ 5794 5795 #ifdef SUPPORT_UNICODE 5796 if (utf) 5797 { 5798 switch(ctype) 5799 { 5800 case OP_ANY: 5801 for (i = min; i < max; i++) 5802 { 5803 if (eptr >= mb->end_subject) 5804 { 5805 SCHECK_PARTIAL(); 5806 break; 5807 } 5808 if (IS_NEWLINE(eptr)) break; 5809 if (mb->partial != 0 && /* Take care with CRLF partial */ 5810 eptr + 1 >= mb->end_subject && 5811 NLBLOCK->nltype == NLTYPE_FIXED && 5812 NLBLOCK->nllen == 2 && 5813 UCHAR21(eptr) == NLBLOCK->nl[0]) 5814 { 5815 mb->hitend = TRUE; 5816 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 5817 } 5818 eptr++; 5819 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++); 5820 } 5821 break; 5822 5823 case OP_ALLANY: 5824 if (max < INT_MAX) 5825 { 5826 for (i = min; i < max; i++) 5827 { 5828 if (eptr >= mb->end_subject) 5829 { 5830 SCHECK_PARTIAL(); 5831 break; 5832 } 5833 eptr++; 5834 ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++); 5835 } 5836 } 5837 else 5838 { 5839 eptr = mb->end_subject; /* Unlimited UTF-8 repeat */ 5840 SCHECK_PARTIAL(); 5841 } 5842 break; 5843 5844 /* The byte 
case is the same as non-UTF8 */ 5845 5846 case OP_ANYBYTE: 5847 c = max - min; 5848 if (c > (uint32_t)(mb->end_subject - eptr)) 5849 { 5850 eptr = mb->end_subject; 5851 SCHECK_PARTIAL(); 5852 } 5853 else eptr += c; 5854 break; 5855 5856 case OP_ANYNL: 5857 for (i = min; i < max; i++) 5858 { 5859 int len = 1; 5860 if (eptr >= mb->end_subject) 5861 { 5862 SCHECK_PARTIAL(); 5863 break; 5864 } 5865 GETCHARLEN(c, eptr, len); 5866 if (c == CHAR_CR) 5867 { 5868 if (++eptr >= mb->end_subject) break; 5869 if (UCHAR21(eptr) == CHAR_LF) eptr++; 5870 } 5871 else 5872 { 5873 if (c != CHAR_LF && 5874 (mb->bsr_convention == PCRE2_BSR_ANYCRLF || 5875 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 5876 #ifndef EBCDIC 5877 && c != 0x2028 && c != 0x2029 5878 #endif /* Not EBCDIC */ 5879 ))) 5880 break; 5881 eptr += len; 5882 } 5883 } 5884 break; 5885 5886 case OP_NOT_HSPACE: 5887 case OP_HSPACE: 5888 for (i = min; i < max; i++) 5889 { 5890 BOOL gotspace; 5891 int len = 1; 5892 if (eptr >= mb->end_subject) 5893 { 5894 SCHECK_PARTIAL(); 5895 break; 5896 } 5897 GETCHARLEN(c, eptr, len); 5898 switch(c) 5899 { 5900 HSPACE_CASES: gotspace = TRUE; break; 5901 default: gotspace = FALSE; break; 5902 } 5903 if (gotspace == (ctype == OP_NOT_HSPACE)) break; 5904 eptr += len; 5905 } 5906 break; 5907 5908 case OP_NOT_VSPACE: 5909 case OP_VSPACE: 5910 for (i = min; i < max; i++) 5911 { 5912 BOOL gotspace; 5913 int len = 1; 5914 if (eptr >= mb->end_subject) 5915 { 5916 SCHECK_PARTIAL(); 5917 break; 5918 } 5919 GETCHARLEN(c, eptr, len); 5920 switch(c) 5921 { 5922 VSPACE_CASES: gotspace = TRUE; break; 5923 default: gotspace = FALSE; break; 5924 } 5925 if (gotspace == (ctype == OP_NOT_VSPACE)) break; 5926 eptr += len; 5927 } 5928 break; 5929 5930 case OP_NOT_DIGIT: 5931 for (i = min; i < max; i++) 5932 { 5933 int len = 1; 5934 if (eptr >= mb->end_subject) 5935 { 5936 SCHECK_PARTIAL(); 5937 break; 5938 } 5939 GETCHARLEN(c, eptr, len); 5940 if (c < 256 && (mb->ctypes[c] & ctype_digit) != 0) break; 5941 
eptr+= len; 5942 } 5943 break; 5944 5945 case OP_DIGIT: 5946 for (i = min; i < max; i++) 5947 { 5948 int len = 1; 5949 if (eptr >= mb->end_subject) 5950 { 5951 SCHECK_PARTIAL(); 5952 break; 5953 } 5954 GETCHARLEN(c, eptr, len); 5955 if (c >= 256 ||(mb->ctypes[c] & ctype_digit) == 0) break; 5956 eptr+= len; 5957 } 5958 break; 5959 5960 case OP_NOT_WHITESPACE: 5961 for (i = min; i < max; i++) 5962 { 5963 int len = 1; 5964 if (eptr >= mb->end_subject) 5965 { 5966 SCHECK_PARTIAL(); 5967 break; 5968 } 5969 GETCHARLEN(c, eptr, len); 5970 if (c < 256 && (mb->ctypes[c] & ctype_space) != 0) break; 5971 eptr+= len; 5972 } 5973 break; 5974 5975 case OP_WHITESPACE: 5976 for (i = min; i < max; i++) 5977 { 5978 int len = 1; 5979 if (eptr >= mb->end_subject) 5980 { 5981 SCHECK_PARTIAL(); 5982 break; 5983 } 5984 GETCHARLEN(c, eptr, len); 5985 if (c >= 256 ||(mb->ctypes[c] & ctype_space) == 0) break; 5986 eptr+= len; 5987 } 5988 break; 5989 5990 case OP_NOT_WORDCHAR: 5991 for (i = min; i < max; i++) 5992 { 5993 int len = 1; 5994 if (eptr >= mb->end_subject) 5995 { 5996 SCHECK_PARTIAL(); 5997 break; 5998 } 5999 GETCHARLEN(c, eptr, len); 6000 if (c < 256 && (mb->ctypes[c] & ctype_word) != 0) break; 6001 eptr+= len; 6002 } 6003 break; 6004 6005 case OP_WORDCHAR: 6006 for (i = min; i < max; i++) 6007 { 6008 int len = 1; 6009 if (eptr >= mb->end_subject) 6010 { 6011 SCHECK_PARTIAL(); 6012 break; 6013 } 6014 GETCHARLEN(c, eptr, len); 6015 if (c >= 256 || (mb->ctypes[c] & ctype_word) == 0) break; 6016 eptr+= len; 6017 } 6018 break; 6019 6020 default: 6021 RRETURN(PCRE2_ERROR_INTERNAL); 6022 } 6023 6024 if (possessive) continue; /* No backtracking */ 6025 6026 /* After \C in UTF mode, pp might be in the middle of a Unicode 6027 character. Use <= pp to ensure backtracking doesn't go too far. 
*/ 6028 6029 for(;;) 6030 { 6031 if (eptr <= pp) goto TAIL_RECURSE; 6032 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM46); 6033 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6034 eptr--; 6035 BACKCHAR(eptr); 6036 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL && 6037 UCHAR21(eptr - 1) == CHAR_CR) eptr--; 6038 } 6039 } 6040 else 6041 #endif /* SUPPORT_UNICODE */ 6042 /* Not UTF mode */ 6043 { 6044 switch(ctype) 6045 { 6046 case OP_ANY: 6047 for (i = min; i < max; i++) 6048 { 6049 if (eptr >= mb->end_subject) 6050 { 6051 SCHECK_PARTIAL(); 6052 break; 6053 } 6054 if (IS_NEWLINE(eptr)) break; 6055 if (mb->partial != 0 && /* Take care with CRLF partial */ 6056 eptr + 1 >= mb->end_subject && 6057 NLBLOCK->nltype == NLTYPE_FIXED && 6058 NLBLOCK->nllen == 2 && 6059 *eptr == NLBLOCK->nl[0]) 6060 { 6061 mb->hitend = TRUE; 6062 if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); 6063 } 6064 eptr++; 6065 } 6066 break; 6067 6068 case OP_ALLANY: 6069 case OP_ANYBYTE: 6070 c = max - min; 6071 if (c > (uint32_t)(mb->end_subject - eptr)) 6072 { 6073 eptr = mb->end_subject; 6074 SCHECK_PARTIAL(); 6075 } 6076 else eptr += c; 6077 break; 6078 6079 case OP_ANYNL: 6080 for (i = min; i < max; i++) 6081 { 6082 if (eptr >= mb->end_subject) 6083 { 6084 SCHECK_PARTIAL(); 6085 break; 6086 } 6087 c = *eptr; 6088 if (c == CHAR_CR) 6089 { 6090 if (++eptr >= mb->end_subject) break; 6091 if (*eptr == CHAR_LF) eptr++; 6092 } 6093 else 6094 { 6095 if (c != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || 6096 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL 6097 #if PCRE2_CODE_UNIT_WIDTH != 8 6098 && c != 0x2028 && c != 0x2029 6099 #endif 6100 ))) break; 6101 eptr++; 6102 } 6103 } 6104 break; 6105 6106 case OP_NOT_HSPACE: 6107 for (i = min; i < max; i++) 6108 { 6109 if (eptr >= mb->end_subject) 6110 { 6111 SCHECK_PARTIAL(); 6112 break; 6113 } 6114 switch(*eptr) 6115 { 6116 default: eptr++; break; 6117 HSPACE_BYTE_CASES: 6118 #if PCRE2_CODE_UNIT_WIDTH != 8 6119 HSPACE_MULTIBYTE_CASES: 
6120 #endif 6121 goto ENDLOOP00; 6122 } 6123 } 6124 ENDLOOP00: 6125 break; 6126 6127 case OP_HSPACE: 6128 for (i = min; i < max; i++) 6129 { 6130 if (eptr >= mb->end_subject) 6131 { 6132 SCHECK_PARTIAL(); 6133 break; 6134 } 6135 switch(*eptr) 6136 { 6137 default: goto ENDLOOP01; 6138 HSPACE_BYTE_CASES: 6139 #if PCRE2_CODE_UNIT_WIDTH != 8 6140 HSPACE_MULTIBYTE_CASES: 6141 #endif 6142 eptr++; break; 6143 } 6144 } 6145 ENDLOOP01: 6146 break; 6147 6148 case OP_NOT_VSPACE: 6149 for (i = min; i < max; i++) 6150 { 6151 if (eptr >= mb->end_subject) 6152 { 6153 SCHECK_PARTIAL(); 6154 break; 6155 } 6156 switch(*eptr) 6157 { 6158 default: eptr++; break; 6159 VSPACE_BYTE_CASES: 6160 #if PCRE2_CODE_UNIT_WIDTH != 8 6161 VSPACE_MULTIBYTE_CASES: 6162 #endif 6163 goto ENDLOOP02; 6164 } 6165 } 6166 ENDLOOP02: 6167 break; 6168 6169 case OP_VSPACE: 6170 for (i = min; i < max; i++) 6171 { 6172 if (eptr >= mb->end_subject) 6173 { 6174 SCHECK_PARTIAL(); 6175 break; 6176 } 6177 switch(*eptr) 6178 { 6179 default: goto ENDLOOP03; 6180 VSPACE_BYTE_CASES: 6181 #if PCRE2_CODE_UNIT_WIDTH != 8 6182 VSPACE_MULTIBYTE_CASES: 6183 #endif 6184 eptr++; break; 6185 } 6186 } 6187 ENDLOOP03: 6188 break; 6189 6190 case OP_NOT_DIGIT: 6191 for (i = min; i < max; i++) 6192 { 6193 if (eptr >= mb->end_subject) 6194 { 6195 SCHECK_PARTIAL(); 6196 break; 6197 } 6198 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_digit) != 0) break; 6199 eptr++; 6200 } 6201 break; 6202 6203 case OP_DIGIT: 6204 for (i = min; i < max; i++) 6205 { 6206 if (eptr >= mb->end_subject) 6207 { 6208 SCHECK_PARTIAL(); 6209 break; 6210 } 6211 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_digit) == 0) break; 6212 eptr++; 6213 } 6214 break; 6215 6216 case OP_NOT_WHITESPACE: 6217 for (i = min; i < max; i++) 6218 { 6219 if (eptr >= mb->end_subject) 6220 { 6221 SCHECK_PARTIAL(); 6222 break; 6223 } 6224 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_space) != 0) break; 6225 eptr++; 6226 } 6227 break; 6228 6229 case OP_WHITESPACE: 6230 for 
(i = min; i < max; i++) 6231 { 6232 if (eptr >= mb->end_subject) 6233 { 6234 SCHECK_PARTIAL(); 6235 break; 6236 } 6237 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_space) == 0) break; 6238 eptr++; 6239 } 6240 break; 6241 6242 case OP_NOT_WORDCHAR: 6243 for (i = min; i < max; i++) 6244 { 6245 if (eptr >= mb->end_subject) 6246 { 6247 SCHECK_PARTIAL(); 6248 break; 6249 } 6250 if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_word) != 0) break; 6251 eptr++; 6252 } 6253 break; 6254 6255 case OP_WORDCHAR: 6256 for (i = min; i < max; i++) 6257 { 6258 if (eptr >= mb->end_subject) 6259 { 6260 SCHECK_PARTIAL(); 6261 break; 6262 } 6263 if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_word) == 0) break; 6264 eptr++; 6265 } 6266 break; 6267 6268 default: 6269 RRETURN(PCRE2_ERROR_INTERNAL); 6270 } 6271 6272 if (possessive) continue; /* No backtracking */ 6273 for (;;) 6274 { 6275 if (eptr == pp) goto TAIL_RECURSE; 6276 RMATCH(eptr, ecode, offset_top, mb, eptrb, RM47); 6277 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6278 eptr--; 6279 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && 6280 eptr[-1] == CHAR_CR) eptr--; 6281 } 6282 } 6283 6284 /* Control never gets here */ 6285 } 6286 6287 /* There's been some horrible disaster. Arrival here can only mean there is 6288 something seriously wrong in the code above or the OP_xxx definitions. */ 6289 6290 default: 6291 RRETURN(PCRE2_ERROR_INTERNAL); 6292 } 6293 6294 /* Do not stick any code in here without much thought; it is assumed 6295 that "continue" in the code above comes out to here to repeat the main 6296 loop. */ 6297 6298 } /* End of main loop */ 6299 /* Control never reaches here */ 6300 6301 6302 /* When compiling to use the heap rather than the stack for recursive calls to 6303 match(), the RRETURN() macro jumps here. The number that is saved in 6304 frame->Xwhere indicates which label we actually want to return to. 
*/ 6305 6306 #ifdef HEAP_MATCH_RECURSE 6307 #define LBL(val) case val: goto L_RM##val; 6308 HEAP_RETURN: 6309 switch (frame->Xwhere) 6310 { 6311 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 6312 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) 6313 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) 6314 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) 6315 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) 6316 LBL(65) LBL(66) LBL(68) 6317 #ifdef SUPPORT_WIDE_CHARS 6318 LBL(20) LBL(21) 6319 #endif 6320 #ifdef SUPPORT_UNICODE 6321 LBL(16) LBL(18) 6322 LBL(22) LBL(23) LBL(28) LBL(30) 6323 LBL(32) LBL(34) LBL(42) LBL(46) 6324 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) 6325 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) 6326 #endif /* SUPPORT_UNICODE */ 6327 default: 6328 return PCRE2_ERROR_INTERNAL; 6329 } 6330 #undef LBL 6331 #endif /* HEAP_MATCH_RECURSE */ 6332 } 6333 6334 6335 /*************************************************************************** 6336 **************************************************************************** 6337 RECURSION IN THE match() FUNCTION 6338 6339 Undefine all the macros that were defined above to handle this. 
*/ 6340 6341 #ifdef HEAP_MATCH_RECURSE 6342 #undef eptr 6343 #undef ecode 6344 #undef mstart 6345 #undef offset_top 6346 #undef eptrb 6347 #undef flags 6348 6349 #undef callpat 6350 #undef charptr 6351 #undef data 6352 #undef next_ecode 6353 #undef pp 6354 #undef prev 6355 #undef saved_eptr 6356 6357 #undef new_recursive 6358 6359 #undef cur_is_word 6360 #undef condition 6361 #undef prev_is_word 6362 6363 #undef ctype 6364 #undef length 6365 #undef max 6366 #undef min 6367 #undef number 6368 #undef offset 6369 #undef op 6370 #undef save_capture_last 6371 #undef save_offset1 6372 #undef save_offset2 6373 #undef save_offset3 6374 6375 #undef newptrb 6376 #endif /* HEAP_MATCH_RECURSE */ 6377 6378 /* These two are defined as macros in both cases */ 6379 6380 #undef fc 6381 #undef fi 6382 6383 /*************************************************************************** 6384 ***************************************************************************/ 6385 6386 6387 #ifdef HEAP_MATCH_RECURSE 6388 /************************************************* 6389 * Release allocated heap frames * 6390 *************************************************/ 6391 6392 /* This function releases all the allocated frames. The base frame is on the 6393 machine stack, and so must not be freed. 
6394 6395 Argument: 6396 frame_base the address of the base frame 6397 mb the match block 6398 6399 Returns: nothing 6400 */ 6401 /* Implementation notes: the walk starts at frame_base->Xnextframe, so frame_base itself is never handed to the free function. Each frame's successor pointer is read before that frame is released, so no released frame is dereferenced. frame_base->Xnextframe is left unchanged and therefore refers to released memory on return; the chain must not be walked again unless the caller resets that field. */ 6402 static void 6403 release_match_heapframes (heapframe *frame_base, match_block *mb) 6404 { 6405 heapframe *nextframe = frame_base->Xnextframe; /* first frame after the base; NULL when there are none */ 6406 while (nextframe != NULL) 6407 { 6408 heapframe *oldframe = nextframe; /* the frame released on this iteration */ 6409 nextframe = nextframe->Xnextframe; /* fetch the successor before oldframe is released */ 6410 mb->stack_memctl.free(oldframe, mb->stack_memctl.memory_data); /* release through the match block's stack_memctl, passing its memory_data */ 6411 } 6412 } 6413 #endif /* HEAP_MATCH_RECURSE */ 6414 6415 6416 6417 /************************************************* 6418 * Match a Regular Expression * 6419 *************************************************/ 6420 6421 /* This function applies a compiled pattern to a subject string and picks out 6422 portions of the string if it matches. Two elements in the vector are set for 6423 each substring: the offsets to the start and end of the substring. 6424 6425 Arguments: 6426 code points to the compiled expression 6427 subject points to the subject string 6428 length length of subject string (may contain binary zeros) 6429 start_offset where to start in the subject string 6430 options option bits 6431 match_data points to a match_data block 6432 mcontext points to a PCRE2 match context 6433 6434 Returns: > 0 => success; value is the number of ovector pairs filled 6435 = 0 => success, but ovector is not big enough 6436 -1 => failed to match (PCRE2_ERROR_NOMATCH) 6437 -2 => partial match (PCRE2_ERROR_PARTIAL) 6438 < -2 => some kind of unexpected problem 6439 */ 6440 6441 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 6442 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, 6443 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, 6444 pcre2_match_context *mcontext) 6445 { 6446 int rc; 6447 int ocount; 6448 6449 const uint8_t *start_bits = NULL; 6450 6451 const pcre2_real_code *re = (const pcre2_real_code *)code; 6452 6453 BOOL anchored; 6454 BOOL firstline; 6455 BOOL has_first_cu = 
FALSE; 6456 BOOL has_req_cu = FALSE; 6457 BOOL startline; 6458 BOOL using_temporary_offsets = FALSE; 6459 BOOL utf; 6460 6461 PCRE2_UCHAR first_cu = 0; 6462 PCRE2_UCHAR first_cu2 = 0; 6463 PCRE2_UCHAR req_cu = 0; 6464 PCRE2_UCHAR req_cu2 = 0; 6465 6466 PCRE2_SPTR bumpalong_limit; 6467 PCRE2_SPTR end_subject; 6468 PCRE2_SPTR start_match = subject + start_offset; 6469 PCRE2_SPTR req_cu_ptr = start_match - 1; 6470 PCRE2_SPTR start_partial = NULL; 6471 PCRE2_SPTR match_partial = NULL; 6472 6473 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro 6474 is used below, and it expects NLBLOCK to be defined as a pointer. */ 6475 6476 match_block actual_match_block; 6477 match_block *mb = &actual_match_block; 6478 6479 #ifdef HEAP_MATCH_RECURSE 6480 heapframe frame_zero; 6481 frame_zero.Xprevframe = NULL; /* Marks the top level */ 6482 frame_zero.Xnextframe = NULL; /* None are allocated yet */ 6483 mb->match_frames_base = &frame_zero; 6484 #endif 6485 6486 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated 6487 subject string. */ 6488 6489 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); 6490 end_subject = subject + length; 6491 6492 /* Plausibility checks */ 6493 6494 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; 6495 if (code == NULL || subject == NULL || match_data == NULL) 6496 return PCRE2_ERROR_NULL; 6497 if (start_offset > length) return PCRE2_ERROR_BADOFFSET; 6498 6499 /* Check that the first field in the block is the magic number. */ 6500 6501 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; 6502 6503 /* Check the code unit width. */ 6504 6505 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) 6506 return PCRE2_ERROR_BADMODE; 6507 6508 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the 6509 options variable for this function. 
Users of PCRE2 who are not calling the 6510 function directly would like to have a way of setting these flags, in the same 6511 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with 6512 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and 6513 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be 6514 transferred to the options for this function. The bits are guaranteed to be 6515 adjacent, but do not have the same values. This bit of Boolean trickery assumes 6516 that the match-time bits are not more significant than the flag bits. If by 6517 accident this is not the case, a compile-time division by zero error will 6518 occur. */ 6519 6520 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) 6521 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) 6522 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); 6523 #undef FF 6524 #undef OO 6525 6526 /* A NULL match context means "use a default context" */ 6527 6528 if (mcontext == NULL) 6529 mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); 6530 6531 /* These two settings are used in the code for checking a UTF string that 6532 follows immediately afterwards. Other values in the mb block are used only 6533 during interpretive pcre_match() processing, not when the JIT support is in 6534 use, so they are set up later. */ 6535 6536 utf = (re->overall_options & PCRE2_UTF) != 0; 6537 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : 6538 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; 6539 6540 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, 6541 we must also check that a starting offset does not point into the middle of a 6542 multiunit character. We check only the portion of the subject that is going to 6543 be inspected during matching - from the offset minus the maximum back reference 6544 to the given length. 
This saves time when a small part of a large subject is 6545 being matched by the use of a starting offset. Note that the maximum lookbehind 6546 is a number of characters, not code units. */ 6547 6548 #ifdef SUPPORT_UNICODE 6549 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) 6550 { 6551 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ 6552 6553 if (start_offset > 0) 6554 { 6555 #if PCRE2_CODE_UNIT_WIDTH != 32 6556 unsigned int i; 6557 if (start_match < end_subject && NOT_FIRSTCU(*start_match)) 6558 return PCRE2_ERROR_BADUTFOFFSET; 6559 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) 6560 { 6561 check_subject--; 6562 while (check_subject > subject && 6563 #if PCRE2_CODE_UNIT_WIDTH == 8 6564 (*check_subject & 0xc0) == 0x80) 6565 #else /* 16-bit */ 6566 (*check_subject & 0xfc00) == 0xdc00) 6567 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ 6568 check_subject--; 6569 } 6570 #else 6571 /* In the 32-bit library, one code unit equals one character. However, 6572 we cannot just subtract the lookbehind and then compare pointers, because 6573 a very large lookbehind could create an invalid pointer. */ 6574 6575 if (start_offset >= re->max_lookbehind) 6576 check_subject -= re->max_lookbehind; 6577 else 6578 check_subject = subject; 6579 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ 6580 } 6581 6582 /* Validate the relevant portion of the subject. After an error, adjust the 6583 offset to be an absolute offset in the whole string. */ 6584 6585 match_data->rc = PRIV(valid_utf)(check_subject, 6586 length - (check_subject - subject), &(match_data->startchar)); 6587 if (match_data->rc != 0) 6588 { 6589 match_data->startchar += check_subject - subject; 6590 return match_data->rc; 6591 } 6592 } 6593 #endif /* SUPPORT_UNICODE */ 6594 6595 /* It is an error to set an offset limit without setting the flag at compile 6596 time. 
*/ 6597 6598 if (mcontext->offset_limit != PCRE2_UNSET && 6599 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) 6600 return PCRE2_ERROR_BADOFFSETLIMIT; 6601 6602 /* If the pattern was successfully studied with JIT support, run the JIT 6603 executable instead of the rest of this function. Most options must be set at 6604 compile time for the JIT code to be usable. Fallback to the normal code path if 6605 an unsupported option is set or if JIT returns BADOPTION (which means that the 6606 selected normal or partial matching mode was not compiled). */ 6607 6608 #ifdef SUPPORT_JIT 6609 if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0) 6610 { 6611 rc = pcre2_jit_match(code, subject, length, start_offset, options, 6612 match_data, mcontext); 6613 if (rc != PCRE2_ERROR_JIT_BADOPTION) return rc; 6614 } 6615 #endif 6616 6617 /* Carry on with non-JIT matching. */ 6618 6619 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; 6620 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; 6621 startline = (re->flags & PCRE2_STARTLINE) != 0; 6622 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? 6623 end_subject : subject + mcontext->offset_limit; 6624 6625 /* Fill in the fields in the match block. 
*/ 6626 6627 mb->callout = mcontext->callout; 6628 mb->callout_data = mcontext->callout_data; 6629 mb->memctl = mcontext->memctl; 6630 #ifdef HEAP_MATCH_RECURSE 6631 mb->stack_memctl = mcontext->stack_memctl; 6632 #endif 6633 6634 mb->start_subject = subject; 6635 mb->start_offset = start_offset; 6636 mb->end_subject = end_subject; 6637 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; 6638 6639 mb->moptions = options; /* Match options */ 6640 mb->poptions = re->overall_options; /* Pattern options */ 6641 6642 mb->ignore_skip_arg = 0; 6643 mb->mark = mb->nomatch_mark = NULL; /* In case never set */ 6644 mb->recursive = NULL; /* No recursion at top level */ 6645 mb->ovecsave_chain = NULL; /* No ovecsave blocks yet */ 6646 mb->hitend = FALSE; 6647 6648 /* The name table is needed for finding all the numbers associated with a 6649 given name, for condition testing. The code follows the name table. */ 6650 6651 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); 6652 mb->name_count = re->name_count; 6653 mb->name_entry_size = re->name_entry_size; 6654 mb->start_code = mb->name_table + re->name_count * re->name_entry_size; 6655 6656 /* Limits set in the pattern override the match context only if they are 6657 smaller. */ 6658 6659 mb->match_limit = (mcontext->match_limit < re->limit_match)? 6660 mcontext->match_limit : re->limit_match; 6661 mb->match_limit_recursion = (mcontext->recursion_limit < re->limit_recursion)? 6662 mcontext->recursion_limit : re->limit_recursion; 6663 6664 /* Pointers to the individual character tables */ 6665 6666 mb->lcc = re->tables + lcc_offset; 6667 mb->fcc = re->tables + fcc_offset; 6668 mb->ctypes = re->tables + ctypes_offset; 6669 6670 /* Process the \R and newline settings. 
*/ 6671 6672 mb->bsr_convention = re->bsr_convention; 6673 mb->nltype = NLTYPE_FIXED; 6674 switch(re->newline_convention) 6675 { 6676 case PCRE2_NEWLINE_CR: 6677 mb->nllen = 1; 6678 mb->nl[0] = CHAR_CR; 6679 break; 6680 6681 case PCRE2_NEWLINE_LF: 6682 mb->nllen = 1; 6683 mb->nl[0] = CHAR_NL; 6684 break; 6685 6686 case PCRE2_NEWLINE_CRLF: 6687 mb->nllen = 2; 6688 mb->nl[0] = CHAR_CR; 6689 mb->nl[1] = CHAR_NL; 6690 break; 6691 6692 case PCRE2_NEWLINE_ANY: 6693 mb->nltype = NLTYPE_ANY; 6694 break; 6695 6696 case PCRE2_NEWLINE_ANYCRLF: 6697 mb->nltype = NLTYPE_ANYCRLF; 6698 break; 6699 6700 default: return PCRE2_ERROR_INTERNAL; 6701 } 6702 6703 /* If the expression has got more back references than the offsets supplied can 6704 hold, we get a temporary chunk of memory to use during the matching. Otherwise, 6705 we can use the vector supplied. The size of the ovector is three times the 6706 value in the oveccount field. Two-thirds of it is pairs for storing matching 6707 offsets, and the top third is working space. */ 6708 6709 if (re->top_backref >= match_data->oveccount) 6710 { 6711 ocount = re->top_backref * 3 + 3; 6712 mb->ovector = (PCRE2_SIZE *)(mb->memctl.malloc(ocount * sizeof(PCRE2_SIZE), 6713 mb->memctl.memory_data)); 6714 if (mb->ovector == NULL) return PCRE2_ERROR_NOMEMORY; 6715 using_temporary_offsets = TRUE; 6716 } 6717 else 6718 { 6719 ocount = 3 * match_data->oveccount; 6720 mb->ovector = match_data->ovector; 6721 } 6722 6723 mb->offset_end = ocount; 6724 mb->offset_max = (2*ocount)/3; 6725 6726 /* Reset the working variable associated with each extraction. These should 6727 never be used unless previously set, but they get saved and restored, and so we 6728 initialize them to avoid reading uninitialized locations. Also, unset the 6729 offsets for the matched string. This is really just for tidiness with callouts, 6730 in case they inspect these fields. 
*/ 6731 6732 if (ocount > 0) 6733 { 6734 register PCRE2_SIZE *iptr = mb->ovector + ocount; 6735 register PCRE2_SIZE *iend = iptr - re->top_bracket; 6736 if (iend < mb->ovector + 2) iend = mb->ovector + 2; 6737 while (--iptr >= iend) *iptr = PCRE2_UNSET; 6738 mb->ovector[0] = mb->ovector[1] = PCRE2_UNSET; 6739 } 6740 6741 /* Set up the first code unit to match, if available. The first_codeunit value 6742 is never set for an anchored regular expression, but the anchoring may be 6743 forced at run time, so we have to test for anchoring. The first code unit may 6744 be unset for an unanchored pattern, of course. If there's no first code unit 6745 there may be a bitmap of possible first characters. */ 6746 6747 if (!anchored) 6748 { 6749 if ((re->flags & PCRE2_FIRSTSET) != 0) 6750 { 6751 has_first_cu = TRUE; 6752 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); 6753 if ((re->flags & PCRE2_FIRSTCASELESS) != 0) 6754 { 6755 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); 6756 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 6757 if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); 6758 #endif 6759 } 6760 } 6761 else 6762 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) 6763 start_bits = re->start_bitmap; 6764 } 6765 6766 /* For anchored or unanchored matches, there may be a "last known required 6767 character" set. */ 6768 6769 if ((re->flags & PCRE2_LASTSET) != 0) 6770 { 6771 has_req_cu = TRUE; 6772 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); 6773 if ((re->flags & PCRE2_LASTCASELESS) != 0) 6774 { 6775 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); 6776 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 6777 if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); 6778 #endif 6779 } 6780 } 6781 6782 6783 /* ==========================================================================*/ 6784 6785 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 6786 the loop runs just once. 
*/ 6787 6788 for(;;) 6789 { 6790 PCRE2_SPTR new_start_match; 6791 mb->capture_last = 0; 6792 6793 /* ----------------- Start of match optimizations ---------------- */ 6794 6795 /* There are some optimizations that avoid running the match if a known 6796 starting point is not found, or if a known later code unit is not present. 6797 However, there is an option (settable at compile time) that disables these, 6798 for testing and for ensuring that all callouts do actually occur. */ 6799 6800 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) 6801 { 6802 PCRE2_SPTR save_end_subject = end_subject; 6803 6804 /* If firstline is TRUE, the start of the match is constrained to the first 6805 line of a multiline string. That is, the match must be before or at the 6806 first newline. Implement this by temporarily adjusting end_subject so that 6807 we stop the optimization scans at a newline. If the match fails at the 6808 newline, later code breaks this loop. */ 6809 6810 if (firstline) 6811 { 6812 PCRE2_SPTR t = start_match; 6813 #ifdef SUPPORT_UNICODE 6814 if (utf) 6815 { 6816 while (t < mb->end_subject && !IS_NEWLINE(t)) 6817 { 6818 t++; 6819 ACROSSCHAR(t < end_subject, *t, t++); 6820 } 6821 } 6822 else 6823 #endif 6824 while (t < mb->end_subject && !IS_NEWLINE(t)) t++; 6825 end_subject = t; 6826 } 6827 6828 /* Advance to a unique first code unit if there is one. In 8-bit mode, the 6829 use of memchr() gives a big speed up. 
*/ 6830 6831 if (has_first_cu) 6832 { 6833 PCRE2_UCHAR smc; 6834 if (first_cu != first_cu2) 6835 while (start_match < end_subject && 6836 (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2) 6837 start_match++; 6838 else 6839 { 6840 #if PCRE2_CODE_UNIT_WIDTH != 8 6841 while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu) 6842 start_match++; 6843 #else 6844 start_match = memchr(start_match, first_cu, end_subject - start_match); 6845 if (start_match == NULL) start_match = end_subject; 6846 #endif 6847 } 6848 } 6849 6850 /* Or to just after a linebreak for a multiline match */ 6851 6852 else if (startline) 6853 { 6854 if (start_match > mb->start_subject + start_offset) 6855 { 6856 #ifdef SUPPORT_UNICODE 6857 if (utf) 6858 { 6859 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6860 { 6861 start_match++; 6862 ACROSSCHAR(start_match < end_subject, *start_match, 6863 start_match++); 6864 } 6865 } 6866 else 6867 #endif 6868 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6869 start_match++; 6870 6871 /* If we have just passed a CR and the newline option is ANY or 6872 ANYCRLF, and we are now at a LF, advance the match position by one more 6873 code unit. */ 6874 6875 if (start_match[-1] == CHAR_CR && 6876 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && 6877 start_match < end_subject && 6878 UCHAR21TEST(start_match) == CHAR_NL) 6879 start_match++; 6880 } 6881 } 6882 6883 /* Or to a non-unique first code unit if any have been identified. The 6884 bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all 6885 code units greater than 254 set the 255 bit. 
*/ 6886 6887 else if (start_bits != NULL) 6888 { 6889 while (start_match < end_subject) 6890 { 6891 register uint32_t c = UCHAR21TEST(start_match); 6892 #if PCRE2_CODE_UNIT_WIDTH != 8 6893 if (c > 255) c = 255; 6894 #endif 6895 if ((start_bits[c/8] & (1 << (c&7))) != 0) break; 6896 start_match++; 6897 } 6898 } 6899 6900 /* Restore fudged end_subject */ 6901 6902 end_subject = save_end_subject; 6903 6904 /* The following two optimizations are disabled for partial matching. */ 6905 6906 if (!mb->partial) 6907 { 6908 /* The minimum matching length is a lower bound; no actual string of that 6909 length may actually match the pattern. Although the value is, strictly, 6910 in characters, we treat it as code units to avoid spending too much time 6911 in this optimization. */ 6912 6913 if (end_subject - start_match < re->minlength) 6914 { 6915 rc = MATCH_NOMATCH; 6916 break; 6917 } 6918 6919 /* If req_cu is set, we know that that code unit must appear in the 6920 subject for the match to succeed. If the first code unit is set, req_cu 6921 must be later in the subject; otherwise the test starts at the match 6922 point. This optimization can save a huge amount of backtracking in 6923 patterns with nested unlimited repeats that aren't going to match. 6924 Writing separate code for cased/caseless versions makes it go faster, as 6925 does using an autoincrement and backing off on a match. 6926 6927 HOWEVER: when the subject string is very, very long, searching to its end 6928 can take a long time, and give bad performance on quite ordinary 6929 patterns. This showed up when somebody was matching something like 6930 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is 6931 sufficiently long. */ 6932 6933 if (has_req_cu && end_subject - start_match < REQ_CU_MAX) 6934 { 6935 register PCRE2_SPTR p = start_match + (has_first_cu? 1:0); 6936 6937 /* We don't need to repeat the search if we haven't yet reached the 6938 place we found it at last time. 
*/ 6939 6940 if (p > req_cu_ptr) 6941 { 6942 if (req_cu != req_cu2) 6943 { 6944 while (p < end_subject) 6945 { 6946 register uint32_t pp = UCHAR21INCTEST(p); 6947 if (pp == req_cu || pp == req_cu2) { p--; break; } 6948 } 6949 } 6950 else 6951 { 6952 while (p < end_subject) 6953 { 6954 if (UCHAR21INCTEST(p) == req_cu) { p--; break; } 6955 } 6956 } 6957 6958 /* If we can't find the required code unit, break the matching loop, 6959 forcing a match failure. */ 6960 6961 if (p >= end_subject) 6962 { 6963 rc = MATCH_NOMATCH; 6964 break; 6965 } 6966 6967 /* If we have found the required code unit, save the point where we 6968 found it, so that we don't search again next time round the loop if 6969 the start hasn't passed this code unit yet. */ 6970 6971 req_cu_ptr = p; 6972 } 6973 } 6974 } 6975 } 6976 6977 /* ------------ End of start of match optimizations ------------ */ 6978 6979 /* Give no match if we have passed the bumpalong limit. */ 6980 6981 if (start_match > bumpalong_limit) 6982 { 6983 rc = MATCH_NOMATCH; 6984 break; 6985 } 6986 6987 /* OK, we can now run the match. If "hitend" is set afterwards, remember the 6988 first starting point for which a partial match was found. */ 6989 6990 mb->start_match_ptr = start_match; 6991 mb->start_used_ptr = start_match; 6992 mb->last_used_ptr = start_match; 6993 mb->match_call_count = 0; 6994 mb->match_function_type = 0; 6995 mb->end_offset_top = 0; 6996 mb->skip_arg_count = 0; 6997 rc = match(start_match, mb->start_code, start_match, 2, mb, NULL, 0); 6998 6999 if (mb->hitend && start_partial == NULL) 7000 { 7001 start_partial = mb->start_used_ptr; 7002 match_partial = start_match; 7003 } 7004 7005 switch(rc) 7006 { 7007 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched 7008 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP 7009 entirely. The only way we can do that is to re-do the match at the same 7010 point, with a flag to force SKIP with an argument to be ignored. 
Just 7011 treating this case as NOMATCH does not work because it does not check other 7012 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ 7013 7014 case MATCH_SKIP_ARG: 7015 new_start_match = start_match; 7016 mb->ignore_skip_arg = mb->skip_arg_count; 7017 break; 7018 7019 /* SKIP passes back the next starting point explicitly, but if it is no 7020 greater than the match we have just done, treat it as NOMATCH. */ 7021 7022 case MATCH_SKIP: 7023 if (mb->start_match_ptr > start_match) 7024 { 7025 new_start_match = mb->start_match_ptr; 7026 break; 7027 } 7028 /* Fall through */ 7029 7030 /* NOMATCH and PRUNE advance by one character. THEN at this level acts 7031 exactly like PRUNE. Unset ignore SKIP-with-argument. */ 7032 7033 case MATCH_NOMATCH: 7034 case MATCH_PRUNE: 7035 case MATCH_THEN: 7036 mb->ignore_skip_arg = 0; 7037 new_start_match = start_match + 1; 7038 #ifdef SUPPORT_UNICODE 7039 if (utf) 7040 ACROSSCHAR(new_start_match < end_subject, *new_start_match, 7041 new_start_match++); 7042 #endif 7043 break; 7044 7045 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ 7046 7047 case MATCH_COMMIT: 7048 rc = MATCH_NOMATCH; 7049 goto ENDLOOP; 7050 7051 /* Any other return is either a match, or some kind of error. */ 7052 7053 default: 7054 goto ENDLOOP; 7055 } 7056 7057 /* Control reaches here for the various types of "no match at this point" 7058 result. Reset the code to MATCH_NOMATCH for subsequent checking. */ 7059 7060 rc = MATCH_NOMATCH; 7061 7062 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first 7063 newline in the subject (though it may continue over the newline). Therefore, 7064 if we have just failed to match, starting at a newline, do not continue. 
*/ 7065 7066 if (firstline && IS_NEWLINE(start_match)) break; 7067 7068 /* Advance to new matching position */ 7069 7070 start_match = new_start_match; 7071 7072 /* Break the loop if the pattern is anchored or if we have passed the end of 7073 the subject. */ 7074 7075 if (anchored || start_match > end_subject) break; 7076 7077 /* If we have just passed a CR and we are now at a LF, and the pattern does 7078 not contain any explicit matches for \r or \n, and the newline option is CRLF 7079 or ANY or ANYCRLF, advance the match position by one more code unit. In 7080 normal matching start_match will always be greater than the first position at 7081 this stage, but a failed *SKIP can cause a return at the same point, which is 7082 why the first test exists. */ 7083 7084 if (start_match > subject + start_offset && 7085 start_match[-1] == CHAR_CR && 7086 start_match < end_subject && 7087 *start_match == CHAR_NL && 7088 (re->flags & PCRE2_HASCRORLF) == 0 && 7089 (mb->nltype == NLTYPE_ANY || 7090 mb->nltype == NLTYPE_ANYCRLF || 7091 mb->nllen == 2)) 7092 start_match++; 7093 7094 mb->mark = NULL; /* Reset for start of next match attempt */ 7095 } /* End of for(;;) "bumpalong" loop */ 7096 7097 /* ==========================================================================*/ 7098 7099 /* When we reach here, one of the stopping conditions is true: 7100 7101 (1) The match succeeded, either completely, or partially; 7102 7103 (2) The pattern is anchored or the match was failed by (*COMMIT); 7104 7105 (3) We are past the end of the subject or the bumpalong limit; 7106 7107 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because 7108 this option requests that a match occur at or before the first newline in 7109 the subject. 7110 7111 (5) Some kind of error occurred. 7112 7113 */ 7114 7115 ENDLOOP: 7116 7117 #ifdef HEAP_MATCH_RECURSE 7118 release_match_heapframes(&frame_zero, mb); 7119 #endif 7120 7121 /* Release any frames that were saved from recursions.
*/ 7122 7123 while (mb->ovecsave_chain != NULL) 7124 { 7125 ovecsave_frame *this = mb->ovecsave_chain; 7126 mb->ovecsave_chain = this->next; 7127 mb->memctl.free(this, mb->memctl.memory_data); 7128 } 7129 7130 /* Fill in fields that are always returned in the match data. */ 7131 7132 match_data->code = re; 7133 match_data->subject = subject; 7134 match_data->mark = mb->mark; 7135 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; 7136 7137 /* Handle a fully successful match. */ 7138 7139 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) 7140 { 7141 uint32_t arg_offset_max = 2 * match_data->oveccount; 7142 7143 /* When the offset vector is big enough to deal with any backreferences, 7144 captured substring offsets will already be set up. In the case where we had 7145 to get some local memory to hold offsets for backreference processing, copy 7146 those that we can. In this case there need not be overflow if certain parts 7147 of the pattern were not used, even though there are more capturing 7148 parentheses than vector slots. */ 7149 7150 if (using_temporary_offsets) 7151 { 7152 if (arg_offset_max >= 4) 7153 { 7154 memcpy(match_data->ovector + 2, mb->ovector + 2, 7155 (arg_offset_max - 2) * sizeof(PCRE2_SIZE)); 7156 } 7157 if (mb->end_offset_top > arg_offset_max) mb->capture_last |= OVFLBIT; 7158 mb->memctl.free(mb->ovector, mb->memctl.memory_data); 7159 } 7160 7161 /* Set the return code to the number of captured strings, or 0 if there were 7162 too many to fit into the ovector. */ 7163 7164 match_data->rc = ((mb->capture_last & OVFLBIT) != 0)? 7165 0 : mb->end_offset_top/2; 7166 7167 /* If there is space in the offset vector, set any pairs that follow the 7168 highest-numbered captured string but are less than the number of capturing 7169 groups in the pattern (and are within the ovector) to PCRE2_UNSET. It is 7170 documented that this happens. 
In earlier versions, the whole set of potential 7171 capturing offsets was initialized each time round the loop, but this is 7172 handled differently now. "Gaps" are set to PCRE2_UNSET dynamically instead 7173 (this fixed a bug). Thus, it is only those at the end that need setting here. 7174 We can't just mark them all unset at the start of the whole thing because 7175 they may get set in one branch that is not the final matching branch. */ 7176 7177 if (mb->end_offset_top/2 <= re->top_bracket) 7178 { 7179 register PCRE2_SIZE *iptr, *iend; 7180 int resetcount = re->top_bracket + 1; 7181 if (resetcount > match_data->oveccount) resetcount = match_data->oveccount; 7182 iptr = match_data->ovector + mb->end_offset_top; 7183 iend = match_data->ovector + 2 * resetcount; 7184 while (iptr < iend) *iptr++ = PCRE2_UNSET; 7185 } 7186 7187 /* If there is space, set up the whole thing as substring 0. The value of 7188 mb->start_match_ptr might be modified if \K was encountered on the success 7189 matching path. */ 7190 7191 if (match_data->oveccount < 1) rc = 0; else 7192 { 7193 match_data->ovector[0] = mb->start_match_ptr - mb->start_subject; 7194 match_data->ovector[1] = mb->end_match_ptr - mb->start_subject; 7195 } 7196 7197 /* Set the remaining returned values */ 7198 7199 match_data->startchar = start_match - subject; 7200 match_data->leftchar = mb->start_used_ptr - subject; 7201 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? 7202 mb->last_used_ptr : mb->end_match_ptr) - subject; 7203 return match_data->rc; 7204 } 7205 7206 /* Control gets here if there has been a partial match, an error, or if the 7207 overall match attempt has failed at all permitted starting positions. Any mark 7208 data is in the nomatch_mark field. */ 7209 7210 match_data->mark = mb->nomatch_mark; 7211 7212 /* For anything other than nomatch or partial match, just return the code. 
*/ 7213 7214 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) 7215 match_data->rc = rc; 7216 7217 /* Else handle a partial match. */ 7218 7219 else if (match_partial != NULL) 7220 { 7221 if (match_data->oveccount > 0) 7222 { 7223 match_data->ovector[0] = match_partial - subject; 7224 match_data->ovector[1] = end_subject - subject; 7225 } 7226 match_data->startchar = match_partial - subject; 7227 match_data->leftchar = start_partial - subject; 7228 match_data->rightchar = end_subject - subject; 7229 match_data->rc = PCRE2_ERROR_PARTIAL; 7230 } 7231 7232 /* Else this is the classic nomatch case. */ 7233 7234 else match_data->rc = PCRE2_ERROR_NOMATCH; 7235 7236 /* Free any temporary offsets. */ 7237 7238 if (using_temporary_offsets) 7239 mb->memctl.free(mb->ovector, mb->memctl.memory_data); 7240 return match_data->rc; 7241 } 7242 7243 /* End of pcre2_match.c */ 7244