1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2015-2018 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 42 #ifdef HAVE_CONFIG_H 43 #include "config.h" 44 #endif 45 46 /* These defines enable debugging code */ 47 48 /* #define DEBUG_FRAMES_DISPLAY */ 49 /* #define DEBUG_SHOW_OPS */ 50 /* #define DEBUG_SHOW_RMATCH */ 51 52 #ifdef DEBUG_FRAME_DISPLAY 53 #include <stdarg.h> 54 #endif 55 56 /* These defines identify the name of the block containing "static" 57 information, and fields within it. */ 58 59 #define NLBLOCK mb /* Block containing newline information */ 60 #define PSSTART start_subject /* Field containing processed string start */ 61 #define PSEND end_subject /* Field containing processed string end */ 62 63 #include "pcre2_internal.h" 64 65 #define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */ 66 67 /* Masks for identifying the public options that are permitted at match time. */ 68 69 #define PUBLIC_MATCH_OPTIONS \ 70 (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ 71 PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ 72 PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT) 73 74 #define PUBLIC_JIT_MATCH_OPTIONS \ 75 (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ 76 PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD) 77 78 /* Non-error returns from and within the match() function. Error returns are 79 externally defined PCRE2_ERROR_xxx codes, which are all negative. 
*/ 80 81 #define MATCH_MATCH 1 82 #define MATCH_NOMATCH 0 83 84 /* Special internal returns used in the match() function. Make them 85 sufficiently negative to avoid the external error codes. */ 86 87 #define MATCH_ACCEPT (-999) 88 #define MATCH_KETRPOS (-998) 89 /* The next 5 must be kept together and in sequence so that a test that checks 90 for any one of them can use a range. */ 91 #define MATCH_COMMIT (-997) 92 #define MATCH_PRUNE (-996) 93 #define MATCH_SKIP (-995) 94 #define MATCH_SKIP_ARG (-994) 95 #define MATCH_THEN (-993) 96 #define MATCH_BACKTRACK_MAX MATCH_THEN 97 #define MATCH_BACKTRACK_MIN MATCH_COMMIT 98 99 /* Group frame type values. Zero means the frame is not a group frame. The 100 lower 16 bits are used for data (e.g. the capture number). Group frames are 101 used for most groups so that information about the start is easily available at 102 the end without having to scan back through intermediate frames (backtrack 103 points). */ 104 105 #define GF_CAPTURE 0x00010000u 106 #define GF_NOCAPTURE 0x00020000u 107 #define GF_CONDASSERT 0x00030000u 108 #define GF_RECURSE 0x00040000u 109 110 /* Masks for the identity and data parts of the group frame type. */ 111 112 #define GF_IDMASK(a) ((a) & 0xffff0000u) 113 #define GF_DATAMASK(a) ((a) & 0x0000ffffu) 114 115 /* Repetition types */ 116 117 enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS }; 118 119 /* Min and max values for the common repeats; a maximum of UINT32_MAX => 120 infinity. */ 121 122 static const uint32_t rep_min[] = { 123 0, 0, /* * and *? */ 124 1, 1, /* + and +? */ 125 0, 0, /* ? and ?? */ 126 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ 127 0, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ 128 129 static const uint32_t rep_max[] = { 130 UINT32_MAX, UINT32_MAX, /* * and *? */ 131 UINT32_MAX, UINT32_MAX, /* + and +? */ 132 1, 1, /* ? and ?? 
*/ 133 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */ 134 UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */ 135 136 /* Repetition types - must include OP_CRPOSRANGE (not needed above) */ 137 138 static const uint32_t rep_typ[] = { 139 REPTYPE_MAX, REPTYPE_MIN, /* * and *? */ 140 REPTYPE_MAX, REPTYPE_MIN, /* + and +? */ 141 REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */ 142 REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */ 143 REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */ 144 REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */ 145 146 /* Numbers for RMATCH calls at backtracking points. When these lists are 147 changed, the code at RETURN_SWITCH below must be updated in sync. */ 148 149 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, 150 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, 151 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, 152 RM31, RM32, RM33, RM34, RM35, RM36 }; 153 154 #ifdef SUPPORT_WIDE_CHARS 155 enum { RM100=100, RM101 }; 156 #endif 157 158 #ifdef SUPPORT_UNICODE 159 enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207, 160 RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215, 161 RM216, RM217, RM218, RM219, RM220, RM221, RM222 }; 162 #endif 163 164 /* Define short names for general fields in the current backtrack frame, which 165 is always pointed to by the F variable. Occasional references to fields in 166 other frames are written out explicitly. There are also some fields in the 167 current frame whose names start with "temp" that are used for short-term, 168 localised backtracking memory. These are #defined with Lxxx names at the point 169 of use and undefined afterwards. 
*/ 170 171 #define Fback_frame F->back_frame 172 #define Fcapture_last F->capture_last 173 #define Fcurrent_recurse F->current_recurse 174 #define Fecode F->ecode 175 #define Feptr F->eptr 176 #define Fgroup_frame_type F->group_frame_type 177 #define Flast_group_offset F->last_group_offset 178 #define Flength F->length 179 #define Fmark F->mark 180 #define Frdepth F->rdepth 181 #define Fstart_match F->start_match 182 #define Foffset_top F->offset_top 183 #define Foccu F->occu 184 #define Fop F->op 185 #define Fovector F->ovector 186 #define Freturn_id F->return_id 187 188 189 #ifdef DEBUG_FRAMES_DISPLAY 190 /************************************************* 191 * Display current frames and contents * 192 *************************************************/ 193 194 /* This debugging function displays the current set of frames and their 195 contents. It is not called automatically from anywhere, the intention being 196 that calls can be inserted where necessary when debugging frame-related 197 problems. 198 199 Arguments: 200 f the file to write to 201 F the current top frame 202 P a previous frame of interest 203 frame_size the frame size 204 mb points to the match block 205 s identification text 206 207 Returns: nothing 208 */ 209 210 static void 211 display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size, 212 match_block *mb, const char *s, ...) 
213 { 214 uint32_t i; 215 heapframe *Q; 216 va_list ap; 217 va_start(ap, s); 218 219 fprintf(f, "FRAMES "); 220 vfprintf(f, s, ap); 221 va_end(ap); 222 223 if (P != NULL) fprintf(f, " P=%lu", 224 ((char *)P - (char *)(mb->match_frames))/frame_size); 225 fprintf(f, "\n"); 226 227 for (i = 0, Q = mb->match_frames; 228 Q <= F; 229 i++, Q = (heapframe *)((char *)Q + frame_size)) 230 { 231 fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d", 232 i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode), 233 Q->back_frame, Q->return_id); 234 235 if (Q->last_group_offset == PCRE2_UNSET) 236 fprintf(f, " lgoffset=unset\n"); 237 else 238 fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size); 239 } 240 } 241 242 #endif 243 244 245 246 /************************************************* 247 * Process a callout * 248 *************************************************/ 249 250 /* This function is called for all callouts, whether "standalone" or at the 251 start of a conditional group. Feptr will be pointing to either OP_CALLOUT or 252 OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized 253 with fixed values. 254 255 Arguments: 256 F points to the current backtracking frame 257 mb points to the match block 258 lengthptr where to return the length of the callout item 259 260 Returns: the return from the callout 261 or 0 if no callout function exists 262 */ 263 264 static int 265 do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) 266 { 267 int rc; 268 PCRE2_SIZE save0, save1; 269 PCRE2_SIZE *callout_ovector; 270 pcre2_callout_block *cb; 271 272 *lengthptr = (*Fecode == OP_CALLOUT)? 273 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); 274 275 if (mb->callout == NULL) return 0; /* No callout function provided */ 276 277 /* The original matching code (pre 10.30) worked directly with the ovector 278 passed by the user, and this was passed to callouts. 
Now that the working 279 ovector is in the backtracking frame, it no longer needs to reserve space for 280 the overall match offsets (which would waste space in the frame). For backward 281 compatibility, however, we pass capture_top and offset_vector to the callout as 282 if for the extended ovector, and we ensure that the first two slots are unset 283 by preserving and restoring their current contents. Picky compilers complain if 284 references such as Fovector[-2] are use directly, so we set up a separate 285 pointer. */ 286 287 callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; 288 289 /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields 290 are set externally. The first 3 never change; the last is updated for each 291 bumpalong. */ 292 293 cb = mb->cb; 294 cb->capture_top = (uint32_t)Foffset_top/2 + 1; 295 cb->capture_last = Fcapture_last; 296 cb->offset_vector = callout_ovector; 297 cb->mark = mb->nomatch_mark; 298 cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject); 299 cb->pattern_position = GET(Fecode, 1); 300 cb->next_item_length = GET(Fecode, 1 + LINK_SIZE); 301 302 if (*Fecode == OP_CALLOUT) /* Numerical callout */ 303 { 304 cb->callout_number = Fecode[1 + 2*LINK_SIZE]; 305 cb->callout_string_offset = 0; 306 cb->callout_string = NULL; 307 cb->callout_string_length = 0; 308 } 309 else /* String callout */ 310 { 311 cb->callout_number = 0; 312 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE); 313 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1; 314 cb->callout_string_length = 315 *lengthptr - (1 + 4*LINK_SIZE) - 2; 316 } 317 318 save0 = callout_ovector[0]; 319 save1 = callout_ovector[1]; 320 callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET; 321 rc = mb->callout(cb, mb->callout_data); 322 callout_ovector[0] = save0; 323 callout_ovector[1] = save1; 324 cb->callout_flags = 0; 325 return rc; 326 } 327 328 329 330 /************************************************* 331 * Match a back-reference * 332 
*************************************************/ 333 334 /* This function is called only when it is known that the offset lies within 335 the offsets that have so far been used in the match. Note that in caseless 336 UTF-8 mode, the number of subject bytes matched may be different to the number 337 of reference bytes. (In theory this could also happen in UTF-16 mode, but it 338 seems unlikely.) 339 340 Arguments: 341 offset index into the offset vector 342 caseless TRUE if caseless 343 F the current backtracking frame pointer 344 mb points to match block 345 lengthptr pointer for returning the length matched 346 347 Returns: = 0 sucessful match; number of code units matched is set 348 < 0 no match 349 > 0 partial match 350 */ 351 352 static int 353 match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb, 354 PCRE2_SIZE *lengthptr) 355 { 356 PCRE2_SPTR p; 357 PCRE2_SIZE length; 358 PCRE2_SPTR eptr; 359 PCRE2_SPTR eptr_start; 360 361 /* Deal with an unset group. The default is no match, but there is an option to 362 match an empty string. */ 363 364 if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) 365 { 366 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) 367 { 368 *lengthptr = 0; 369 return 0; /* Match */ 370 } 371 else return -1; /* No match */ 372 } 373 374 /* Separate the caseless and UTF cases for speed. */ 375 376 eptr = eptr_start = Feptr; 377 p = mb->start_subject + Fovector[offset]; 378 length = Fovector[offset+1] - Fovector[offset]; 379 380 if (caseless) 381 { 382 #if defined SUPPORT_UNICODE 383 if ((mb->poptions & PCRE2_UTF) != 0) 384 { 385 /* Match characters up to the end of the reference. NOTE: the number of 386 code units matched may differ, because in UTF-8 there are some characters 387 whose upper and lower case codes have different numbers of bytes. 
For 388 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3 389 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a 390 sequence of two of the latter. It is important, therefore, to check the 391 length along the reference, not along the subject (earlier code did this 392 wrong). */ 393 394 PCRE2_SPTR endptr = p + length; 395 while (p < endptr) 396 { 397 uint32_t c, d; 398 const ucd_record *ur; 399 if (eptr >= mb->end_subject) return 1; /* Partial match */ 400 GETCHARINC(c, eptr); 401 GETCHARINC(d, p); 402 ur = GET_UCD(d); 403 if (c != d && c != (uint32_t)((int)d + ur->other_case)) 404 { 405 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; 406 for (;;) 407 { 408 if (c < *pp) return -1; /* No match */ 409 if (c == *pp++) break; 410 } 411 } 412 } 413 } 414 else 415 #endif 416 417 /* Not in UTF mode */ 418 419 { 420 for (; length > 0; length--) 421 { 422 uint32_t cc, cp; 423 if (eptr >= mb->end_subject) return 1; /* Partial match */ 424 cc = UCHAR21TEST(eptr); 425 cp = UCHAR21TEST(p); 426 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) 427 return -1; /* No match */ 428 p++; 429 eptr++; 430 } 431 } 432 } 433 434 /* In the caseful case, we can just compare the code units, whether or not we 435 are in UTF mode. When partial matching, we have to do this unit-by-unit. 
*/ 436 437 else 438 { 439 if (mb->partial != 0) 440 { 441 for (; length > 0; length--) 442 { 443 if (eptr >= mb->end_subject) return 1; /* Partial match */ 444 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */ 445 } 446 } 447 448 /* Not partial matching */ 449 450 else 451 { 452 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */ 453 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */ 454 eptr += length; 455 } 456 } 457 458 *lengthptr = eptr - eptr_start; 459 return 0; /* Match */ 460 } 461 462 463 464 /****************************************************************************** 465 ******************************************************************************* 466 "Recursion" in the match() function 467 468 The original match() function was highly recursive, but this proved to be the 469 source of a number of problems over the years, mostly because of the relatively 470 small system stacks that are commonly found. As new features were added to 471 patterns, various kludges were invented to reduce the amount of stack used, 472 making the code hard to understand in places. 473 474 A version did exist that used individual frames on the heap instead of calling 475 match() recursively, but this ran substantially slower. The current version is 476 a refactoring that uses a vector of frames to remember backtracking points. 477 This runs no slower, and possibly even a bit faster than the original recursive 478 implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe 479 50 frames) is allocated on the system stack. If this is not big enough, the 480 heap is used for a larger vector. 
481 482 ******************************************************************************* 483 ******************************************************************************/ 484 485 486 487 488 /************************************************* 489 * Macros for the match() function * 490 *************************************************/ 491 492 /* These macros pack up tests that are used for partial matching several times 493 in the code. We set the "hit end" flag if the pointer is at the end of the 494 subject and also past the earliest inspected character (i.e. something has been 495 matched, even if not part of the actual matched string). For hard partial 496 matching, we then return immediately. The second one is used when we already 497 know we are past the end of the subject. */ 498 499 #define CHECK_PARTIAL()\ 500 if (mb->partial != 0 && Feptr >= mb->end_subject && \ 501 Feptr > mb->start_used_ptr) \ 502 { \ 503 mb->hitend = TRUE; \ 504 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ 505 } 506 507 #define SCHECK_PARTIAL()\ 508 if (mb->partial != 0 && Feptr > mb->start_used_ptr) \ 509 { \ 510 mb->hitend = TRUE; \ 511 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ 512 } 513 514 /* These macros are used to implement backtracking. They simulate a recursive 515 call to the match() function by means of a local vector of frames which 516 remember the backtracking points. */ 517 518 #define RMATCH(ra,rb)\ 519 {\ 520 start_ecode = ra;\ 521 Freturn_id = rb;\ 522 goto MATCH_RECURSE;\ 523 L_##rb:;\ 524 } 525 526 #define RRETURN(ra)\ 527 {\ 528 rrc = ra;\ 529 goto RETURN_SWITCH;\ 530 } 531 532 533 534 /************************************************* 535 * Match from current position * 536 *************************************************/ 537 538 /* This function is called to run one match attempt at a single starting point 539 in the subject. 540 541 Performance note: It might be tempting to extract commonly used fields from the 542 mb structure (e.g. 
end_subject) into individual variables to improve 543 performance. Tests using gcc on a SPARC disproved this; in the first case, it 544 made performance worse. 545 546 Arguments: 547 start_eptr starting character in subject 548 start_ecode starting position in compiled code 549 ovector pointer to the final output vector 550 oveccount number of pairs in ovector 551 top_bracket number of capturing parentheses in the pattern 552 frame_size size of each backtracking frame 553 mb pointer to "static" variables block 554 555 Returns: MATCH_MATCH if matched ) these values are >= 0 556 MATCH_NOMATCH if failed to match ) 557 negative MATCH_xxx value for PRUNE, SKIP, etc 558 negative PCRE2_ERROR_xxx value if aborted by an error condition 559 (e.g. stopped by repeated call or depth limit) 560 */ 561 562 static int 563 match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector, 564 uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size, 565 match_block *mb) 566 { 567 /* Frame-handling variables */ 568 569 heapframe *F; /* Current frame pointer */ 570 heapframe *N = NULL; /* Temporary frame pointers */ 571 heapframe *P = NULL; 572 heapframe *assert_accept_frame; /* For passing back the frame with captures */ 573 PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ 574 575 /* Local variables that do not need to be preserved over calls to RRMATCH(). 
*/ 576 577 PCRE2_SPTR bracode; /* Temp pointer to start of group */ 578 PCRE2_SIZE offset; /* Used for group offsets */ 579 PCRE2_SIZE length; /* Used for various length calculations */ 580 581 int rrc; /* Return from functions & backtracking "recursions" */ 582 #ifdef SUPPORT_UNICODE 583 int proptype; /* Type of character property */ 584 #endif 585 586 uint32_t i; /* Used for local loops */ 587 uint32_t fc; /* Character values */ 588 uint32_t number; /* Used for group and other numbers */ 589 uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */ 590 uint32_t group_frame_type; /* Specifies type for new group frames */ 591 592 BOOL condition; /* Used in conditional groups */ 593 BOOL cur_is_word; /* Used in "word" tests */ 594 BOOL prev_is_word; /* Used in "word" tests */ 595 596 /* UTF flag */ 597 598 #ifdef SUPPORT_UNICODE 599 BOOL utf = (mb->poptions & PCRE2_UTF) != 0; 600 #else 601 BOOL utf = FALSE; 602 #endif 603 604 /* This is the length of the last part of a backtracking frame that must be 605 copied when a new frame is created. */ 606 607 frame_copy_size = frame_size - offsetof(heapframe, eptr); 608 609 /* Set up the first current frame at the start of the vector, and initialize 610 fields that are not reset for new frames. */ 611 612 F = mb->match_frames; 613 Frdepth = 0; /* "Recursion" depth */ 614 Fcapture_last = 0; /* Number of most recent capture */ 615 Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */ 616 Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */ 617 Fmark = NULL; /* Most recent mark */ 618 Foffset_top = 0; /* End of captures within the frame */ 619 Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */ 620 group_frame_type = 0; /* Not a start of group frame */ 621 goto NEW_FRAME; /* Start processing with this frame */ 622 623 /* Come back here when we want to create a new frame for remembering a 624 backtracking point. 
*/ 625 626 MATCH_RECURSE: 627 628 /* Set up a new backtracking frame. If the vector is full, get a new one 629 on the heap, doubling the size, but constrained by the heap limit. */ 630 631 N = (heapframe *)((char *)F + frame_size); 632 if (N >= mb->match_frames_top) 633 { 634 PCRE2_SIZE newsize = mb->frame_vector_size * 2; 635 heapframe *new; 636 637 if ((newsize / 1024) > mb->heap_limit) 638 { 639 PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size; 640 if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT; 641 newsize = maxsize; 642 } 643 644 new = mb->memctl.malloc(newsize, mb->memctl.memory_data); 645 if (new == NULL) return PCRE2_ERROR_NOMEMORY; 646 memcpy(new, mb->match_frames, mb->frame_vector_size); 647 648 F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames)); 649 N = (heapframe *)((char *)F + frame_size); 650 651 if (mb->match_frames != mb->stack_frames) 652 mb->memctl.free(mb->match_frames, mb->memctl.memory_data); 653 mb->match_frames = new; 654 mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize); 655 mb->frame_vector_size = newsize; 656 } 657 658 #ifdef DEBUG_SHOW_RMATCH 659 fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1); 660 if (group_frame_type != 0) 661 { 662 fprintf(stderr, " type=%x ", group_frame_type); 663 switch (GF_IDMASK(group_frame_type)) 664 { 665 case GF_CAPTURE: 666 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type)); 667 break; 668 669 case GF_NOCAPTURE: 670 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type)); 671 break; 672 673 case GF_CONDASSERT: 674 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type)); 675 break; 676 677 case GF_RECURSE: 678 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type)); 679 break; 680 681 default: 682 fprintf(stderr, "*** unknown ***"); 683 break; 684 } 685 } 686 fprintf(stderr, "\n"); 687 #endif 688 689 /* Copy those fields that must be copied into the new frame, increase the 
690 "recursion" depth (i.e. the new frame's index) and then make the new frame 691 current. */ 692 693 memcpy((char *)N + offsetof(heapframe, eptr), 694 (char *)F + offsetof(heapframe, eptr), 695 frame_copy_size); 696 697 N->rdepth = Frdepth + 1; 698 F = N; 699 700 /* Carry on processing with a new frame. */ 701 702 NEW_FRAME: 703 Fgroup_frame_type = group_frame_type; 704 Fecode = start_ecode; /* Starting code pointer */ 705 Fback_frame = frame_size; /* Default is go back one frame */ 706 707 /* If this is a special type of group frame, remember its offset for quick 708 access at the end of the group. If this is a recursion, set a new current 709 recursion value. */ 710 711 if (group_frame_type != 0) 712 { 713 Flast_group_offset = (char *)F - (char *)mb->match_frames; 714 if (GF_IDMASK(group_frame_type) == GF_RECURSE) 715 Fcurrent_recurse = GF_DATAMASK(group_frame_type); 716 group_frame_type = 0; 717 } 718 719 720 /* ========================================================================= */ 721 /* This is the main processing loop. First check that we haven't recorded too 722 many backtracks (search tree is too large), or that we haven't exceeded the 723 recursive depth limit (used too many backtracking frames). If not, process the 724 opcodes. */ 725 726 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; 727 if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; 728 729 for (;;) 730 { 731 #ifdef DEBUG_SHOW_OPS 732 fprintf(stderr, "++ op=%d\n", *Fecode); 733 #endif 734 735 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */ 736 switch(Fop) 737 { 738 /* ===================================================================== */ 739 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close 740 any currently open capturing brackets. 
Unlike reaching the end of a group, 741 where we know the starting frame is at the top of the chained frames, in 742 this case we have to search back for the relevant frame in case other types 743 of group that use chained frames have intervened. Multiple OP_CLOSEs always 744 come innermost first, which matches the chain order. We can ignore this in 745 a recursion, because captures are not passed out of recursions. */ 746 747 case OP_CLOSE: 748 if (Fcurrent_recurse == RECURSE_UNSET) 749 { 750 number = GET2(Fecode, 1); 751 offset = Flast_group_offset; 752 for(;;) 753 { 754 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; 755 N = (heapframe *)((char *)mb->match_frames + offset); 756 P = (heapframe *)((char *)N - frame_size); 757 if (N->group_frame_type == (GF_CAPTURE | number)) break; 758 offset = P->last_group_offset; 759 } 760 offset = (number << 1) - 2; 761 Fcapture_last = number; 762 Fovector[offset] = P->eptr - mb->start_subject; 763 Fovector[offset+1] = Feptr - mb->start_subject; 764 if (offset >= Foffset_top) Foffset_top = offset + 2; 765 } 766 Fecode += PRIV(OP_lengths)[*Fecode]; 767 break; 768 769 770 /* ===================================================================== */ 771 /* Real or forced end of the pattern, assertion, or recursion. In an 772 assertion ACCEPT, update the last used pointer and remember the current 773 frame so that the captures and mark can be fished out of it. */ 774 775 case OP_ASSERT_ACCEPT: 776 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; 777 assert_accept_frame = F; 778 RRETURN(MATCH_ACCEPT); 779 780 /* If recursing, we have to find the most recent recursion. */ 781 782 case OP_ACCEPT: 783 case OP_END: 784 785 /* Handle end of a recursion. 
*/ 786 787 if (Fcurrent_recurse != RECURSE_UNSET) 788 { 789 offset = Flast_group_offset; 790 for(;;) 791 { 792 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; 793 N = (heapframe *)((char *)mb->match_frames + offset); 794 P = (heapframe *)((char *)N - frame_size); 795 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break; 796 offset = P->last_group_offset; 797 } 798 799 /* N is now the frame of the recursion; the previous frame is at the 800 OP_RECURSE position. Go back there, copying the current subject position 801 and mark, and move on past the OP_RECURSE. */ 802 803 P->eptr = Feptr; 804 P->mark = Fmark; 805 F = P; 806 Fecode += 1 + LINK_SIZE; 807 continue; 808 } 809 810 /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY 811 is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the 812 start of the subject. In both cases, backtracking will then try other 813 alternatives, if any. */ 814 815 if (Feptr == Fstart_match && 816 ((mb->moptions & PCRE2_NOTEMPTY) != 0 || 817 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && 818 Fstart_match == mb->start_subject + mb->start_offset))) 819 RRETURN(MATCH_NOMATCH); 820 821 /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not 822 the end of the subject. After (*ACCEPT) we fail the entire match (at this 823 position) but backtrack on reaching the end of the pattern. */ 824 825 if (Feptr < mb->end_subject && 826 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0) 827 { 828 if (Fop == OP_END) RRETURN(MATCH_NOMATCH); 829 return MATCH_NOMATCH; 830 } 831 832 /* We have a successful match of the whole pattern. Record the result and 833 then do a direct return from the function. If there is space in the offset 834 vector, set any pairs that follow the highest-numbered captured string but 835 are less than the number of capturing groups in the pattern to PCRE2_UNSET. 836 It is documented that this happens. "Gaps" are set to PCRE2_UNSET 837 dynamically. 
It is only those at the end that need setting here. */ 838 839 mb->end_match_ptr = Feptr; /* Record where we ended */ 840 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */ 841 mb->mark = Fmark; /* and the last success mark */ 842 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; 843 844 ovector[0] = Fstart_match - mb->start_subject; 845 ovector[1] = Feptr - mb->start_subject; 846 847 /* Set i to the smaller of the sizes of the external and frame ovectors. */ 848 849 i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1); 850 memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); 851 while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET; 852 return MATCH_MATCH; /* Note: NOT RRETURN */ 853 854 855 /*===================================================================== */ 856 /* Match any single character type except newline; have to take care with 857 CRLF newlines and partial matching. */ 858 859 case OP_ANY: 860 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); 861 if (mb->partial != 0 && 862 Feptr == mb->end_subject - 1 && 863 NLBLOCK->nltype == NLTYPE_FIXED && 864 NLBLOCK->nllen == 2 && 865 UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) 866 { 867 mb->hitend = TRUE; 868 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 869 } 870 /* Fall through */ 871 872 /* Match any single character whatsoever. */ 873 874 case OP_ALLANY: 875 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ 876 { /* not be updated before SCHECK_PARTIAL. */ 877 SCHECK_PARTIAL(); 878 RRETURN(MATCH_NOMATCH); 879 } 880 Feptr++; 881 #ifdef SUPPORT_UNICODE 882 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); 883 #endif 884 Fecode++; 885 break; 886 887 888 /* ===================================================================== */ 889 /* Match a single code unit, even in UTF mode. This opcode really does 890 match any code unit, even newline. 
(It really should be called ANYCODEUNIT, 891 of course - the byte name is from pre-16 bit days.) */ 892 893 case OP_ANYBYTE: 894 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ 895 { /* not be updated before SCHECK_PARTIAL. */ 896 SCHECK_PARTIAL(); 897 RRETURN(MATCH_NOMATCH); 898 } 899 Feptr++; 900 Fecode++; 901 break; 902 903 904 /* ===================================================================== */ 905 /* Match a single character, casefully */ 906 907 case OP_CHAR: 908 #ifdef SUPPORT_UNICODE 909 if (utf) 910 { 911 Flength = 1; 912 Fecode++; 913 GETCHARLEN(fc, Fecode, Flength); 914 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr)) 915 { 916 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 917 RRETURN(MATCH_NOMATCH); 918 } 919 for (; Flength > 0; Flength--) 920 { 921 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH); 922 } 923 } 924 else 925 #endif 926 /* Not UTF mode */ 927 { 928 if (mb->end_subject - Feptr < 1) 929 { 930 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 931 RRETURN(MATCH_NOMATCH); 932 } 933 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH); 934 Fecode += 2; 935 } 936 break; 937 938 939 /* ===================================================================== */ 940 /* Match a single character, caselessly. If we are at the end of the 941 subject, give up immediately. We get here only when the pattern character 942 has at most one other case. Characters with more than two cases are coded 943 as OP_PROP with the pseudo-property PT_CLIST. */ 944 945 case OP_CHARI: 946 if (Feptr >= mb->end_subject) 947 { 948 SCHECK_PARTIAL(); 949 RRETURN(MATCH_NOMATCH); 950 } 951 952 #ifdef SUPPORT_UNICODE 953 if (utf) 954 { 955 Flength = 1; 956 Fecode++; 957 GETCHARLEN(fc, Fecode, Flength); 958 959 /* If the pattern character's value is < 128, we know that its other case 960 (if any) is also < 128 (and therefore only one code unit long in all 961 code-unit widths), so we can use the fast lookup table. 
We checked above 962 that there is at least one character left in the subject. */ 963 964 if (fc < 128) 965 { 966 uint32_t cc = UCHAR21(Feptr); 967 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); 968 Fecode++; 969 Feptr++; 970 } 971 972 /* Otherwise we must pick up the subject character and use Unicode 973 property support to test its other case. Note that we cannot use the 974 value of "Flength" to check for sufficient bytes left, because the other 975 case of the character may have more or fewer code units. */ 976 977 else 978 { 979 uint32_t dc; 980 GETCHARINC(dc, Feptr); 981 Fecode += Flength; 982 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); 983 } 984 } 985 else 986 #endif /* SUPPORT_UNICODE */ 987 988 /* Not UTF mode; use the table for characters < 256. */ 989 { 990 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) 991 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); 992 Feptr++; 993 Fecode += 2; 994 } 995 break; 996 997 998 /* ===================================================================== */ 999 /* Match not a single character. 
*/ 1000 1001 case OP_NOT: 1002 case OP_NOTI: 1003 if (Feptr >= mb->end_subject) 1004 { 1005 SCHECK_PARTIAL(); 1006 RRETURN(MATCH_NOMATCH); 1007 } 1008 #ifdef SUPPORT_UNICODE 1009 if (utf) 1010 { 1011 uint32_t ch; 1012 Fecode++; 1013 GETCHARINC(ch, Fecode); 1014 GETCHARINC(fc, Feptr); 1015 if (ch == fc) 1016 { 1017 RRETURN(MATCH_NOMATCH); /* Caseful match */ 1018 } 1019 else if (Fop == OP_NOTI) /* If caseless */ 1020 { 1021 if (ch > 127) 1022 ch = UCD_OTHERCASE(ch); 1023 else 1024 ch = TABLE_GET(ch, mb->fcc, ch); 1025 if (ch == fc) RRETURN(MATCH_NOMATCH); 1026 } 1027 } 1028 else 1029 #endif /* SUPPORT_UNICODE */ 1030 { 1031 uint32_t ch = Fecode[1]; 1032 fc = *Feptr++; 1033 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) 1034 RRETURN(MATCH_NOMATCH); 1035 Fecode += 2; 1036 } 1037 break; 1038 1039 1040 /* ===================================================================== */ 1041 /* Match a single character repeatedly. */ 1042 1043 #define Loclength F->temp_size 1044 #define Lstart_eptr F->temp_sptr[0] 1045 #define Lcharptr F->temp_sptr[1] 1046 #define Lmin F->temp_32[0] 1047 #define Lmax F->temp_32[1] 1048 #define Lc F->temp_32[2] 1049 #define Loc F->temp_32[3] 1050 1051 case OP_EXACT: 1052 case OP_EXACTI: 1053 Lmin = Lmax = GET2(Fecode, 1); 1054 Fecode += 1 + IMM2_SIZE; 1055 goto REPEATCHAR; 1056 1057 case OP_POSUPTO: 1058 case OP_POSUPTOI: 1059 reptype = REPTYPE_POS; 1060 Lmin = 0; 1061 Lmax = GET2(Fecode, 1); 1062 Fecode += 1 + IMM2_SIZE; 1063 goto REPEATCHAR; 1064 1065 case OP_UPTO: 1066 case OP_UPTOI: 1067 reptype = REPTYPE_MAX; 1068 Lmin = 0; 1069 Lmax = GET2(Fecode, 1); 1070 Fecode += 1 + IMM2_SIZE; 1071 goto REPEATCHAR; 1072 1073 case OP_MINUPTO: 1074 case OP_MINUPTOI: 1075 reptype = REPTYPE_MIN; 1076 Lmin = 0; 1077 Lmax = GET2(Fecode, 1); 1078 Fecode += 1 + IMM2_SIZE; 1079 goto REPEATCHAR; 1080 1081 case OP_POSSTAR: 1082 case OP_POSSTARI: 1083 reptype = REPTYPE_POS; 1084 Lmin = 0; 1085 Lmax = UINT32_MAX; 1086 Fecode++; 1087 goto 
REPEATCHAR; 1088 1089 case OP_POSPLUS: 1090 case OP_POSPLUSI: 1091 reptype = REPTYPE_POS; 1092 Lmin = 1; 1093 Lmax = UINT32_MAX; 1094 Fecode++; 1095 goto REPEATCHAR; 1096 1097 case OP_POSQUERY: 1098 case OP_POSQUERYI: 1099 reptype = REPTYPE_POS; 1100 Lmin = 0; 1101 Lmax = 1; 1102 Fecode++; 1103 goto REPEATCHAR; 1104 1105 case OP_STAR: 1106 case OP_STARI: 1107 case OP_MINSTAR: 1108 case OP_MINSTARI: 1109 case OP_PLUS: 1110 case OP_PLUSI: 1111 case OP_MINPLUS: 1112 case OP_MINPLUSI: 1113 case OP_QUERY: 1114 case OP_QUERYI: 1115 case OP_MINQUERY: 1116 case OP_MINQUERYI: 1117 fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI); 1118 Lmin = rep_min[fc]; 1119 Lmax = rep_max[fc]; 1120 reptype = rep_typ[fc]; 1121 1122 /* Common code for all repeated single-character matches. We first check 1123 for the minimum number of characters. If the minimum equals the maximum, we 1124 are done. Otherwise, if minimizing, check the rest of the pattern for a 1125 match; if there isn't one, advance up to the maximum, one character at a 1126 time. 1127 1128 If maximizing, advance up to the maximum number of matching characters, 1129 until Feptr is past the end of the maximum run. If possessive, we are 1130 then done (no backing up). Otherwise, match at this position; anything 1131 other than no match is immediately returned. For nomatch, back up one 1132 character, unless we are matching \R and the last thing matched was 1133 \r\n, in which case, back up two code units until we reach the first 1134 optional character position. 1135 1136 The various UTF/non-UTF and caseful/caseless cases are handled separately, 1137 for speed. */ 1138 1139 REPEATCHAR: 1140 #ifdef SUPPORT_UNICODE 1141 if (utf) 1142 { 1143 Flength = 1; 1144 Lcharptr = Fecode; 1145 GETCHARLEN(fc, Fecode, Flength); 1146 Fecode += Flength; 1147 1148 /* Handle multi-code-unit character matching, caseful and caseless. 
*/ 1149 1150 if (Flength > 1) 1151 { 1152 uint32_t othercase; 1153 1154 if (Fop >= OP_STARI && /* Caseless */ 1155 (othercase = UCD_OTHERCASE(fc)) != fc) 1156 Loclength = PRIV(ord2utf)(othercase, Foccu); 1157 else Loclength = 0; 1158 1159 for (i = 1; i <= Lmin; i++) 1160 { 1161 if (Feptr <= mb->end_subject - Flength && 1162 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; 1163 else if (Loclength > 0 && 1164 Feptr <= mb->end_subject - Loclength && 1165 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) 1166 Feptr += Loclength; 1167 else 1168 { 1169 CHECK_PARTIAL(); 1170 RRETURN(MATCH_NOMATCH); 1171 } 1172 } 1173 1174 if (Lmin == Lmax) continue; 1175 1176 if (reptype == REPTYPE_MIN) 1177 { 1178 for (;;) 1179 { 1180 RMATCH(Fecode, RM202); 1181 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1182 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1183 if (Feptr <= mb->end_subject - Flength && 1184 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; 1185 else if (Loclength > 0 && 1186 Feptr <= mb->end_subject - Loclength && 1187 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) 1188 Feptr += Loclength; 1189 else 1190 { 1191 CHECK_PARTIAL(); 1192 RRETURN(MATCH_NOMATCH); 1193 } 1194 } 1195 /* Control never gets here */ 1196 } 1197 1198 else /* Maximize */ 1199 { 1200 Lstart_eptr = Feptr; 1201 for (i = Lmin; i < Lmax; i++) 1202 { 1203 if (Feptr <= mb->end_subject - Flength && 1204 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) 1205 Feptr += Flength; 1206 else if (Loclength > 0 && 1207 Feptr <= mb->end_subject - Loclength && 1208 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) 1209 Feptr += Loclength; 1210 else 1211 { 1212 CHECK_PARTIAL(); 1213 break; 1214 } 1215 } 1216 1217 /* After \C in UTF mode, Lstart_eptr might be in the middle of a 1218 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't 1219 go too far. 
*/ 1220 1221 if (reptype != REPTYPE_POS) for(;;) 1222 { 1223 if (Feptr <= Lstart_eptr) break; 1224 RMATCH(Fecode, RM203); 1225 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1226 Feptr--; 1227 BACKCHAR(Feptr); 1228 } 1229 } 1230 break; /* End of repeated wide character handling */ 1231 } 1232 1233 /* Length of UTF character is 1. Put it into the preserved variable and 1234 fall through to the non-UTF code. */ 1235 1236 Lc = fc; 1237 } 1238 else 1239 #endif /* SUPPORT_UNICODE */ 1240 1241 /* When not in UTF mode, load a single-code-unit character. Then proceed as 1242 above. */ 1243 1244 Lc = *Fecode++; 1245 1246 /* Caseless comparison */ 1247 1248 if (Fop >= OP_STARI) 1249 { 1250 #if PCRE2_CODE_UNIT_WIDTH == 8 1251 /* Lc must be < 128 in UTF-8 mode. */ 1252 Loc = mb->fcc[Lc]; 1253 #else /* 16-bit & 32-bit */ 1254 #ifdef SUPPORT_UNICODE 1255 if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); 1256 else 1257 #endif /* SUPPORT_UNICODE */ 1258 Loc = TABLE_GET(Lc, mb->fcc, Lc); 1259 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ 1260 1261 for (i = 1; i <= Lmin; i++) 1262 { 1263 uint32_t cc; /* Faster than PCRE2_UCHAR */ 1264 if (Feptr >= mb->end_subject) 1265 { 1266 SCHECK_PARTIAL(); 1267 RRETURN(MATCH_NOMATCH); 1268 } 1269 cc = UCHAR21TEST(Feptr); 1270 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); 1271 Feptr++; 1272 } 1273 if (Lmin == Lmax) continue; 1274 1275 if (reptype == REPTYPE_MIN) 1276 { 1277 for (;;) 1278 { 1279 uint32_t cc; /* Faster than PCRE2_UCHAR */ 1280 RMATCH(Fecode, RM25); 1281 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1282 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1283 if (Feptr >= mb->end_subject) 1284 { 1285 SCHECK_PARTIAL(); 1286 RRETURN(MATCH_NOMATCH); 1287 } 1288 cc = UCHAR21TEST(Feptr); 1289 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); 1290 Feptr++; 1291 } 1292 /* Control never gets here */ 1293 } 1294 1295 else /* Maximize */ 1296 { 1297 Lstart_eptr = Feptr; 1298 for (i = Lmin; i < Lmax; i++) 1299 { 1300 uint32_t cc; /* Faster than PCRE2_UCHAR */ 1301 if 
(Feptr >= mb->end_subject) 1302 { 1303 SCHECK_PARTIAL(); 1304 break; 1305 } 1306 cc = UCHAR21TEST(Feptr); 1307 if (Lc != cc && Loc != cc) break; 1308 Feptr++; 1309 } 1310 if (reptype != REPTYPE_POS) for (;;) 1311 { 1312 if (Feptr == Lstart_eptr) break; 1313 RMATCH(Fecode, RM26); 1314 Feptr--; 1315 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1316 } 1317 } 1318 } 1319 1320 /* Caseful comparisons (includes all multi-byte characters) */ 1321 1322 else 1323 { 1324 for (i = 1; i <= Lmin; i++) 1325 { 1326 if (Feptr >= mb->end_subject) 1327 { 1328 SCHECK_PARTIAL(); 1329 RRETURN(MATCH_NOMATCH); 1330 } 1331 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); 1332 } 1333 1334 if (Lmin == Lmax) continue; 1335 1336 if (reptype == REPTYPE_MIN) 1337 { 1338 for (;;) 1339 { 1340 RMATCH(Fecode, RM27); 1341 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1342 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1343 if (Feptr >= mb->end_subject) 1344 { 1345 SCHECK_PARTIAL(); 1346 RRETURN(MATCH_NOMATCH); 1347 } 1348 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); 1349 } 1350 /* Control never gets here */ 1351 } 1352 else /* Maximize */ 1353 { 1354 Lstart_eptr = Feptr; 1355 for (i = Lmin; i < Lmax; i++) 1356 { 1357 if (Feptr >= mb->end_subject) 1358 { 1359 SCHECK_PARTIAL(); 1360 break; 1361 } 1362 1363 if (Lc != UCHAR21TEST(Feptr)) break; 1364 Feptr++; 1365 } 1366 1367 if (reptype != REPTYPE_POS) for (;;) 1368 { 1369 if (Feptr <= Lstart_eptr) break; 1370 RMATCH(Fecode, RM28); 1371 Feptr--; 1372 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1373 } 1374 } 1375 } 1376 break; 1377 1378 #undef Loclength 1379 #undef Lstart_eptr 1380 #undef Lcharptr 1381 #undef Lmin 1382 #undef Lmax 1383 #undef Lc 1384 #undef Loc 1385 1386 1387 /* ===================================================================== */ 1388 /* Match a negated single one-byte character repeatedly. 
This is almost a 1389 repeat of the code for a repeated single character, but I haven't found a 1390 nice way of commoning these up that doesn't require a test of the 1391 positive/negative option for each character match. Maybe that wouldn't add 1392 very much to the time taken, but character matching *is* what this is all 1393 about... */ 1394 1395 #define Lstart_eptr F->temp_sptr[0] 1396 #define Lmin F->temp_32[0] 1397 #define Lmax F->temp_32[1] 1398 #define Lc F->temp_32[2] 1399 #define Loc F->temp_32[3] 1400 1401 case OP_NOTEXACT: 1402 case OP_NOTEXACTI: 1403 Lmin = Lmax = GET2(Fecode, 1); 1404 Fecode += 1 + IMM2_SIZE; 1405 goto REPEATNOTCHAR; 1406 1407 case OP_NOTUPTO: 1408 case OP_NOTUPTOI: 1409 Lmin = 0; 1410 Lmax = GET2(Fecode, 1); 1411 reptype = REPTYPE_MAX; 1412 Fecode += 1 + IMM2_SIZE; 1413 goto REPEATNOTCHAR; 1414 1415 case OP_NOTMINUPTO: 1416 case OP_NOTMINUPTOI: 1417 Lmin = 0; 1418 Lmax = GET2(Fecode, 1); 1419 reptype = REPTYPE_MIN; 1420 Fecode += 1 + IMM2_SIZE; 1421 goto REPEATNOTCHAR; 1422 1423 case OP_NOTPOSSTAR: 1424 case OP_NOTPOSSTARI: 1425 reptype = REPTYPE_POS; 1426 Lmin = 0; 1427 Lmax = UINT32_MAX; 1428 Fecode++; 1429 goto REPEATNOTCHAR; 1430 1431 case OP_NOTPOSPLUS: 1432 case OP_NOTPOSPLUSI: 1433 reptype = REPTYPE_POS; 1434 Lmin = 1; 1435 Lmax = UINT32_MAX; 1436 Fecode++; 1437 goto REPEATNOTCHAR; 1438 1439 case OP_NOTPOSQUERY: 1440 case OP_NOTPOSQUERYI: 1441 reptype = REPTYPE_POS; 1442 Lmin = 0; 1443 Lmax = 1; 1444 Fecode++; 1445 goto REPEATNOTCHAR; 1446 1447 case OP_NOTPOSUPTO: 1448 case OP_NOTPOSUPTOI: 1449 reptype = REPTYPE_POS; 1450 Lmin = 0; 1451 Lmax = GET2(Fecode, 1); 1452 Fecode += 1 + IMM2_SIZE; 1453 goto REPEATNOTCHAR; 1454 1455 case OP_NOTSTAR: 1456 case OP_NOTSTARI: 1457 case OP_NOTMINSTAR: 1458 case OP_NOTMINSTARI: 1459 case OP_NOTPLUS: 1460 case OP_NOTPLUSI: 1461 case OP_NOTMINPLUS: 1462 case OP_NOTMINPLUSI: 1463 case OP_NOTQUERY: 1464 case OP_NOTQUERYI: 1465 case OP_NOTMINQUERY: 1466 case OP_NOTMINQUERYI: 1467 fc = *Fecode++ 
- ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); 1468 Lmin = rep_min[fc]; 1469 Lmax = rep_max[fc]; 1470 reptype = rep_typ[fc]; 1471 1472 /* Common code for all repeated single-character non-matches. */ 1473 1474 REPEATNOTCHAR: 1475 GETCHARINCTEST(Lc, Fecode); 1476 1477 /* The code is duplicated for the caseless and caseful cases, for speed, 1478 since matching characters is likely to be quite common. First, ensure the 1479 minimum number of matches are present. If Lmin = Lmax, we are done. 1480 Otherwise, if minimizing, keep trying the rest of the expression and 1481 advancing one matching character if failing, up to the maximum. 1482 Alternatively, if maximizing, find the maximum number of characters and 1483 work backwards. */ 1484 1485 if (Fop >= OP_NOTSTARI) /* Caseless */ 1486 { 1487 #ifdef SUPPORT_UNICODE 1488 if (utf && Lc > 127) 1489 Loc = UCD_OTHERCASE(Lc); 1490 else 1491 #endif /* SUPPORT_UNICODE */ 1492 1493 Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */ 1494 1495 #ifdef SUPPORT_UNICODE 1496 if (utf) 1497 { 1498 uint32_t d; 1499 for (i = 1; i <= Lmin; i++) 1500 { 1501 if (Feptr >= mb->end_subject) 1502 { 1503 SCHECK_PARTIAL(); 1504 RRETURN(MATCH_NOMATCH); 1505 } 1506 GETCHARINC(d, Feptr); 1507 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); 1508 } 1509 } 1510 else 1511 #endif /* SUPPORT_UNICODE */ 1512 1513 /* Not UTF mode */ 1514 { 1515 for (i = 1; i <= Lmin; i++) 1516 { 1517 if (Feptr >= mb->end_subject) 1518 { 1519 SCHECK_PARTIAL(); 1520 RRETURN(MATCH_NOMATCH); 1521 } 1522 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); 1523 Feptr++; 1524 } 1525 } 1526 1527 if (Lmin == Lmax) continue; /* Finished for exact count */ 1528 1529 if (reptype == REPTYPE_MIN) 1530 { 1531 #ifdef SUPPORT_UNICODE 1532 if (utf) 1533 { 1534 uint32_t d; 1535 for (;;) 1536 { 1537 RMATCH(Fecode, RM204); 1538 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1539 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1540 if (Feptr >= mb->end_subject) 1541 { 1542 
SCHECK_PARTIAL(); 1543 RRETURN(MATCH_NOMATCH); 1544 } 1545 GETCHARINC(d, Feptr); 1546 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); 1547 } 1548 } 1549 else 1550 #endif /*SUPPORT_UNICODE */ 1551 1552 /* Not UTF mode */ 1553 { 1554 for (;;) 1555 { 1556 RMATCH(Fecode, RM29); 1557 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1558 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1559 if (Feptr >= mb->end_subject) 1560 { 1561 SCHECK_PARTIAL(); 1562 RRETURN(MATCH_NOMATCH); 1563 } 1564 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); 1565 Feptr++; 1566 } 1567 } 1568 /* Control never gets here */ 1569 } 1570 1571 /* Maximize case */ 1572 1573 else 1574 { 1575 Lstart_eptr = Feptr; 1576 1577 #ifdef SUPPORT_UNICODE 1578 if (utf) 1579 { 1580 uint32_t d; 1581 for (i = Lmin; i < Lmax; i++) 1582 { 1583 int len = 1; 1584 if (Feptr >= mb->end_subject) 1585 { 1586 SCHECK_PARTIAL(); 1587 break; 1588 } 1589 GETCHARLEN(d, Feptr, len); 1590 if (Lc == d || Loc == d) break; 1591 Feptr += len; 1592 } 1593 1594 /* After \C in UTF mode, Lstart_eptr might be in the middle of a 1595 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't 1596 go too far. 
*/ 1597 1598 if (reptype != REPTYPE_POS) for(;;) 1599 { 1600 if (Feptr <= Lstart_eptr) break; 1601 RMATCH(Fecode, RM205); 1602 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1603 Feptr--; 1604 BACKCHAR(Feptr); 1605 } 1606 } 1607 else 1608 #endif /* SUPPORT_UNICODE */ 1609 1610 /* Not UTF mode */ 1611 { 1612 for (i = Lmin; i < Lmax; i++) 1613 { 1614 if (Feptr >= mb->end_subject) 1615 { 1616 SCHECK_PARTIAL(); 1617 break; 1618 } 1619 if (Lc == *Feptr || Loc == *Feptr) break; 1620 Feptr++; 1621 } 1622 if (reptype != REPTYPE_POS) for (;;) 1623 { 1624 if (Feptr == Lstart_eptr) break; 1625 RMATCH(Fecode, RM30); 1626 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1627 Feptr--; 1628 } 1629 } 1630 } 1631 } 1632 1633 /* Caseful comparisons */ 1634 1635 else 1636 { 1637 #ifdef SUPPORT_UNICODE 1638 if (utf) 1639 { 1640 uint32_t d; 1641 for (i = 1; i <= Lmin; i++) 1642 { 1643 if (Feptr >= mb->end_subject) 1644 { 1645 SCHECK_PARTIAL(); 1646 RRETURN(MATCH_NOMATCH); 1647 } 1648 GETCHARINC(d, Feptr); 1649 if (Lc == d) RRETURN(MATCH_NOMATCH); 1650 } 1651 } 1652 else 1653 #endif 1654 /* Not UTF mode */ 1655 { 1656 for (i = 1; i <= Lmin; i++) 1657 { 1658 if (Feptr >= mb->end_subject) 1659 { 1660 SCHECK_PARTIAL(); 1661 RRETURN(MATCH_NOMATCH); 1662 } 1663 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); 1664 } 1665 } 1666 1667 if (Lmin == Lmax) continue; 1668 1669 if (reptype == REPTYPE_MIN) 1670 { 1671 #ifdef SUPPORT_UNICODE 1672 if (utf) 1673 { 1674 uint32_t d; 1675 for (;;) 1676 { 1677 RMATCH(Fecode, RM206); 1678 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1679 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1680 if (Feptr >= mb->end_subject) 1681 { 1682 SCHECK_PARTIAL(); 1683 RRETURN(MATCH_NOMATCH); 1684 } 1685 GETCHARINC(d, Feptr); 1686 if (Lc == d) RRETURN(MATCH_NOMATCH); 1687 } 1688 } 1689 else 1690 #endif 1691 /* Not UTF mode */ 1692 { 1693 for (;;) 1694 { 1695 RMATCH(Fecode, RM31); 1696 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1697 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1698 if (Feptr >= 
mb->end_subject) 1699 { 1700 SCHECK_PARTIAL(); 1701 RRETURN(MATCH_NOMATCH); 1702 } 1703 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); 1704 } 1705 } 1706 /* Control never gets here */ 1707 } 1708 1709 /* Maximize case */ 1710 1711 else 1712 { 1713 Lstart_eptr = Feptr; 1714 1715 #ifdef SUPPORT_UNICODE 1716 if (utf) 1717 { 1718 uint32_t d; 1719 for (i = Lmin; i < Lmax; i++) 1720 { 1721 int len = 1; 1722 if (Feptr >= mb->end_subject) 1723 { 1724 SCHECK_PARTIAL(); 1725 break; 1726 } 1727 GETCHARLEN(d, Feptr, len); 1728 if (Lc == d) break; 1729 Feptr += len; 1730 } 1731 1732 /* After \C in UTF mode, Lstart_eptr might be in the middle of a 1733 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't 1734 go too far. */ 1735 1736 if (reptype != REPTYPE_POS) for(;;) 1737 { 1738 if (Feptr <= Lstart_eptr) break; 1739 RMATCH(Fecode, RM207); 1740 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1741 Feptr--; 1742 BACKCHAR(Feptr); 1743 } 1744 } 1745 else 1746 #endif 1747 /* Not UTF mode */ 1748 { 1749 for (i = Lmin; i < Lmax; i++) 1750 { 1751 if (Feptr >= mb->end_subject) 1752 { 1753 SCHECK_PARTIAL(); 1754 break; 1755 } 1756 if (Lc == *Feptr) break; 1757 Feptr++; 1758 } 1759 if (reptype != REPTYPE_POS) for (;;) 1760 { 1761 if (Feptr == Lstart_eptr) break; 1762 RMATCH(Fecode, RM32); 1763 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1764 Feptr--; 1765 } 1766 } 1767 } 1768 } 1769 break; 1770 1771 #undef Lstart_eptr 1772 #undef Lmin 1773 #undef Lmax 1774 #undef Lc 1775 #undef Loc 1776 1777 1778 /* ===================================================================== */ 1779 /* Match a bit-mapped character class, possibly repeatedly. These opcodes 1780 are used when all the characters in the class have values in the range 1781 0-255, and either the matching is caseful, or the characters are in the 1782 range 0-127 when UTF processing is enabled. The only difference between 1783 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 1784 encountered. 
*/ 1785 1786 #define Lmin F->temp_32[0] 1787 #define Lmax F->temp_32[1] 1788 #define Lstart_eptr F->temp_sptr[0] 1789 #define Lbyte_map_address F->temp_sptr[1] 1790 #define Lbyte_map ((unsigned char *)Lbyte_map_address) 1791 1792 case OP_NCLASS: 1793 case OP_CLASS: 1794 { 1795 Lbyte_map_address = Fecode + 1; /* Save for matching */ 1796 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */ 1797 1798 /* Look past the end of the item to see if there is repeat information 1799 following. Then obey similar code to character type repeats. */ 1800 1801 switch (*Fecode) 1802 { 1803 case OP_CRSTAR: 1804 case OP_CRMINSTAR: 1805 case OP_CRPLUS: 1806 case OP_CRMINPLUS: 1807 case OP_CRQUERY: 1808 case OP_CRMINQUERY: 1809 case OP_CRPOSSTAR: 1810 case OP_CRPOSPLUS: 1811 case OP_CRPOSQUERY: 1812 fc = *Fecode++ - OP_CRSTAR; 1813 Lmin = rep_min[fc]; 1814 Lmax = rep_max[fc]; 1815 reptype = rep_typ[fc]; 1816 break; 1817 1818 case OP_CRRANGE: 1819 case OP_CRMINRANGE: 1820 case OP_CRPOSRANGE: 1821 Lmin = GET2(Fecode, 1); 1822 Lmax = GET2(Fecode, 1 + IMM2_SIZE); 1823 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ 1824 reptype = rep_typ[*Fecode - OP_CRSTAR]; 1825 Fecode += 1 + 2 * IMM2_SIZE; 1826 break; 1827 1828 default: /* No repeat follows */ 1829 Lmin = Lmax = 1; 1830 break; 1831 } 1832 1833 /* First, ensure the minimum number of matches are present. 
*/ 1834 1835 #ifdef SUPPORT_UNICODE 1836 if (utf) 1837 { 1838 for (i = 1; i <= Lmin; i++) 1839 { 1840 if (Feptr >= mb->end_subject) 1841 { 1842 SCHECK_PARTIAL(); 1843 RRETURN(MATCH_NOMATCH); 1844 } 1845 GETCHARINC(fc, Feptr); 1846 if (fc > 255) 1847 { 1848 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); 1849 } 1850 else 1851 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); 1852 } 1853 } 1854 else 1855 #endif 1856 /* Not UTF mode */ 1857 { 1858 for (i = 1; i <= Lmin; i++) 1859 { 1860 if (Feptr >= mb->end_subject) 1861 { 1862 SCHECK_PARTIAL(); 1863 RRETURN(MATCH_NOMATCH); 1864 } 1865 fc = *Feptr++; 1866 #if PCRE2_CODE_UNIT_WIDTH != 8 1867 if (fc > 255) 1868 { 1869 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); 1870 } 1871 else 1872 #endif 1873 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); 1874 } 1875 } 1876 1877 /* If Lmax == Lmin we are done. Continue with main loop. */ 1878 1879 if (Lmin == Lmax) continue; 1880 1881 /* If minimizing, keep testing the rest of the expression and advancing 1882 the pointer while it matches the class. 
*/ 1883 1884 if (reptype == REPTYPE_MIN) 1885 { 1886 #ifdef SUPPORT_UNICODE 1887 if (utf) 1888 { 1889 for (;;) 1890 { 1891 RMATCH(Fecode, RM200); 1892 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1893 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1894 if (Feptr >= mb->end_subject) 1895 { 1896 SCHECK_PARTIAL(); 1897 RRETURN(MATCH_NOMATCH); 1898 } 1899 GETCHARINC(fc, Feptr); 1900 if (fc > 255) 1901 { 1902 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); 1903 } 1904 else 1905 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); 1906 } 1907 } 1908 else 1909 #endif 1910 /* Not UTF mode */ 1911 { 1912 for (;;) 1913 { 1914 RMATCH(Fecode, RM23); 1915 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1916 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 1917 if (Feptr >= mb->end_subject) 1918 { 1919 SCHECK_PARTIAL(); 1920 RRETURN(MATCH_NOMATCH); 1921 } 1922 fc = *Feptr++; 1923 #if PCRE2_CODE_UNIT_WIDTH != 8 1924 if (fc > 255) 1925 { 1926 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); 1927 } 1928 else 1929 #endif 1930 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); 1931 } 1932 } 1933 /* Control never gets here */ 1934 } 1935 1936 /* If maximizing, find the longest possible run, then work backwards. */ 1937 1938 else 1939 { 1940 Lstart_eptr = Feptr; 1941 1942 #ifdef SUPPORT_UNICODE 1943 if (utf) 1944 { 1945 for (i = Lmin; i < Lmax; i++) 1946 { 1947 int len = 1; 1948 if (Feptr >= mb->end_subject) 1949 { 1950 SCHECK_PARTIAL(); 1951 break; 1952 } 1953 GETCHARLEN(fc, Feptr, len); 1954 if (fc > 255) 1955 { 1956 if (Fop == OP_CLASS) break; 1957 } 1958 else 1959 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break; 1960 Feptr += len; 1961 } 1962 1963 if (reptype == REPTYPE_POS) continue; /* No backtracking */ 1964 1965 /* After \C in UTF mode, Lstart_eptr might be in the middle of a 1966 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't 1967 go too far. 
*/ 1968 1969 for (;;) 1970 { 1971 RMATCH(Fecode, RM201); 1972 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1973 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ 1974 BACKCHAR(Feptr); 1975 } 1976 } 1977 else 1978 #endif 1979 /* Not UTF mode */ 1980 { 1981 for (i = Lmin; i < Lmax; i++) 1982 { 1983 if (Feptr >= mb->end_subject) 1984 { 1985 SCHECK_PARTIAL(); 1986 break; 1987 } 1988 fc = *Feptr; 1989 #if PCRE2_CODE_UNIT_WIDTH != 8 1990 if (fc > 255) 1991 { 1992 if (Fop == OP_CLASS) break; 1993 } 1994 else 1995 #endif 1996 if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break; 1997 Feptr++; 1998 } 1999 2000 if (reptype == REPTYPE_POS) continue; /* No backtracking */ 2001 2002 while (Feptr >= Lstart_eptr) 2003 { 2004 RMATCH(Fecode, RM24); 2005 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2006 Feptr--; 2007 } 2008 } 2009 2010 RRETURN(MATCH_NOMATCH); 2011 } 2012 } 2013 /* Control never gets here */ 2014 2015 #undef Lbyte_map_address 2016 #undef Lbyte_map 2017 #undef Lstart_eptr 2018 #undef Lmin 2019 #undef Lmax 2020 2021 2022 /* ===================================================================== */ 2023 /* Match an extended character class. In the 8-bit library, this opcode is 2024 encountered only when UTF-8 mode is supported. In the 16-bit and 2025 32-bit libraries, codepoints greater than 255 may be encountered even when 2026 UTF is not supported. 
*/ 2027 2028 #define Lstart_eptr F->temp_sptr[0] 2029 #define Lxclass_data F->temp_sptr[1] 2030 #define Lmin F->temp_32[0] 2031 #define Lmax F->temp_32[1] 2032 2033 #ifdef SUPPORT_WIDE_CHARS 2034 case OP_XCLASS: 2035 { 2036 Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ 2037 Fecode += GET(Fecode, 1); /* Advance past the item */ 2038 2039 switch (*Fecode) 2040 { 2041 case OP_CRSTAR: 2042 case OP_CRMINSTAR: 2043 case OP_CRPLUS: 2044 case OP_CRMINPLUS: 2045 case OP_CRQUERY: 2046 case OP_CRMINQUERY: 2047 case OP_CRPOSSTAR: 2048 case OP_CRPOSPLUS: 2049 case OP_CRPOSQUERY: 2050 fc = *Fecode++ - OP_CRSTAR; 2051 Lmin = rep_min[fc]; 2052 Lmax = rep_max[fc]; 2053 reptype = rep_typ[fc]; 2054 break; 2055 2056 case OP_CRRANGE: 2057 case OP_CRMINRANGE: 2058 case OP_CRPOSRANGE: 2059 Lmin = GET2(Fecode, 1); 2060 Lmax = GET2(Fecode, 1 + IMM2_SIZE); 2061 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ 2062 reptype = rep_typ[*Fecode - OP_CRSTAR]; 2063 Fecode += 1 + 2 * IMM2_SIZE; 2064 break; 2065 2066 default: /* No repeat follows */ 2067 Lmin = Lmax = 1; 2068 break; 2069 } 2070 2071 /* First, ensure the minimum number of matches are present. */ 2072 2073 for (i = 1; i <= Lmin; i++) 2074 { 2075 if (Feptr >= mb->end_subject) 2076 { 2077 SCHECK_PARTIAL(); 2078 RRETURN(MATCH_NOMATCH); 2079 } 2080 GETCHARINCTEST(fc, Feptr); 2081 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); 2082 } 2083 2084 /* If Lmax == Lmin we can just continue with the main loop. */ 2085 2086 if (Lmin == Lmax) continue; 2087 2088 /* If minimizing, keep testing the rest of the expression and advancing 2089 the pointer while it matches the class. 
*/ 2090 2091 if (reptype == REPTYPE_MIN) 2092 { 2093 for (;;) 2094 { 2095 RMATCH(Fecode, RM100); 2096 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2097 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 2098 if (Feptr >= mb->end_subject) 2099 { 2100 SCHECK_PARTIAL(); 2101 RRETURN(MATCH_NOMATCH); 2102 } 2103 GETCHARINCTEST(fc, Feptr); 2104 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); 2105 } 2106 /* Control never gets here */ 2107 } 2108 2109 /* If maximizing, find the longest possible run, then work backwards. */ 2110 2111 else 2112 { 2113 Lstart_eptr = Feptr; 2114 for (i = Lmin; i < Lmax; i++) 2115 { 2116 int len = 1; 2117 if (Feptr >= mb->end_subject) 2118 { 2119 SCHECK_PARTIAL(); 2120 break; 2121 } 2122 #ifdef SUPPORT_UNICODE 2123 GETCHARLENTEST(fc, Feptr, len); 2124 #else 2125 fc = *Feptr; 2126 #endif 2127 if (!PRIV(xclass)(fc, Lxclass_data, utf)) break; 2128 Feptr += len; 2129 } 2130 2131 if (reptype == REPTYPE_POS) continue; /* No backtracking */ 2132 2133 /* After \C in UTF mode, Lstart_eptr might be in the middle of a 2134 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't 2135 go too far. */ 2136 2137 for(;;) 2138 { 2139 RMATCH(Fecode, RM101); 2140 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2141 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ 2142 #ifdef SUPPORT_UNICODE 2143 if (utf) BACKCHAR(Feptr); 2144 #endif 2145 } 2146 RRETURN(MATCH_NOMATCH); 2147 } 2148 2149 /* Control never gets here */ 2150 } 2151 #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */ 2152 2153 #undef Lstart_eptr 2154 #undef Lxclass_data 2155 #undef Lmin 2156 #undef Lmax 2157 2158 2159 /* ===================================================================== */ 2160 /* Match various character types when PCRE2_UCP is not set. These opcodes 2161 are not generated when PCRE2_UCP is set - instead appropriate property 2162 tests are compiled. 
*/ 2163 2164 case OP_NOT_DIGIT: 2165 if (Feptr >= mb->end_subject) 2166 { 2167 SCHECK_PARTIAL(); 2168 RRETURN(MATCH_NOMATCH); 2169 } 2170 GETCHARINCTEST(fc, Feptr); 2171 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) 2172 RRETURN(MATCH_NOMATCH); 2173 Fecode++; 2174 break; 2175 2176 case OP_DIGIT: 2177 if (Feptr >= mb->end_subject) 2178 { 2179 SCHECK_PARTIAL(); 2180 RRETURN(MATCH_NOMATCH); 2181 } 2182 GETCHARINCTEST(fc, Feptr); 2183 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) 2184 RRETURN(MATCH_NOMATCH); 2185 Fecode++; 2186 break; 2187 2188 case OP_NOT_WHITESPACE: 2189 if (Feptr >= mb->end_subject) 2190 { 2191 SCHECK_PARTIAL(); 2192 RRETURN(MATCH_NOMATCH); 2193 } 2194 GETCHARINCTEST(fc, Feptr); 2195 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) 2196 RRETURN(MATCH_NOMATCH); 2197 Fecode++; 2198 break; 2199 2200 case OP_WHITESPACE: 2201 if (Feptr >= mb->end_subject) 2202 { 2203 SCHECK_PARTIAL(); 2204 RRETURN(MATCH_NOMATCH); 2205 } 2206 GETCHARINCTEST(fc, Feptr); 2207 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) 2208 RRETURN(MATCH_NOMATCH); 2209 Fecode++; 2210 break; 2211 2212 case OP_NOT_WORDCHAR: 2213 if (Feptr >= mb->end_subject) 2214 { 2215 SCHECK_PARTIAL(); 2216 RRETURN(MATCH_NOMATCH); 2217 } 2218 GETCHARINCTEST(fc, Feptr); 2219 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) 2220 RRETURN(MATCH_NOMATCH); 2221 Fecode++; 2222 break; 2223 2224 case OP_WORDCHAR: 2225 if (Feptr >= mb->end_subject) 2226 { 2227 SCHECK_PARTIAL(); 2228 RRETURN(MATCH_NOMATCH); 2229 } 2230 GETCHARINCTEST(fc, Feptr); 2231 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) 2232 RRETURN(MATCH_NOMATCH); 2233 Fecode++; 2234 break; 2235 2236 case OP_ANYNL: 2237 if (Feptr >= mb->end_subject) 2238 { 2239 SCHECK_PARTIAL(); 2240 RRETURN(MATCH_NOMATCH); 2241 } 2242 GETCHARINCTEST(fc, Feptr); 2243 switch(fc) 2244 { 2245 default: RRETURN(MATCH_NOMATCH); 2246 2247 case CHAR_CR: 2248 if (Feptr >= mb->end_subject) 2249 { 2250 
SCHECK_PARTIAL(); 2251 } 2252 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++; 2253 break; 2254 2255 case CHAR_LF: 2256 break; 2257 2258 case CHAR_VT: 2259 case CHAR_FF: 2260 case CHAR_NEL: 2261 #ifndef EBCDIC 2262 case 0x2028: 2263 case 0x2029: 2264 #endif /* Not EBCDIC */ 2265 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 2266 break; 2267 } 2268 Fecode++; 2269 break; 2270 2271 case OP_NOT_HSPACE: 2272 if (Feptr >= mb->end_subject) 2273 { 2274 SCHECK_PARTIAL(); 2275 RRETURN(MATCH_NOMATCH); 2276 } 2277 GETCHARINCTEST(fc, Feptr); 2278 switch(fc) 2279 { 2280 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ 2281 default: break; 2282 } 2283 Fecode++; 2284 break; 2285 2286 case OP_HSPACE: 2287 if (Feptr >= mb->end_subject) 2288 { 2289 SCHECK_PARTIAL(); 2290 RRETURN(MATCH_NOMATCH); 2291 } 2292 GETCHARINCTEST(fc, Feptr); 2293 switch(fc) 2294 { 2295 HSPACE_CASES: break; /* Byte and multibyte cases */ 2296 default: RRETURN(MATCH_NOMATCH); 2297 } 2298 Fecode++; 2299 break; 2300 2301 case OP_NOT_VSPACE: 2302 if (Feptr >= mb->end_subject) 2303 { 2304 SCHECK_PARTIAL(); 2305 RRETURN(MATCH_NOMATCH); 2306 } 2307 GETCHARINCTEST(fc, Feptr); 2308 switch(fc) 2309 { 2310 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 2311 default: break; 2312 } 2313 Fecode++; 2314 break; 2315 2316 case OP_VSPACE: 2317 if (Feptr >= mb->end_subject) 2318 { 2319 SCHECK_PARTIAL(); 2320 RRETURN(MATCH_NOMATCH); 2321 } 2322 GETCHARINCTEST(fc, Feptr); 2323 switch(fc) 2324 { 2325 VSPACE_CASES: break; 2326 default: RRETURN(MATCH_NOMATCH); 2327 } 2328 Fecode++; 2329 break; 2330 2331 2332 #ifdef SUPPORT_UNICODE 2333 2334 /* ===================================================================== */ 2335 /* Check the next character by Unicode property. We will get here only 2336 if the support is in the binary; otherwise a compile-time error occurs. 
*/ 2337 2338 case OP_PROP: 2339 case OP_NOTPROP: 2340 if (Feptr >= mb->end_subject) 2341 { 2342 SCHECK_PARTIAL(); 2343 RRETURN(MATCH_NOMATCH); 2344 } 2345 GETCHARINCTEST(fc, Feptr); 2346 { 2347 const uint32_t *cp; 2348 const ucd_record *prop = GET_UCD(fc); 2349 2350 switch(Fecode[1]) 2351 { 2352 case PT_ANY: 2353 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2354 break; 2355 2356 case PT_LAMP: 2357 if ((prop->chartype == ucp_Lu || 2358 prop->chartype == ucp_Ll || 2359 prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP)) 2360 RRETURN(MATCH_NOMATCH); 2361 break; 2362 2363 case PT_GC: 2364 if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP)) 2365 RRETURN(MATCH_NOMATCH); 2366 break; 2367 2368 case PT_PC: 2369 if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP)) 2370 RRETURN(MATCH_NOMATCH); 2371 break; 2372 2373 case PT_SC: 2374 if ((Fecode[2] != prop->script) == (Fop == OP_PROP)) 2375 RRETURN(MATCH_NOMATCH); 2376 break; 2377 2378 /* These are specials */ 2379 2380 case PT_ALNUM: 2381 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2382 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP)) 2383 RRETURN(MATCH_NOMATCH); 2384 break; 2385 2386 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 2387 which means that Perl space and POSIX space are now identical. PCRE 2388 was changed at release 8.34. 
*/ 2389 2390 case PT_SPACE: /* Perl space */ 2391 case PT_PXSPACE: /* POSIX space */ 2392 switch(fc) 2393 { 2394 HSPACE_CASES: 2395 VSPACE_CASES: 2396 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2397 break; 2398 2399 default: 2400 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == 2401 (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); 2402 break; 2403 } 2404 break; 2405 2406 case PT_WORD: 2407 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2408 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 2409 fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP)) 2410 RRETURN(MATCH_NOMATCH); 2411 break; 2412 2413 case PT_CLIST: 2414 cp = PRIV(ucd_caseless_sets) + Fecode[2]; 2415 for (;;) 2416 { 2417 if (fc < *cp) 2418 { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; } 2419 if (fc == *cp++) 2420 { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } } 2421 } 2422 break; 2423 2424 case PT_UCNC: 2425 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || 2426 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || 2427 fc >= 0xe000) == (Fop == OP_NOTPROP)) 2428 RRETURN(MATCH_NOMATCH); 2429 break; 2430 2431 /* This should never occur */ 2432 2433 default: 2434 return PCRE2_ERROR_INTERNAL; 2435 } 2436 2437 Fecode += 3; 2438 } 2439 break; 2440 2441 2442 /* ===================================================================== */ 2443 /* Match an extended Unicode sequence. We will get here only if the support 2444 is in the binary; otherwise a compile-time error occurs. */ 2445 2446 case OP_EXTUNI: 2447 if (Feptr >= mb->end_subject) 2448 { 2449 SCHECK_PARTIAL(); 2450 RRETURN(MATCH_NOMATCH); 2451 } 2452 else 2453 { 2454 GETCHARINCTEST(fc, Feptr); 2455 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, 2456 NULL); 2457 } 2458 CHECK_PARTIAL(); 2459 Fecode++; 2460 break; 2461 2462 #endif /* SUPPORT_UNICODE */ 2463 2464 2465 /* ===================================================================== */ 2466 /* Match a single character type repeatedly. 
Note that the property type 2467 does not need to be in a stack frame as it is not used within an RMATCH() 2468 loop. */ 2469 2470 #define Lstart_eptr F->temp_sptr[0] 2471 #define Lmin F->temp_32[0] 2472 #define Lmax F->temp_32[1] 2473 #define Lctype F->temp_32[2] 2474 #define Lpropvalue F->temp_32[3] 2475 2476 case OP_TYPEEXACT: 2477 Lmin = Lmax = GET2(Fecode, 1); 2478 Fecode += 1 + IMM2_SIZE; 2479 goto REPEATTYPE; 2480 2481 case OP_TYPEUPTO: 2482 case OP_TYPEMINUPTO: 2483 Lmin = 0; 2484 Lmax = GET2(Fecode, 1); 2485 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX; 2486 Fecode += 1 + IMM2_SIZE; 2487 goto REPEATTYPE; 2488 2489 case OP_TYPEPOSSTAR: 2490 reptype = REPTYPE_POS; 2491 Lmin = 0; 2492 Lmax = UINT32_MAX; 2493 Fecode++; 2494 goto REPEATTYPE; 2495 2496 case OP_TYPEPOSPLUS: 2497 reptype = REPTYPE_POS; 2498 Lmin = 1; 2499 Lmax = UINT32_MAX; 2500 Fecode++; 2501 goto REPEATTYPE; 2502 2503 case OP_TYPEPOSQUERY: 2504 reptype = REPTYPE_POS; 2505 Lmin = 0; 2506 Lmax = 1; 2507 Fecode++; 2508 goto REPEATTYPE; 2509 2510 case OP_TYPEPOSUPTO: 2511 reptype = REPTYPE_POS; 2512 Lmin = 0; 2513 Lmax = GET2(Fecode, 1); 2514 Fecode += 1 + IMM2_SIZE; 2515 goto REPEATTYPE; 2516 2517 case OP_TYPESTAR: 2518 case OP_TYPEMINSTAR: 2519 case OP_TYPEPLUS: 2520 case OP_TYPEMINPLUS: 2521 case OP_TYPEQUERY: 2522 case OP_TYPEMINQUERY: 2523 fc = *Fecode++ - OP_TYPESTAR; 2524 Lmin = rep_min[fc]; 2525 Lmax = rep_max[fc]; 2526 reptype = rep_typ[fc]; 2527 2528 /* Common code for all repeated character type matches. */ 2529 2530 REPEATTYPE: 2531 Lctype = *Fecode++; /* Code for the character type */ 2532 2533 #ifdef SUPPORT_UNICODE 2534 if (Lctype == OP_PROP || Lctype == OP_NOTPROP) 2535 { 2536 proptype = *Fecode++; 2537 Lpropvalue = *Fecode++; 2538 } 2539 else proptype = -1; 2540 #endif 2541 2542 /* First, ensure the minimum number of matches are present. Use inline 2543 code for maximizing the speed, and do the type test once at the start 2544 (i.e. keep it out of the loop). 
The code for UTF mode is separated out for 2545 tidiness, except for Unicode property tests. */ 2546 2547 if (Lmin > 0) 2548 { 2549 #ifdef SUPPORT_UNICODE 2550 if (proptype >= 0) /* Property tests in all modes */ 2551 { 2552 switch(proptype) 2553 { 2554 case PT_ANY: 2555 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2556 for (i = 1; i <= Lmin; i++) 2557 { 2558 if (Feptr >= mb->end_subject) 2559 { 2560 SCHECK_PARTIAL(); 2561 RRETURN(MATCH_NOMATCH); 2562 } 2563 GETCHARINCTEST(fc, Feptr); 2564 } 2565 break; 2566 2567 case PT_LAMP: 2568 for (i = 1; i <= Lmin; i++) 2569 { 2570 int chartype; 2571 if (Feptr >= mb->end_subject) 2572 { 2573 SCHECK_PARTIAL(); 2574 RRETURN(MATCH_NOMATCH); 2575 } 2576 GETCHARINCTEST(fc, Feptr); 2577 chartype = UCD_CHARTYPE(fc); 2578 if ((chartype == ucp_Lu || 2579 chartype == ucp_Ll || 2580 chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) 2581 RRETURN(MATCH_NOMATCH); 2582 } 2583 break; 2584 2585 case PT_GC: 2586 for (i = 1; i <= Lmin; i++) 2587 { 2588 if (Feptr >= mb->end_subject) 2589 { 2590 SCHECK_PARTIAL(); 2591 RRETURN(MATCH_NOMATCH); 2592 } 2593 GETCHARINCTEST(fc, Feptr); 2594 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 2595 RRETURN(MATCH_NOMATCH); 2596 } 2597 break; 2598 2599 case PT_PC: 2600 for (i = 1; i <= Lmin; i++) 2601 { 2602 if (Feptr >= mb->end_subject) 2603 { 2604 SCHECK_PARTIAL(); 2605 RRETURN(MATCH_NOMATCH); 2606 } 2607 GETCHARINCTEST(fc, Feptr); 2608 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 2609 RRETURN(MATCH_NOMATCH); 2610 } 2611 break; 2612 2613 case PT_SC: 2614 for (i = 1; i <= Lmin; i++) 2615 { 2616 if (Feptr >= mb->end_subject) 2617 { 2618 SCHECK_PARTIAL(); 2619 RRETURN(MATCH_NOMATCH); 2620 } 2621 GETCHARINCTEST(fc, Feptr); 2622 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 2623 RRETURN(MATCH_NOMATCH); 2624 } 2625 break; 2626 2627 case PT_ALNUM: 2628 for (i = 1; i <= Lmin; i++) 2629 { 2630 int category; 2631 if (Feptr >= mb->end_subject) 2632 { 2633 
SCHECK_PARTIAL(); 2634 RRETURN(MATCH_NOMATCH); 2635 } 2636 GETCHARINCTEST(fc, Feptr); 2637 category = UCD_CATEGORY(fc); 2638 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP)) 2639 RRETURN(MATCH_NOMATCH); 2640 } 2641 break; 2642 2643 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 2644 which means that Perl space and POSIX space are now identical. PCRE 2645 was changed at release 8.34. */ 2646 2647 case PT_SPACE: /* Perl space */ 2648 case PT_PXSPACE: /* POSIX space */ 2649 for (i = 1; i <= Lmin; i++) 2650 { 2651 if (Feptr >= mb->end_subject) 2652 { 2653 SCHECK_PARTIAL(); 2654 RRETURN(MATCH_NOMATCH); 2655 } 2656 GETCHARINCTEST(fc, Feptr); 2657 switch(fc) 2658 { 2659 HSPACE_CASES: 2660 VSPACE_CASES: 2661 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2662 break; 2663 2664 default: 2665 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) 2666 RRETURN(MATCH_NOMATCH); 2667 break; 2668 } 2669 } 2670 break; 2671 2672 case PT_WORD: 2673 for (i = 1; i <= Lmin; i++) 2674 { 2675 int category; 2676 if (Feptr >= mb->end_subject) 2677 { 2678 SCHECK_PARTIAL(); 2679 RRETURN(MATCH_NOMATCH); 2680 } 2681 GETCHARINCTEST(fc, Feptr); 2682 category = UCD_CATEGORY(fc); 2683 if ((category == ucp_L || category == ucp_N || 2684 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) 2685 RRETURN(MATCH_NOMATCH); 2686 } 2687 break; 2688 2689 case PT_CLIST: 2690 for (i = 1; i <= Lmin; i++) 2691 { 2692 const uint32_t *cp; 2693 if (Feptr >= mb->end_subject) 2694 { 2695 SCHECK_PARTIAL(); 2696 RRETURN(MATCH_NOMATCH); 2697 } 2698 GETCHARINCTEST(fc, Feptr); 2699 cp = PRIV(ucd_caseless_sets) + Lpropvalue; 2700 for (;;) 2701 { 2702 if (fc < *cp) 2703 { 2704 if (Lctype == OP_NOTPROP) break; 2705 RRETURN(MATCH_NOMATCH); 2706 } 2707 if (fc == *cp++) 2708 { 2709 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2710 break; 2711 } 2712 } 2713 } 2714 break; 2715 2716 case PT_UCNC: 2717 for (i = 1; i <= Lmin; i++) 2718 { 2719 if (Feptr >= mb->end_subject) 2720 { 
2721 SCHECK_PARTIAL(); 2722 RRETURN(MATCH_NOMATCH); 2723 } 2724 GETCHARINCTEST(fc, Feptr); 2725 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || 2726 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || 2727 fc >= 0xe000) == (Lctype == OP_NOTPROP)) 2728 RRETURN(MATCH_NOMATCH); 2729 } 2730 break; 2731 2732 /* This should not occur */ 2733 2734 default: 2735 return PCRE2_ERROR_INTERNAL; 2736 } 2737 } 2738 2739 /* Match extended Unicode sequences. We will get here only if the 2740 support is in the binary; otherwise a compile-time error occurs. */ 2741 2742 else if (Lctype == OP_EXTUNI) 2743 { 2744 for (i = 1; i <= Lmin; i++) 2745 { 2746 if (Feptr >= mb->end_subject) 2747 { 2748 SCHECK_PARTIAL(); 2749 RRETURN(MATCH_NOMATCH); 2750 } 2751 else 2752 { 2753 GETCHARINCTEST(fc, Feptr); 2754 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, 2755 mb->end_subject, utf, NULL); 2756 } 2757 CHECK_PARTIAL(); 2758 } 2759 } 2760 else 2761 #endif /* SUPPORT_UNICODE */ 2762 2763 /* Handle all other cases in UTF mode */ 2764 2765 #ifdef SUPPORT_UNICODE 2766 if (utf) switch(Lctype) 2767 { 2768 case OP_ANY: 2769 for (i = 1; i <= Lmin; i++) 2770 { 2771 if (Feptr >= mb->end_subject) 2772 { 2773 SCHECK_PARTIAL(); 2774 RRETURN(MATCH_NOMATCH); 2775 } 2776 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); 2777 if (mb->partial != 0 && 2778 Feptr + 1 >= mb->end_subject && 2779 NLBLOCK->nltype == NLTYPE_FIXED && 2780 NLBLOCK->nllen == 2 && 2781 UCHAR21(Feptr) == NLBLOCK->nl[0]) 2782 { 2783 mb->hitend = TRUE; 2784 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 2785 } 2786 Feptr++; 2787 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); 2788 } 2789 break; 2790 2791 case OP_ALLANY: 2792 for (i = 1; i <= Lmin; i++) 2793 { 2794 if (Feptr >= mb->end_subject) 2795 { 2796 SCHECK_PARTIAL(); 2797 RRETURN(MATCH_NOMATCH); 2798 } 2799 Feptr++; 2800 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); 2801 } 2802 break; 2803 2804 case OP_ANYBYTE: 2805 if (Feptr > mb->end_subject - Lmin) 
RRETURN(MATCH_NOMATCH); 2806 Feptr += Lmin; 2807 break; 2808 2809 case OP_ANYNL: 2810 for (i = 1; i <= Lmin; i++) 2811 { 2812 if (Feptr >= mb->end_subject) 2813 { 2814 SCHECK_PARTIAL(); 2815 RRETURN(MATCH_NOMATCH); 2816 } 2817 GETCHARINC(fc, Feptr); 2818 switch(fc) 2819 { 2820 default: RRETURN(MATCH_NOMATCH); 2821 2822 case CHAR_CR: 2823 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; 2824 break; 2825 2826 case CHAR_LF: 2827 break; 2828 2829 case CHAR_VT: 2830 case CHAR_FF: 2831 case CHAR_NEL: 2832 #ifndef EBCDIC 2833 case 0x2028: 2834 case 0x2029: 2835 #endif /* Not EBCDIC */ 2836 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 2837 break; 2838 } 2839 } 2840 break; 2841 2842 case OP_NOT_HSPACE: 2843 for (i = 1; i <= Lmin; i++) 2844 { 2845 if (Feptr >= mb->end_subject) 2846 { 2847 SCHECK_PARTIAL(); 2848 RRETURN(MATCH_NOMATCH); 2849 } 2850 GETCHARINC(fc, Feptr); 2851 switch(fc) 2852 { 2853 HSPACE_CASES: RRETURN(MATCH_NOMATCH); 2854 default: break; 2855 } 2856 } 2857 break; 2858 2859 case OP_HSPACE: 2860 for (i = 1; i <= Lmin; i++) 2861 { 2862 if (Feptr >= mb->end_subject) 2863 { 2864 SCHECK_PARTIAL(); 2865 RRETURN(MATCH_NOMATCH); 2866 } 2867 GETCHARINC(fc, Feptr); 2868 switch(fc) 2869 { 2870 HSPACE_CASES: break; 2871 default: RRETURN(MATCH_NOMATCH); 2872 } 2873 } 2874 break; 2875 2876 case OP_NOT_VSPACE: 2877 for (i = 1; i <= Lmin; i++) 2878 { 2879 if (Feptr >= mb->end_subject) 2880 { 2881 SCHECK_PARTIAL(); 2882 RRETURN(MATCH_NOMATCH); 2883 } 2884 GETCHARINC(fc, Feptr); 2885 switch(fc) 2886 { 2887 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 2888 default: break; 2889 } 2890 } 2891 break; 2892 2893 case OP_VSPACE: 2894 for (i = 1; i <= Lmin; i++) 2895 { 2896 if (Feptr >= mb->end_subject) 2897 { 2898 SCHECK_PARTIAL(); 2899 RRETURN(MATCH_NOMATCH); 2900 } 2901 GETCHARINC(fc, Feptr); 2902 switch(fc) 2903 { 2904 VSPACE_CASES: break; 2905 default: RRETURN(MATCH_NOMATCH); 2906 } 2907 } 2908 break; 2909 2910 case OP_NOT_DIGIT: 2911 for (i = 
1; i <= Lmin; i++) 2912 { 2913 if (Feptr >= mb->end_subject) 2914 { 2915 SCHECK_PARTIAL(); 2916 RRETURN(MATCH_NOMATCH); 2917 } 2918 GETCHARINC(fc, Feptr); 2919 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0) 2920 RRETURN(MATCH_NOMATCH); 2921 } 2922 break; 2923 2924 case OP_DIGIT: 2925 for (i = 1; i <= Lmin; i++) 2926 { 2927 uint32_t cc; 2928 if (Feptr >= mb->end_subject) 2929 { 2930 SCHECK_PARTIAL(); 2931 RRETURN(MATCH_NOMATCH); 2932 } 2933 cc = UCHAR21(Feptr); 2934 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) 2935 RRETURN(MATCH_NOMATCH); 2936 Feptr++; 2937 /* No need to skip more code units - we know it has only one. */ 2938 } 2939 break; 2940 2941 case OP_NOT_WHITESPACE: 2942 for (i = 1; i <= Lmin; i++) 2943 { 2944 uint32_t cc; 2945 if (Feptr >= mb->end_subject) 2946 { 2947 SCHECK_PARTIAL(); 2948 RRETURN(MATCH_NOMATCH); 2949 } 2950 cc = UCHAR21(Feptr); 2951 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) 2952 RRETURN(MATCH_NOMATCH); 2953 Feptr++; 2954 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); 2955 } 2956 break; 2957 2958 case OP_WHITESPACE: 2959 for (i = 1; i <= Lmin; i++) 2960 { 2961 uint32_t cc; 2962 if (Feptr >= mb->end_subject) 2963 { 2964 SCHECK_PARTIAL(); 2965 RRETURN(MATCH_NOMATCH); 2966 } 2967 cc = UCHAR21(Feptr); 2968 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) 2969 RRETURN(MATCH_NOMATCH); 2970 Feptr++; 2971 /* No need to skip more code units - we know it has only one. 
*/ 2972 } 2973 break; 2974 2975 case OP_NOT_WORDCHAR: 2976 for (i = 1; i <= Lmin; i++) 2977 { 2978 uint32_t cc; 2979 if (Feptr >= mb->end_subject) 2980 { 2981 SCHECK_PARTIAL(); 2982 RRETURN(MATCH_NOMATCH); 2983 } 2984 cc = UCHAR21(Feptr); 2985 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) 2986 RRETURN(MATCH_NOMATCH); 2987 Feptr++; 2988 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); 2989 } 2990 break; 2991 2992 case OP_WORDCHAR: 2993 for (i = 1; i <= Lmin; i++) 2994 { 2995 uint32_t cc; 2996 if (Feptr >= mb->end_subject) 2997 { 2998 SCHECK_PARTIAL(); 2999 RRETURN(MATCH_NOMATCH); 3000 } 3001 cc = UCHAR21(Feptr); 3002 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) 3003 RRETURN(MATCH_NOMATCH); 3004 Feptr++; 3005 /* No need to skip more code units - we know it has only one. */ 3006 } 3007 break; 3008 3009 default: 3010 return PCRE2_ERROR_INTERNAL; 3011 } /* End switch(Lctype) */ 3012 3013 else 3014 #endif /* SUPPORT_UNICODE */ 3015 3016 /* Code for the non-UTF case for minimum matching of operators other 3017 than OP_PROP and OP_NOTPROP. */ 3018 3019 switch(Lctype) 3020 { 3021 case OP_ANY: 3022 for (i = 1; i <= Lmin; i++) 3023 { 3024 if (Feptr >= mb->end_subject) 3025 { 3026 SCHECK_PARTIAL(); 3027 RRETURN(MATCH_NOMATCH); 3028 } 3029 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); 3030 if (mb->partial != 0 && 3031 Feptr + 1 >= mb->end_subject && 3032 NLBLOCK->nltype == NLTYPE_FIXED && 3033 NLBLOCK->nllen == 2 && 3034 *Feptr == NLBLOCK->nl[0]) 3035 { 3036 mb->hitend = TRUE; 3037 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 3038 } 3039 Feptr++; 3040 } 3041 break; 3042 3043 case OP_ALLANY: 3044 if (Feptr > mb->end_subject - Lmin) 3045 { 3046 SCHECK_PARTIAL(); 3047 RRETURN(MATCH_NOMATCH); 3048 } 3049 Feptr += Lmin; 3050 break; 3051 3052 /* This OP_ANYBYTE case will never be reached because \C gets turned 3053 into OP_ALLANY in non-UTF mode. Cut out the code so that coverage 3054 reports don't complain about it's never being used. 
*/ 3055 3056 /* case OP_ANYBYTE: 3057 * if (Feptr > mb->end_subject - Lmin) 3058 * { 3059 * SCHECK_PARTIAL(); 3060 * RRETURN(MATCH_NOMATCH); 3061 * } 3062 * Feptr += Lmin; 3063 * break; 3064 */ 3065 case OP_ANYNL: 3066 for (i = 1; i <= Lmin; i++) 3067 { 3068 if (Feptr >= mb->end_subject) 3069 { 3070 SCHECK_PARTIAL(); 3071 RRETURN(MATCH_NOMATCH); 3072 } 3073 switch(*Feptr++) 3074 { 3075 default: RRETURN(MATCH_NOMATCH); 3076 3077 case CHAR_CR: 3078 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; 3079 break; 3080 3081 case CHAR_LF: 3082 break; 3083 3084 case CHAR_VT: 3085 case CHAR_FF: 3086 case CHAR_NEL: 3087 #if PCRE2_CODE_UNIT_WIDTH != 8 3088 case 0x2028: 3089 case 0x2029: 3090 #endif 3091 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); 3092 break; 3093 } 3094 } 3095 break; 3096 3097 case OP_NOT_HSPACE: 3098 for (i = 1; i <= Lmin; i++) 3099 { 3100 if (Feptr >= mb->end_subject) 3101 { 3102 SCHECK_PARTIAL(); 3103 RRETURN(MATCH_NOMATCH); 3104 } 3105 switch(*Feptr++) 3106 { 3107 default: break; 3108 HSPACE_BYTE_CASES: 3109 #if PCRE2_CODE_UNIT_WIDTH != 8 3110 HSPACE_MULTIBYTE_CASES: 3111 #endif 3112 RRETURN(MATCH_NOMATCH); 3113 } 3114 } 3115 break; 3116 3117 case OP_HSPACE: 3118 for (i = 1; i <= Lmin; i++) 3119 { 3120 if (Feptr >= mb->end_subject) 3121 { 3122 SCHECK_PARTIAL(); 3123 RRETURN(MATCH_NOMATCH); 3124 } 3125 switch(*Feptr++) 3126 { 3127 default: RRETURN(MATCH_NOMATCH); 3128 HSPACE_BYTE_CASES: 3129 #if PCRE2_CODE_UNIT_WIDTH != 8 3130 HSPACE_MULTIBYTE_CASES: 3131 #endif 3132 break; 3133 } 3134 } 3135 break; 3136 3137 case OP_NOT_VSPACE: 3138 for (i = 1; i <= Lmin; i++) 3139 { 3140 if (Feptr >= mb->end_subject) 3141 { 3142 SCHECK_PARTIAL(); 3143 RRETURN(MATCH_NOMATCH); 3144 } 3145 switch(*Feptr++) 3146 { 3147 VSPACE_BYTE_CASES: 3148 #if PCRE2_CODE_UNIT_WIDTH != 8 3149 VSPACE_MULTIBYTE_CASES: 3150 #endif 3151 RRETURN(MATCH_NOMATCH); 3152 default: break; 3153 } 3154 } 3155 break; 3156 3157 case OP_VSPACE: 3158 for (i = 1; i <= Lmin; 
i++) 3159 { 3160 if (Feptr >= mb->end_subject) 3161 { 3162 SCHECK_PARTIAL(); 3163 RRETURN(MATCH_NOMATCH); 3164 } 3165 switch(*Feptr++) 3166 { 3167 default: RRETURN(MATCH_NOMATCH); 3168 VSPACE_BYTE_CASES: 3169 #if PCRE2_CODE_UNIT_WIDTH != 8 3170 VSPACE_MULTIBYTE_CASES: 3171 #endif 3172 break; 3173 } 3174 } 3175 break; 3176 3177 case OP_NOT_DIGIT: 3178 for (i = 1; i <= Lmin; i++) 3179 { 3180 if (Feptr >= mb->end_subject) 3181 { 3182 SCHECK_PARTIAL(); 3183 RRETURN(MATCH_NOMATCH); 3184 } 3185 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) 3186 RRETURN(MATCH_NOMATCH); 3187 Feptr++; 3188 } 3189 break; 3190 3191 case OP_DIGIT: 3192 for (i = 1; i <= Lmin; i++) 3193 { 3194 if (Feptr >= mb->end_subject) 3195 { 3196 SCHECK_PARTIAL(); 3197 RRETURN(MATCH_NOMATCH); 3198 } 3199 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) 3200 RRETURN(MATCH_NOMATCH); 3201 Feptr++; 3202 } 3203 break; 3204 3205 case OP_NOT_WHITESPACE: 3206 for (i = 1; i <= Lmin; i++) 3207 { 3208 if (Feptr >= mb->end_subject) 3209 { 3210 SCHECK_PARTIAL(); 3211 RRETURN(MATCH_NOMATCH); 3212 } 3213 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) 3214 RRETURN(MATCH_NOMATCH); 3215 Feptr++; 3216 } 3217 break; 3218 3219 case OP_WHITESPACE: 3220 for (i = 1; i <= Lmin; i++) 3221 { 3222 if (Feptr >= mb->end_subject) 3223 { 3224 SCHECK_PARTIAL(); 3225 RRETURN(MATCH_NOMATCH); 3226 } 3227 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) 3228 RRETURN(MATCH_NOMATCH); 3229 Feptr++; 3230 } 3231 break; 3232 3233 case OP_NOT_WORDCHAR: 3234 for (i = 1; i <= Lmin; i++) 3235 { 3236 if (Feptr >= mb->end_subject) 3237 { 3238 SCHECK_PARTIAL(); 3239 RRETURN(MATCH_NOMATCH); 3240 } 3241 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) 3242 RRETURN(MATCH_NOMATCH); 3243 Feptr++; 3244 } 3245 break; 3246 3247 case OP_WORDCHAR: 3248 for (i = 1; i <= Lmin; i++) 3249 { 3250 if (Feptr >= mb->end_subject) 3251 { 3252 SCHECK_PARTIAL(); 3253 RRETURN(MATCH_NOMATCH); 3254 } 
3255 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) 3256 RRETURN(MATCH_NOMATCH); 3257 Feptr++; 3258 } 3259 break; 3260 3261 default: 3262 return PCRE2_ERROR_INTERNAL; 3263 } 3264 } 3265 3266 /* If Lmin = Lmax we are done. Continue with the main loop. */ 3267 3268 if (Lmin == Lmax) continue; 3269 3270 /* If minimizing, we have to test the rest of the pattern before each 3271 subsequent match. */ 3272 3273 if (reptype == REPTYPE_MIN) 3274 { 3275 #ifdef SUPPORT_UNICODE 3276 if (proptype >= 0) 3277 { 3278 switch(proptype) 3279 { 3280 case PT_ANY: 3281 for (;;) 3282 { 3283 RMATCH(Fecode, RM208); 3284 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3285 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3286 if (Feptr >= mb->end_subject) 3287 { 3288 SCHECK_PARTIAL(); 3289 RRETURN(MATCH_NOMATCH); 3290 } 3291 GETCHARINCTEST(fc, Feptr); 3292 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 3293 } 3294 /* Control never gets here */ 3295 3296 case PT_LAMP: 3297 for (;;) 3298 { 3299 int chartype; 3300 RMATCH(Fecode, RM209); 3301 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3302 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3303 if (Feptr >= mb->end_subject) 3304 { 3305 SCHECK_PARTIAL(); 3306 RRETURN(MATCH_NOMATCH); 3307 } 3308 GETCHARINCTEST(fc, Feptr); 3309 chartype = UCD_CHARTYPE(fc); 3310 if ((chartype == ucp_Lu || 3311 chartype == ucp_Ll || 3312 chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) 3313 RRETURN(MATCH_NOMATCH); 3314 } 3315 /* Control never gets here */ 3316 3317 case PT_GC: 3318 for (;;) 3319 { 3320 RMATCH(Fecode, RM210); 3321 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3322 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3323 if (Feptr >= mb->end_subject) 3324 { 3325 SCHECK_PARTIAL(); 3326 RRETURN(MATCH_NOMATCH); 3327 } 3328 GETCHARINCTEST(fc, Feptr); 3329 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 3330 RRETURN(MATCH_NOMATCH); 3331 } 3332 /* Control never gets here */ 3333 3334 case PT_PC: 3335 for (;;) 3336 { 3337 RMATCH(Fecode, RM211); 3338 if (rrc != 
MATCH_NOMATCH) RRETURN(rrc); 3339 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3340 if (Feptr >= mb->end_subject) 3341 { 3342 SCHECK_PARTIAL(); 3343 RRETURN(MATCH_NOMATCH); 3344 } 3345 GETCHARINCTEST(fc, Feptr); 3346 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 3347 RRETURN(MATCH_NOMATCH); 3348 } 3349 /* Control never gets here */ 3350 3351 case PT_SC: 3352 for (;;) 3353 { 3354 RMATCH(Fecode, RM212); 3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3356 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3357 if (Feptr >= mb->end_subject) 3358 { 3359 SCHECK_PARTIAL(); 3360 RRETURN(MATCH_NOMATCH); 3361 } 3362 GETCHARINCTEST(fc, Feptr); 3363 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 3364 RRETURN(MATCH_NOMATCH); 3365 } 3366 /* Control never gets here */ 3367 3368 case PT_ALNUM: 3369 for (;;) 3370 { 3371 int category; 3372 RMATCH(Fecode, RM213); 3373 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3374 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3375 if (Feptr >= mb->end_subject) 3376 { 3377 SCHECK_PARTIAL(); 3378 RRETURN(MATCH_NOMATCH); 3379 } 3380 GETCHARINCTEST(fc, Feptr); 3381 category = UCD_CATEGORY(fc); 3382 if ((category == ucp_L || category == ucp_N) == 3383 (Lctype == OP_NOTPROP)) 3384 RRETURN(MATCH_NOMATCH); 3385 } 3386 /* Control never gets here */ 3387 3388 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 3389 which means that Perl space and POSIX space are now identical. PCRE 3390 was changed at release 8.34. 
*/ 3391 3392 case PT_SPACE: /* Perl space */ 3393 case PT_PXSPACE: /* POSIX space */ 3394 for (;;) 3395 { 3396 RMATCH(Fecode, RM214); 3397 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3398 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3399 if (Feptr >= mb->end_subject) 3400 { 3401 SCHECK_PARTIAL(); 3402 RRETURN(MATCH_NOMATCH); 3403 } 3404 GETCHARINCTEST(fc, Feptr); 3405 switch(fc) 3406 { 3407 HSPACE_CASES: 3408 VSPACE_CASES: 3409 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 3410 break; 3411 3412 default: 3413 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) 3414 RRETURN(MATCH_NOMATCH); 3415 break; 3416 } 3417 } 3418 /* Control never gets here */ 3419 3420 case PT_WORD: 3421 for (;;) 3422 { 3423 int category; 3424 RMATCH(Fecode, RM215); 3425 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3426 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3427 if (Feptr >= mb->end_subject) 3428 { 3429 SCHECK_PARTIAL(); 3430 RRETURN(MATCH_NOMATCH); 3431 } 3432 GETCHARINCTEST(fc, Feptr); 3433 category = UCD_CATEGORY(fc); 3434 if ((category == ucp_L || 3435 category == ucp_N || 3436 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) 3437 RRETURN(MATCH_NOMATCH); 3438 } 3439 /* Control never gets here */ 3440 3441 case PT_CLIST: 3442 for (;;) 3443 { 3444 const uint32_t *cp; 3445 RMATCH(Fecode, RM216); 3446 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3447 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3448 if (Feptr >= mb->end_subject) 3449 { 3450 SCHECK_PARTIAL(); 3451 RRETURN(MATCH_NOMATCH); 3452 } 3453 GETCHARINCTEST(fc, Feptr); 3454 cp = PRIV(ucd_caseless_sets) + Lpropvalue; 3455 for (;;) 3456 { 3457 if (fc < *cp) 3458 { 3459 if (Lctype == OP_NOTPROP) break; 3460 RRETURN(MATCH_NOMATCH); 3461 } 3462 if (fc == *cp++) 3463 { 3464 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 3465 break; 3466 } 3467 } 3468 } 3469 /* Control never gets here */ 3470 3471 case PT_UCNC: 3472 for (;;) 3473 { 3474 RMATCH(Fecode, RM217); 3475 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3476 if (Lmin++ >= Lmax) 
RRETURN(MATCH_NOMATCH); 3477 if (Feptr >= mb->end_subject) 3478 { 3479 SCHECK_PARTIAL(); 3480 RRETURN(MATCH_NOMATCH); 3481 } 3482 GETCHARINCTEST(fc, Feptr); 3483 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || 3484 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || 3485 fc >= 0xe000) == (Lctype == OP_NOTPROP)) 3486 RRETURN(MATCH_NOMATCH); 3487 } 3488 /* Control never gets here */ 3489 3490 /* This should never occur */ 3491 default: 3492 return PCRE2_ERROR_INTERNAL; 3493 } 3494 } 3495 3496 /* Match extended Unicode sequences. We will get here only if the 3497 support is in the binary; otherwise a compile-time error occurs. */ 3498 3499 else if (Lctype == OP_EXTUNI) 3500 { 3501 for (;;) 3502 { 3503 RMATCH(Fecode, RM218); 3504 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3505 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3506 if (Feptr >= mb->end_subject) 3507 { 3508 SCHECK_PARTIAL(); 3509 RRETURN(MATCH_NOMATCH); 3510 } 3511 else 3512 { 3513 GETCHARINCTEST(fc, Feptr); 3514 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, 3515 utf, NULL); 3516 } 3517 CHECK_PARTIAL(); 3518 } 3519 } 3520 else 3521 #endif /* SUPPORT_UNICODE */ 3522 3523 /* UTF mode for non-property testing character types. 
*/ 3524 3525 #ifdef SUPPORT_UNICODE 3526 if (utf) 3527 { 3528 for (;;) 3529 { 3530 RMATCH(Fecode, RM219); 3531 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3532 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3533 if (Feptr >= mb->end_subject) 3534 { 3535 SCHECK_PARTIAL(); 3536 RRETURN(MATCH_NOMATCH); 3537 } 3538 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); 3539 GETCHARINC(fc, Feptr); 3540 switch(Lctype) 3541 { 3542 case OP_ANY: /* This is the non-NL case */ 3543 if (mb->partial != 0 && /* Take care with CRLF partial */ 3544 Feptr >= mb->end_subject && 3545 NLBLOCK->nltype == NLTYPE_FIXED && 3546 NLBLOCK->nllen == 2 && 3547 fc == NLBLOCK->nl[0]) 3548 { 3549 mb->hitend = TRUE; 3550 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 3551 } 3552 break; 3553 3554 case OP_ALLANY: 3555 case OP_ANYBYTE: 3556 break; 3557 3558 case OP_ANYNL: 3559 switch(fc) 3560 { 3561 default: RRETURN(MATCH_NOMATCH); 3562 3563 case CHAR_CR: 3564 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; 3565 break; 3566 3567 case CHAR_LF: 3568 break; 3569 3570 case CHAR_VT: 3571 case CHAR_FF: 3572 case CHAR_NEL: 3573 #ifndef EBCDIC 3574 case 0x2028: 3575 case 0x2029: 3576 #endif /* Not EBCDIC */ 3577 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) 3578 RRETURN(MATCH_NOMATCH); 3579 break; 3580 } 3581 break; 3582 3583 case OP_NOT_HSPACE: 3584 switch(fc) 3585 { 3586 HSPACE_CASES: RRETURN(MATCH_NOMATCH); 3587 default: break; 3588 } 3589 break; 3590 3591 case OP_HSPACE: 3592 switch(fc) 3593 { 3594 HSPACE_CASES: break; 3595 default: RRETURN(MATCH_NOMATCH); 3596 } 3597 break; 3598 3599 case OP_NOT_VSPACE: 3600 switch(fc) 3601 { 3602 VSPACE_CASES: RRETURN(MATCH_NOMATCH); 3603 default: break; 3604 } 3605 break; 3606 3607 case OP_VSPACE: 3608 switch(fc) 3609 { 3610 VSPACE_CASES: break; 3611 default: RRETURN(MATCH_NOMATCH); 3612 } 3613 break; 3614 3615 case OP_NOT_DIGIT: 3616 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) 3617 RRETURN(MATCH_NOMATCH); 3618 break; 3619 3620 case 
OP_DIGIT: 3621 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0) 3622 RRETURN(MATCH_NOMATCH); 3623 break; 3624 3625 case OP_NOT_WHITESPACE: 3626 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) 3627 RRETURN(MATCH_NOMATCH); 3628 break; 3629 3630 case OP_WHITESPACE: 3631 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0) 3632 RRETURN(MATCH_NOMATCH); 3633 break; 3634 3635 case OP_NOT_WORDCHAR: 3636 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) 3637 RRETURN(MATCH_NOMATCH); 3638 break; 3639 3640 case OP_WORDCHAR: 3641 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) 3642 RRETURN(MATCH_NOMATCH); 3643 break; 3644 3645 default: 3646 return PCRE2_ERROR_INTERNAL; 3647 } 3648 } 3649 } 3650 else 3651 #endif /* SUPPORT_UNICODE */ 3652 3653 /* Not UTF mode */ 3654 { 3655 for (;;) 3656 { 3657 RMATCH(Fecode, RM33); 3658 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3659 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 3660 if (Feptr >= mb->end_subject) 3661 { 3662 SCHECK_PARTIAL(); 3663 RRETURN(MATCH_NOMATCH); 3664 } 3665 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) 3666 RRETURN(MATCH_NOMATCH); 3667 fc = *Feptr++; 3668 switch(Lctype) 3669 { 3670 case OP_ANY: /* This is the non-NL case */ 3671 if (mb->partial != 0 && /* Take care with CRLF partial */ 3672 Feptr >= mb->end_subject && 3673 NLBLOCK->nltype == NLTYPE_FIXED && 3674 NLBLOCK->nllen == 2 && 3675 fc == NLBLOCK->nl[0]) 3676 { 3677 mb->hitend = TRUE; 3678 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 3679 } 3680 break; 3681 3682 case OP_ALLANY: 3683 case OP_ANYBYTE: 3684 break; 3685 3686 case OP_ANYNL: 3687 switch(fc) 3688 { 3689 default: RRETURN(MATCH_NOMATCH); 3690 3691 case CHAR_CR: 3692 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; 3693 break; 3694 3695 case CHAR_LF: 3696 break; 3697 3698 case CHAR_VT: 3699 case CHAR_FF: 3700 case CHAR_NEL: 3701 #if PCRE2_CODE_UNIT_WIDTH != 8 3702 case 0x2028: 3703 case 0x2029: 3704 #endif 3705 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) 3706 
RRETURN(MATCH_NOMATCH); 3707 break; 3708 } 3709 break; 3710 3711 case OP_NOT_HSPACE: 3712 switch(fc) 3713 { 3714 default: break; 3715 HSPACE_BYTE_CASES: 3716 #if PCRE2_CODE_UNIT_WIDTH != 8 3717 HSPACE_MULTIBYTE_CASES: 3718 #endif 3719 RRETURN(MATCH_NOMATCH); 3720 } 3721 break; 3722 3723 case OP_HSPACE: 3724 switch(fc) 3725 { 3726 default: RRETURN(MATCH_NOMATCH); 3727 HSPACE_BYTE_CASES: 3728 #if PCRE2_CODE_UNIT_WIDTH != 8 3729 HSPACE_MULTIBYTE_CASES: 3730 #endif 3731 break; 3732 } 3733 break; 3734 3735 case OP_NOT_VSPACE: 3736 switch(fc) 3737 { 3738 default: break; 3739 VSPACE_BYTE_CASES: 3740 #if PCRE2_CODE_UNIT_WIDTH != 8 3741 VSPACE_MULTIBYTE_CASES: 3742 #endif 3743 RRETURN(MATCH_NOMATCH); 3744 } 3745 break; 3746 3747 case OP_VSPACE: 3748 switch(fc) 3749 { 3750 default: RRETURN(MATCH_NOMATCH); 3751 VSPACE_BYTE_CASES: 3752 #if PCRE2_CODE_UNIT_WIDTH != 8 3753 VSPACE_MULTIBYTE_CASES: 3754 #endif 3755 break; 3756 } 3757 break; 3758 3759 case OP_NOT_DIGIT: 3760 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) 3761 RRETURN(MATCH_NOMATCH); 3762 break; 3763 3764 case OP_DIGIT: 3765 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) 3766 RRETURN(MATCH_NOMATCH); 3767 break; 3768 3769 case OP_NOT_WHITESPACE: 3770 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) 3771 RRETURN(MATCH_NOMATCH); 3772 break; 3773 3774 case OP_WHITESPACE: 3775 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) 3776 RRETURN(MATCH_NOMATCH); 3777 break; 3778 3779 case OP_NOT_WORDCHAR: 3780 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) 3781 RRETURN(MATCH_NOMATCH); 3782 break; 3783 3784 case OP_WORDCHAR: 3785 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) 3786 RRETURN(MATCH_NOMATCH); 3787 break; 3788 3789 default: 3790 return PCRE2_ERROR_INTERNAL; 3791 } 3792 } 3793 } 3794 /* Control never gets here */ 3795 } 3796 3797 /* If maximizing, it is worth using inline code for speed, doing the type 3798 test once at the start (i.e. keep it out of the loop). 
*/ 3799 3800 else 3801 { 3802 Lstart_eptr = Feptr; /* Remember where we started */ 3803 3804 #ifdef SUPPORT_UNICODE 3805 if (proptype >= 0) 3806 { 3807 switch(proptype) 3808 { 3809 case PT_ANY: 3810 for (i = Lmin; i < Lmax; i++) 3811 { 3812 int len = 1; 3813 if (Feptr >= mb->end_subject) 3814 { 3815 SCHECK_PARTIAL(); 3816 break; 3817 } 3818 GETCHARLENTEST(fc, Feptr, len); 3819 if (Lctype == OP_NOTPROP) break; 3820 Feptr+= len; 3821 } 3822 break; 3823 3824 case PT_LAMP: 3825 for (i = Lmin; i < Lmax; i++) 3826 { 3827 int chartype; 3828 int len = 1; 3829 if (Feptr >= mb->end_subject) 3830 { 3831 SCHECK_PARTIAL(); 3832 break; 3833 } 3834 GETCHARLENTEST(fc, Feptr, len); 3835 chartype = UCD_CHARTYPE(fc); 3836 if ((chartype == ucp_Lu || 3837 chartype == ucp_Ll || 3838 chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) 3839 break; 3840 Feptr+= len; 3841 } 3842 break; 3843 3844 case PT_GC: 3845 for (i = Lmin; i < Lmax; i++) 3846 { 3847 int len = 1; 3848 if (Feptr >= mb->end_subject) 3849 { 3850 SCHECK_PARTIAL(); 3851 break; 3852 } 3853 GETCHARLENTEST(fc, Feptr, len); 3854 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 3855 break; 3856 Feptr+= len; 3857 } 3858 break; 3859 3860 case PT_PC: 3861 for (i = Lmin; i < Lmax; i++) 3862 { 3863 int len = 1; 3864 if (Feptr >= mb->end_subject) 3865 { 3866 SCHECK_PARTIAL(); 3867 break; 3868 } 3869 GETCHARLENTEST(fc, Feptr, len); 3870 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 3871 break; 3872 Feptr+= len; 3873 } 3874 break; 3875 3876 case PT_SC: 3877 for (i = Lmin; i < Lmax; i++) 3878 { 3879 int len = 1; 3880 if (Feptr >= mb->end_subject) 3881 { 3882 SCHECK_PARTIAL(); 3883 break; 3884 } 3885 GETCHARLENTEST(fc, Feptr, len); 3886 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) 3887 break; 3888 Feptr+= len; 3889 } 3890 break; 3891 3892 case PT_ALNUM: 3893 for (i = Lmin; i < Lmax; i++) 3894 { 3895 int category; 3896 int len = 1; 3897 if (Feptr >= mb->end_subject) 3898 { 3899 SCHECK_PARTIAL(); 
3900 break; 3901 } 3902 GETCHARLENTEST(fc, Feptr, len); 3903 category = UCD_CATEGORY(fc); 3904 if ((category == ucp_L || category == ucp_N) == 3905 (Lctype == OP_NOTPROP)) 3906 break; 3907 Feptr+= len; 3908 } 3909 break; 3910 3911 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 3912 which means that Perl space and POSIX space are now identical. PCRE 3913 was changed at release 8.34. */ 3914 3915 case PT_SPACE: /* Perl space */ 3916 case PT_PXSPACE: /* POSIX space */ 3917 for (i = Lmin; i < Lmax; i++) 3918 { 3919 int len = 1; 3920 if (Feptr >= mb->end_subject) 3921 { 3922 SCHECK_PARTIAL(); 3923 break; 3924 } 3925 GETCHARLENTEST(fc, Feptr, len); 3926 switch(fc) 3927 { 3928 HSPACE_CASES: 3929 VSPACE_CASES: 3930 if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */ 3931 break; 3932 3933 default: 3934 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) 3935 goto ENDLOOP99; /* Break the loop */ 3936 break; 3937 } 3938 Feptr+= len; 3939 } 3940 ENDLOOP99: 3941 break; 3942 3943 case PT_WORD: 3944 for (i = Lmin; i < Lmax; i++) 3945 { 3946 int category; 3947 int len = 1; 3948 if (Feptr >= mb->end_subject) 3949 { 3950 SCHECK_PARTIAL(); 3951 break; 3952 } 3953 GETCHARLENTEST(fc, Feptr, len); 3954 category = UCD_CATEGORY(fc); 3955 if ((category == ucp_L || category == ucp_N || 3956 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) 3957 break; 3958 Feptr+= len; 3959 } 3960 break; 3961 3962 case PT_CLIST: 3963 for (i = Lmin; i < Lmax; i++) 3964 { 3965 const uint32_t *cp; 3966 int len = 1; 3967 if (Feptr >= mb->end_subject) 3968 { 3969 SCHECK_PARTIAL(); 3970 break; 3971 } 3972 GETCHARLENTEST(fc, Feptr, len); 3973 cp = PRIV(ucd_caseless_sets) + Lpropvalue; 3974 for (;;) 3975 { 3976 if (fc < *cp) 3977 { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; } 3978 if (fc == *cp++) 3979 { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; } 3980 } 3981 Feptr += len; 3982 } 3983 GOT_MAX: 3984 break; 3985 3986 case PT_UCNC: 3987 for (i = Lmin; i < 
Lmax; i++) 3988 { 3989 int len = 1; 3990 if (Feptr >= mb->end_subject) 3991 { 3992 SCHECK_PARTIAL(); 3993 break; 3994 } 3995 GETCHARLENTEST(fc, Feptr, len); 3996 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || 3997 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || 3998 fc >= 0xe000) == (Lctype == OP_NOTPROP)) 3999 break; 4000 Feptr += len; 4001 } 4002 break; 4003 4004 default: 4005 return PCRE2_ERROR_INTERNAL; 4006 } 4007 4008 /* Feptr is now past the end of the maximum run */ 4009 4010 if (reptype == REPTYPE_POS) continue; /* No backtracking */ 4011 4012 /* After \C in UTF mode, Lstart_eptr might be in the middle of a 4013 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't 4014 go too far. */ 4015 4016 for(;;) 4017 { 4018 if (Feptr <= Lstart_eptr) break; 4019 RMATCH(Fecode, RM222); 4020 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4021 Feptr--; 4022 if (utf) BACKCHAR(Feptr); 4023 } 4024 } 4025 4026 /* Match extended Unicode grapheme clusters. We will get here only if the 4027 support is in the binary; otherwise a compile-time error occurs. */ 4028 4029 else if (Lctype == OP_EXTUNI) 4030 { 4031 for (i = Lmin; i < Lmax; i++) 4032 { 4033 if (Feptr >= mb->end_subject) 4034 { 4035 SCHECK_PARTIAL(); 4036 break; 4037 } 4038 else 4039 { 4040 GETCHARINCTEST(fc, Feptr); 4041 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, 4042 utf, NULL); 4043 } 4044 CHECK_PARTIAL(); 4045 } 4046 4047 /* Feptr is now past the end of the maximum run */ 4048 4049 if (reptype == REPTYPE_POS) continue; /* No backtracking */ 4050 4051 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start 4052 of the run while backtracking because the use of \C in UTF mode can 4053 cause BACKCHAR to move back past Lstart_eptr. This is just palliative; 4054 the use of \C in UTF mode is fraught with danger. 
*/ 4055 4056 for(;;) 4057 { 4058 int lgb, rgb; 4059 PCRE2_SPTR fptr; 4060 4061 if (Feptr <= Lstart_eptr) break; /* At start of char run */ 4062 RMATCH(Fecode, RM220); 4063 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4064 4065 /* Backtracking over an extended grapheme cluster involves inspecting 4066 the previous two characters (if present) to see if a break is 4067 permitted between them. */ 4068 4069 Feptr--; 4070 if (!utf) fc = *Feptr; else 4071 { 4072 BACKCHAR(Feptr); 4073 GETCHAR(fc, Feptr); 4074 } 4075 rgb = UCD_GRAPHBREAK(fc); 4076 4077 for (;;) 4078 { 4079 if (Feptr <= Lstart_eptr) break; /* At start of char run */ 4080 fptr = Feptr - 1; 4081 if (!utf) fc = *fptr; else 4082 { 4083 BACKCHAR(fptr); 4084 GETCHAR(fc, fptr); 4085 } 4086 lgb = UCD_GRAPHBREAK(fc); 4087 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; 4088 Feptr = fptr; 4089 rgb = lgb; 4090 } 4091 } 4092 } 4093 4094 else 4095 #endif /* SUPPORT_UNICODE */ 4096 4097 #ifdef SUPPORT_UNICODE 4098 if (utf) 4099 { 4100 switch(Lctype) 4101 { 4102 case OP_ANY: 4103 for (i = Lmin; i < Lmax; i++) 4104 { 4105 if (Feptr >= mb->end_subject) 4106 { 4107 SCHECK_PARTIAL(); 4108 break; 4109 } 4110 if (IS_NEWLINE(Feptr)) break; 4111 if (mb->partial != 0 && /* Take care with CRLF partial */ 4112 Feptr + 1 >= mb->end_subject && 4113 NLBLOCK->nltype == NLTYPE_FIXED && 4114 NLBLOCK->nllen == 2 && 4115 UCHAR21(Feptr) == NLBLOCK->nl[0]) 4116 { 4117 mb->hitend = TRUE; 4118 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 4119 } 4120 Feptr++; 4121 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); 4122 } 4123 break; 4124 4125 case OP_ALLANY: 4126 if (Lmax < UINT32_MAX) 4127 { 4128 for (i = Lmin; i < Lmax; i++) 4129 { 4130 if (Feptr >= mb->end_subject) 4131 { 4132 SCHECK_PARTIAL(); 4133 break; 4134 } 4135 Feptr++; 4136 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); 4137 } 4138 } 4139 else 4140 { 4141 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */ 4142 SCHECK_PARTIAL(); 4143 } 4144 break; 4145 4146 /* The 
"byte" (i.e. "code unit") case is the same as non-UTF */ 4147 4148 case OP_ANYBYTE: 4149 fc = Lmax - Lmin; 4150 if (fc > (uint32_t)(mb->end_subject - Feptr)) 4151 { 4152 Feptr = mb->end_subject; 4153 SCHECK_PARTIAL(); 4154 } 4155 else Feptr += fc; 4156 break; 4157 4158 case OP_ANYNL: 4159 for (i = Lmin; i < Lmax; i++) 4160 { 4161 int len = 1; 4162 if (Feptr >= mb->end_subject) 4163 { 4164 SCHECK_PARTIAL(); 4165 break; 4166 } 4167 GETCHARLEN(fc, Feptr, len); 4168 if (fc == CHAR_CR) 4169 { 4170 if (++Feptr >= mb->end_subject) break; 4171 if (UCHAR21(Feptr) == CHAR_LF) Feptr++; 4172 } 4173 else 4174 { 4175 if (fc != CHAR_LF && 4176 (mb->bsr_convention == PCRE2_BSR_ANYCRLF || 4177 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL 4178 #ifndef EBCDIC 4179 && fc != 0x2028 && fc != 0x2029 4180 #endif /* Not EBCDIC */ 4181 ))) 4182 break; 4183 Feptr += len; 4184 } 4185 } 4186 break; 4187 4188 case OP_NOT_HSPACE: 4189 case OP_HSPACE: 4190 for (i = Lmin; i < Lmax; i++) 4191 { 4192 BOOL gotspace; 4193 int len = 1; 4194 if (Feptr >= mb->end_subject) 4195 { 4196 SCHECK_PARTIAL(); 4197 break; 4198 } 4199 GETCHARLEN(fc, Feptr, len); 4200 switch(fc) 4201 { 4202 HSPACE_CASES: gotspace = TRUE; break; 4203 default: gotspace = FALSE; break; 4204 } 4205 if (gotspace == (Lctype == OP_NOT_HSPACE)) break; 4206 Feptr += len; 4207 } 4208 break; 4209 4210 case OP_NOT_VSPACE: 4211 case OP_VSPACE: 4212 for (i = Lmin; i < Lmax; i++) 4213 { 4214 BOOL gotspace; 4215 int len = 1; 4216 if (Feptr >= mb->end_subject) 4217 { 4218 SCHECK_PARTIAL(); 4219 break; 4220 } 4221 GETCHARLEN(fc, Feptr, len); 4222 switch(fc) 4223 { 4224 VSPACE_CASES: gotspace = TRUE; break; 4225 default: gotspace = FALSE; break; 4226 } 4227 if (gotspace == (Lctype == OP_NOT_VSPACE)) break; 4228 Feptr += len; 4229 } 4230 break; 4231 4232 case OP_NOT_DIGIT: 4233 for (i = Lmin; i < Lmax; i++) 4234 { 4235 int len = 1; 4236 if (Feptr >= mb->end_subject) 4237 { 4238 SCHECK_PARTIAL(); 4239 break; 4240 } 4241 GETCHARLEN(fc, Feptr, len); 
4242 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break; 4243 Feptr+= len; 4244 } 4245 break; 4246 4247 case OP_DIGIT: 4248 for (i = Lmin; i < Lmax; i++) 4249 { 4250 int len = 1; 4251 if (Feptr >= mb->end_subject) 4252 { 4253 SCHECK_PARTIAL(); 4254 break; 4255 } 4256 GETCHARLEN(fc, Feptr, len); 4257 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break; 4258 Feptr+= len; 4259 } 4260 break; 4261 4262 case OP_NOT_WHITESPACE: 4263 for (i = Lmin; i < Lmax; i++) 4264 { 4265 int len = 1; 4266 if (Feptr >= mb->end_subject) 4267 { 4268 SCHECK_PARTIAL(); 4269 break; 4270 } 4271 GETCHARLEN(fc, Feptr, len); 4272 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break; 4273 Feptr+= len; 4274 } 4275 break; 4276 4277 case OP_WHITESPACE: 4278 for (i = Lmin; i < Lmax; i++) 4279 { 4280 int len = 1; 4281 if (Feptr >= mb->end_subject) 4282 { 4283 SCHECK_PARTIAL(); 4284 break; 4285 } 4286 GETCHARLEN(fc, Feptr, len); 4287 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break; 4288 Feptr+= len; 4289 } 4290 break; 4291 4292 case OP_NOT_WORDCHAR: 4293 for (i = Lmin; i < Lmax; i++) 4294 { 4295 int len = 1; 4296 if (Feptr >= mb->end_subject) 4297 { 4298 SCHECK_PARTIAL(); 4299 break; 4300 } 4301 GETCHARLEN(fc, Feptr, len); 4302 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break; 4303 Feptr+= len; 4304 } 4305 break; 4306 4307 case OP_WORDCHAR: 4308 for (i = Lmin; i < Lmax; i++) 4309 { 4310 int len = 1; 4311 if (Feptr >= mb->end_subject) 4312 { 4313 SCHECK_PARTIAL(); 4314 break; 4315 } 4316 GETCHARLEN(fc, Feptr, len); 4317 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break; 4318 Feptr+= len; 4319 } 4320 break; 4321 4322 default: 4323 return PCRE2_ERROR_INTERNAL; 4324 } 4325 4326 if (reptype == REPTYPE_POS) continue; /* No backtracking */ 4327 4328 /* After \C in UTF mode, Lstart_eptr might be in the middle of a 4329 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go 4330 too far. 
*/ 4331 4332 for(;;) 4333 { 4334 if (Feptr <= Lstart_eptr) break; 4335 RMATCH(Fecode, RM221); 4336 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4337 Feptr--; 4338 BACKCHAR(Feptr); 4339 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && 4340 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR) 4341 Feptr--; 4342 } 4343 } 4344 else 4345 #endif /* SUPPORT_UNICODE */ 4346 4347 /* Not UTF mode */ 4348 { 4349 switch(Lctype) 4350 { 4351 case OP_ANY: 4352 for (i = Lmin; i < Lmax; i++) 4353 { 4354 if (Feptr >= mb->end_subject) 4355 { 4356 SCHECK_PARTIAL(); 4357 break; 4358 } 4359 if (IS_NEWLINE(Feptr)) break; 4360 if (mb->partial != 0 && /* Take care with CRLF partial */ 4361 Feptr + 1 >= mb->end_subject && 4362 NLBLOCK->nltype == NLTYPE_FIXED && 4363 NLBLOCK->nllen == 2 && 4364 *Feptr == NLBLOCK->nl[0]) 4365 { 4366 mb->hitend = TRUE; 4367 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 4368 } 4369 Feptr++; 4370 } 4371 break; 4372 4373 case OP_ALLANY: 4374 case OP_ANYBYTE: 4375 fc = Lmax - Lmin; 4376 if (fc > (uint32_t)(mb->end_subject - Feptr)) 4377 { 4378 Feptr = mb->end_subject; 4379 SCHECK_PARTIAL(); 4380 } 4381 else Feptr += fc; 4382 break; 4383 4384 case OP_ANYNL: 4385 for (i = Lmin; i < Lmax; i++) 4386 { 4387 if (Feptr >= mb->end_subject) 4388 { 4389 SCHECK_PARTIAL(); 4390 break; 4391 } 4392 fc = *Feptr; 4393 if (fc == CHAR_CR) 4394 { 4395 if (++Feptr >= mb->end_subject) break; 4396 if (*Feptr == CHAR_LF) Feptr++; 4397 } 4398 else 4399 { 4400 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || 4401 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL 4402 #if PCRE2_CODE_UNIT_WIDTH != 8 4403 && fc != 0x2028 && fc != 0x2029 4404 #endif 4405 ))) break; 4406 Feptr++; 4407 } 4408 } 4409 break; 4410 4411 case OP_NOT_HSPACE: 4412 for (i = Lmin; i < Lmax; i++) 4413 { 4414 if (Feptr >= mb->end_subject) 4415 { 4416 SCHECK_PARTIAL(); 4417 break; 4418 } 4419 switch(*Feptr) 4420 { 4421 default: Feptr++; break; 4422 HSPACE_BYTE_CASES: 4423 #if PCRE2_CODE_UNIT_WIDTH != 8 
4424 HSPACE_MULTIBYTE_CASES: 4425 #endif 4426 goto ENDLOOP00; 4427 } 4428 } 4429 ENDLOOP00: 4430 break; 4431 4432 case OP_HSPACE: 4433 for (i = Lmin; i < Lmax; i++) 4434 { 4435 if (Feptr >= mb->end_subject) 4436 { 4437 SCHECK_PARTIAL(); 4438 break; 4439 } 4440 switch(*Feptr) 4441 { 4442 default: goto ENDLOOP01; 4443 HSPACE_BYTE_CASES: 4444 #if PCRE2_CODE_UNIT_WIDTH != 8 4445 HSPACE_MULTIBYTE_CASES: 4446 #endif 4447 Feptr++; break; 4448 } 4449 } 4450 ENDLOOP01: 4451 break; 4452 4453 case OP_NOT_VSPACE: 4454 for (i = Lmin; i < Lmax; i++) 4455 { 4456 if (Feptr >= mb->end_subject) 4457 { 4458 SCHECK_PARTIAL(); 4459 break; 4460 } 4461 switch(*Feptr) 4462 { 4463 default: Feptr++; break; 4464 VSPACE_BYTE_CASES: 4465 #if PCRE2_CODE_UNIT_WIDTH != 8 4466 VSPACE_MULTIBYTE_CASES: 4467 #endif 4468 goto ENDLOOP02; 4469 } 4470 } 4471 ENDLOOP02: 4472 break; 4473 4474 case OP_VSPACE: 4475 for (i = Lmin; i < Lmax; i++) 4476 { 4477 if (Feptr >= mb->end_subject) 4478 { 4479 SCHECK_PARTIAL(); 4480 break; 4481 } 4482 switch(*Feptr) 4483 { 4484 default: goto ENDLOOP03; 4485 VSPACE_BYTE_CASES: 4486 #if PCRE2_CODE_UNIT_WIDTH != 8 4487 VSPACE_MULTIBYTE_CASES: 4488 #endif 4489 Feptr++; break; 4490 } 4491 } 4492 ENDLOOP03: 4493 break; 4494 4495 case OP_NOT_DIGIT: 4496 for (i = Lmin; i < Lmax; i++) 4497 { 4498 if (Feptr >= mb->end_subject) 4499 { 4500 SCHECK_PARTIAL(); 4501 break; 4502 } 4503 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) 4504 break; 4505 Feptr++; 4506 } 4507 break; 4508 4509 case OP_DIGIT: 4510 for (i = Lmin; i < Lmax; i++) 4511 { 4512 if (Feptr >= mb->end_subject) 4513 { 4514 SCHECK_PARTIAL(); 4515 break; 4516 } 4517 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) 4518 break; 4519 Feptr++; 4520 } 4521 break; 4522 4523 case OP_NOT_WHITESPACE: 4524 for (i = Lmin; i < Lmax; i++) 4525 { 4526 if (Feptr >= mb->end_subject) 4527 { 4528 SCHECK_PARTIAL(); 4529 break; 4530 } 4531 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) 4532 
break; 4533 Feptr++; 4534 } 4535 break; 4536 4537 case OP_WHITESPACE: 4538 for (i = Lmin; i < Lmax; i++) 4539 { 4540 if (Feptr >= mb->end_subject) 4541 { 4542 SCHECK_PARTIAL(); 4543 break; 4544 } 4545 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) 4546 break; 4547 Feptr++; 4548 } 4549 break; 4550 4551 case OP_NOT_WORDCHAR: 4552 for (i = Lmin; i < Lmax; i++) 4553 { 4554 if (Feptr >= mb->end_subject) 4555 { 4556 SCHECK_PARTIAL(); 4557 break; 4558 } 4559 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) 4560 break; 4561 Feptr++; 4562 } 4563 break; 4564 4565 case OP_WORDCHAR: 4566 for (i = Lmin; i < Lmax; i++) 4567 { 4568 if (Feptr >= mb->end_subject) 4569 { 4570 SCHECK_PARTIAL(); 4571 break; 4572 } 4573 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) 4574 break; 4575 Feptr++; 4576 } 4577 break; 4578 4579 default: 4580 return PCRE2_ERROR_INTERNAL; 4581 } 4582 4583 if (reptype == REPTYPE_POS) continue; /* No backtracking */ 4584 4585 for (;;) 4586 { 4587 if (Feptr == Lstart_eptr) break; 4588 RMATCH(Fecode, RM34); 4589 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4590 Feptr--; 4591 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF && 4592 Feptr[-1] == CHAR_CR) Feptr--; 4593 } 4594 } 4595 } 4596 break; /* End of repeat character type processing */ 4597 4598 #undef Lstart_eptr 4599 #undef Lmin 4600 #undef Lmax 4601 #undef Lctype 4602 #undef Lpropvalue 4603 4604 4605 /* ===================================================================== */ 4606 /* Match a back reference, possibly repeatedly. Look past the end of the 4607 item to see if there is repeat information following. The OP_REF and 4608 OP_REFI opcodes are used for a reference to a numbered group or to a 4609 non-duplicated named group. For a duplicated named group, OP_DNREF and 4610 OP_DNREFI are used. In this case we must scan the list of groups to which 4611 the name refers, and use the first one that is set. 
*/ 4612 4613 #define Lmin F->temp_32[0] 4614 #define Lmax F->temp_32[1] 4615 #define Lcaseless F->temp_32[2] 4616 #define Lstart F->temp_sptr[0] 4617 #define Loffset F->temp_size 4618 4619 case OP_DNREF: 4620 case OP_DNREFI: 4621 Lcaseless = (Fop == OP_DNREFI); 4622 { 4623 int count = GET2(Fecode, 1+IMM2_SIZE); 4624 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; 4625 Fecode += 1 + 2*IMM2_SIZE; 4626 4627 while (count-- > 0) 4628 { 4629 Loffset = (GET2(slot, 0) << 1) - 2; 4630 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break; 4631 slot += mb->name_entry_size; 4632 } 4633 } 4634 goto REF_REPEAT; 4635 4636 case OP_REF: 4637 case OP_REFI: 4638 Lcaseless = (Fop == OP_REFI); 4639 Loffset = (GET2(Fecode, 1) << 1) - 2; 4640 Fecode += 1 + IMM2_SIZE; 4641 4642 /* Set up for repetition, or handle the non-repeated case. The maximum and 4643 minimum must be in the heap frame, but as they are short-term values, we 4644 use temporary fields. */ 4645 4646 REF_REPEAT: 4647 switch (*Fecode) 4648 { 4649 case OP_CRSTAR: 4650 case OP_CRMINSTAR: 4651 case OP_CRPLUS: 4652 case OP_CRMINPLUS: 4653 case OP_CRQUERY: 4654 case OP_CRMINQUERY: 4655 fc = *Fecode++ - OP_CRSTAR; 4656 Lmin = rep_min[fc]; 4657 Lmax = rep_max[fc]; 4658 reptype = rep_typ[fc]; 4659 break; 4660 4661 case OP_CRRANGE: 4662 case OP_CRMINRANGE: 4663 Lmin = GET2(Fecode, 1); 4664 Lmax = GET2(Fecode, 1 + IMM2_SIZE); 4665 reptype = rep_typ[*Fecode - OP_CRSTAR]; 4666 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ 4667 Fecode += 1 + 2 * IMM2_SIZE; 4668 break; 4669 4670 default: /* No repeat follows */ 4671 { 4672 rrc = match_ref(Loffset, Lcaseless, F, mb, &length); 4673 if (rrc != 0) 4674 { 4675 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ 4676 CHECK_PARTIAL(); 4677 RRETURN(MATCH_NOMATCH); 4678 } 4679 } 4680 Feptr += length; 4681 continue; /* With the main loop */ 4682 } 4683 4684 /* Handle repeated back references. 
If a set group has length zero, just 4685 continue with the main loop, because it matches however many times. For an 4686 unset reference, if the minimum is zero, we can also just continue. We can 4687 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes unset 4688 group behave as a zero-length group. For any other unset cases, carrying 4689 on will result in NOMATCH. */ 4690 4691 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) 4692 { 4693 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue; 4694 } 4695 else /* Group is not set */ 4696 { 4697 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) 4698 continue; 4699 } 4700 4701 /* First, ensure the minimum number of matches are present. */ 4702 4703 for (i = 1; i <= Lmin; i++) 4704 { 4705 PCRE2_SIZE slength; 4706 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); 4707 if (rrc != 0) 4708 { 4709 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ 4710 CHECK_PARTIAL(); 4711 RRETURN(MATCH_NOMATCH); 4712 } 4713 Feptr += slength; 4714 } 4715 4716 /* If min = max, we are done. They are not both allowed to be zero. */ 4717 4718 if (Lmin == Lmax) continue; 4719 4720 /* If minimizing, keep trying and advancing the pointer. */ 4721 4722 if (reptype == REPTYPE_MIN) 4723 { 4724 for (;;) 4725 { 4726 PCRE2_SIZE slength; 4727 RMATCH(Fecode, RM20); 4728 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4729 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); 4730 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); 4731 if (rrc != 0) 4732 { 4733 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ 4734 CHECK_PARTIAL(); 4735 RRETURN(MATCH_NOMATCH); 4736 } 4737 Feptr += slength; 4738 } 4739 /* Control never gets here */ 4740 } 4741 4742 /* If maximizing, find the longest string and work backwards, as long as 4743 the matched lengths for each iteration are the same. 
*/ 4744 4745 else 4746 { 4747 BOOL samelengths = TRUE; 4748 Lstart = Feptr; /* Starting position */ 4749 Flength = Fovector[Loffset+1] - Fovector[Loffset]; 4750 4751 for (i = Lmin; i < Lmax; i++) 4752 { 4753 PCRE2_SIZE slength; 4754 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); 4755 if (rrc != 0) 4756 { 4757 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in 4758 the soft partial matching case. */ 4759 4760 if (rrc > 0 && mb->partial != 0 && 4761 mb->end_subject > mb->start_used_ptr) 4762 { 4763 mb->hitend = TRUE; 4764 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 4765 } 4766 break; 4767 } 4768 4769 if (slength != Flength) samelengths = FALSE; 4770 Feptr += slength; 4771 } 4772 4773 /* If the length matched for each repetition is the same as the length of 4774 the captured group, we can easily work backwards. This is the normal 4775 case. However, in caseless UTF-8 mode there are pairs of case-equivalent 4776 characters whose lengths (in terms of code units) differ. However, this 4777 is very rare, so we handle it by re-matching fewer and fewer times. */ 4778 4779 if (samelengths) 4780 { 4781 while (Feptr >= Lstart) 4782 { 4783 RMATCH(Fecode, RM21); 4784 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4785 Feptr -= Flength; 4786 } 4787 } 4788 4789 /* The rare case of non-matching lengths. Re-scan the repetition for each 4790 iteration. We know that match_ref() will succeed every time. 
*/ 4791 4792 else 4793 { 4794 Lmax = i; 4795 for (;;) 4796 { 4797 RMATCH(Fecode, RM22); 4798 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4799 if (Feptr == Lstart) break; /* Failed after minimal repetition */ 4800 Feptr = Lstart; 4801 Lmax--; 4802 for (i = Lmin; i < Lmax; i++) 4803 { 4804 PCRE2_SIZE slength; 4805 (void)match_ref(Loffset, Lcaseless, F, mb, &slength); 4806 Feptr += slength; 4807 } 4808 } 4809 } 4810 4811 RRETURN(MATCH_NOMATCH); 4812 } 4813 /* Control never gets here */ 4814 4815 #undef Lcaseless 4816 #undef Lmin 4817 #undef Lmax 4818 #undef Lstart 4819 #undef Loffset 4820 4821 4822 4823 /* ========================================================================= */ 4824 /* Opcodes for the start of various parenthesized items */ 4825 /* ========================================================================= */ 4826 4827 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the 4828 (*THEN) is within the current branch by comparing the address of OP_THEN 4829 that is passed back with the end of the branch. If (*THEN) is within the 4830 current branch, and the branch is one of two or more alternatives (it 4831 either starts or ends with OP_ALT), we have reached the limit of THEN's 4832 action, so convert the return code to NOMATCH, which will cause normal 4833 backtracking to happen from now on. Otherwise, THEN is passed back to an 4834 outer alternative. This implements Perl's treatment of parenthesized 4835 groups, where a group not containing | does not affect the current 4836 alternative, that is, (X) is NOT the same as (X|(*F)). */ 4837 4838 4839 /* ===================================================================== */ 4840 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive 4841 bracket group, indicating that it may occur zero times. It may repeat 4842 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in 4843 the pattern. 
Brackets with fixed upper repeat limits are compiled as a 4844 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. 4845 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */ 4846 4847 #define Lnext_ecode F->temp_sptr[0] 4848 4849 case OP_BRAZERO: 4850 Lnext_ecode = Fecode + 1; 4851 RMATCH(Lnext_ecode, RM9); 4852 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4853 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); 4854 Fecode = Lnext_ecode + 1 + LINK_SIZE; 4855 break; 4856 4857 case OP_BRAMINZERO: 4858 Lnext_ecode = Fecode + 1; 4859 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); 4860 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); 4861 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4862 Fecode++; 4863 break; 4864 4865 #undef Lnext_ecode 4866 4867 case OP_SKIPZERO: 4868 Fecode++; 4869 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); 4870 Fecode += 1 + LINK_SIZE; 4871 break; 4872 4873 4874 /* ===================================================================== */ 4875 /* Handle possessive brackets with an unlimited repeat. The end of these 4876 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without 4877 going further in the pattern. 
*/ 4878 4879 #define Lframe_type F->temp_32[0] 4880 #define Lmatched_once F->temp_32[1] 4881 #define Lzero_allowed F->temp_32[2] 4882 #define Lstart_eptr F->temp_sptr[0] 4883 #define Lstart_group F->temp_sptr[1] 4884 4885 case OP_BRAPOSZERO: 4886 Lzero_allowed = TRUE; /* Zero repeat is allowed */ 4887 Fecode += 1; 4888 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS) 4889 goto POSSESSIVE_CAPTURE; 4890 goto POSSESSIVE_NON_CAPTURE; 4891 4892 case OP_BRAPOS: 4893 case OP_SBRAPOS: 4894 Lzero_allowed = FALSE; /* Zero repeat not allowed */ 4895 4896 POSSESSIVE_NON_CAPTURE: 4897 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */ 4898 goto POSSESSIVE_GROUP; 4899 4900 case OP_CBRAPOS: 4901 case OP_SCBRAPOS: 4902 Lzero_allowed = FALSE; /* Zero repeat not allowed */ 4903 4904 POSSESSIVE_CAPTURE: 4905 number = GET2(Fecode, 1+LINK_SIZE); 4906 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */ 4907 4908 POSSESSIVE_GROUP: 4909 Lmatched_once = FALSE; /* Never matched */ 4910 Lstart_group = Fecode; /* Start of this group */ 4911 4912 for (;;) 4913 { 4914 Lstart_eptr = Feptr; /* Position at group start */ 4915 group_frame_type = Lframe_type; 4916 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8); 4917 if (rrc == MATCH_KETRPOS) 4918 { 4919 Lmatched_once = TRUE; /* Matched at least once */ 4920 if (Feptr == Lstart_eptr) /* Empty match; skip to end */ 4921 { 4922 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); 4923 break; 4924 } 4925 4926 Fecode = Lstart_group; 4927 continue; 4928 } 4929 4930 /* See comment above about handling THEN. 
*/ 4931 4932 if (rrc == MATCH_THEN) 4933 { 4934 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); 4935 if (mb->verb_ecode_ptr < next_ecode && 4936 (*Fecode == OP_ALT || *next_ecode == OP_ALT)) 4937 rrc = MATCH_NOMATCH; 4938 } 4939 4940 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4941 Fecode += GET(Fecode, 1); 4942 if (*Fecode != OP_ALT) break; 4943 } 4944 4945 /* Success if matched something or zero repeat allowed */ 4946 4947 if (Lmatched_once || Lzero_allowed) 4948 { 4949 Fecode += 1 + LINK_SIZE; 4950 break; 4951 } 4952 4953 RRETURN(MATCH_NOMATCH); 4954 4955 #undef Lmatched_once 4956 #undef Lzero_allowed 4957 #undef Lframe_type 4958 #undef Lstart_eptr 4959 #undef Lstart_group 4960 4961 4962 /* ===================================================================== */ 4963 /* Handle non-capturing brackets that cannot match an empty string. When we 4964 get to the final alternative within the brackets, as long as there are no 4965 THEN's in the pattern, we can optimize by not recording a new backtracking 4966 point. (Ideally we should test for a THEN within this group, but we don't 4967 have that information.) Don't do this if we are at the very top level, 4968 however, because that would make handling assertions and once-only brackets 4969 messier when there is nothing to go back to. */ 4970 4971 #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */ 4972 #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */ 4973 4974 case OP_BRA: 4975 if (mb->hasthen || Frdepth == 0) 4976 { 4977 Lframe_type = 0; 4978 goto GROUPLOOP; 4979 } 4980 4981 for (;;) 4982 { 4983 Lnext_branch = Fecode + GET(Fecode, 1); 4984 if (*Lnext_branch != OP_ALT) break; 4985 4986 /* This is never the final branch. We do not need to test for MATCH_THEN 4987 here because this code is not used when there is a THEN in the pattern. 
*/ 4988 4989 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1); 4990 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4991 Fecode = Lnext_branch; 4992 } 4993 4994 /* Hit the start of the final branch. Continue at this level. */ 4995 4996 Fecode += PRIV(OP_lengths)[*Fecode]; 4997 break; 4998 4999 #undef Lnext_branch 5000 5001 5002 /* ===================================================================== */ 5003 /* Handle a capturing bracket, other than those that are possessive with an 5004 unlimited repeat. */ 5005 5006 case OP_CBRA: 5007 case OP_SCBRA: 5008 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE); 5009 goto GROUPLOOP; 5010 5011 5012 /* ===================================================================== */ 5013 /* Atomic groups and non-capturing brackets that can match an empty string 5014 must record a backtracking point and also set up a chained frame. */ 5015 5016 case OP_ONCE: 5017 case OP_SBRA: 5018 Lframe_type = GF_NOCAPTURE | Fop; 5019 5020 GROUPLOOP: 5021 for (;;) 5022 { 5023 group_frame_type = Lframe_type; 5024 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2); 5025 if (rrc == MATCH_THEN) 5026 { 5027 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); 5028 if (mb->verb_ecode_ptr < next_ecode && 5029 (*Fecode == OP_ALT || *next_ecode == OP_ALT)) 5030 rrc = MATCH_NOMATCH; 5031 } 5032 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5033 Fecode += GET(Fecode, 1); 5034 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); 5035 } 5036 /* Control never reaches here. */ 5037 5038 #undef Lframe_type 5039 5040 5041 /* ===================================================================== */ 5042 /* Recursion either matches the current regex, or some subexpression. The 5043 offset data is the offset to the starting bracket from the start of the 5044 whole pattern. (This is so that it works from duplicated subpatterns.) 
*/ 5045 5046 #define Lframe_type F->temp_32[0] 5047 #define Lstart_branch F->temp_sptr[0] 5048 5049 case OP_RECURSE: 5050 bracode = mb->start_code + GET(Fecode, 1); 5051 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE); 5052 5053 /* If we are already in a recursion, check for repeating the same one 5054 without advancing the subject pointer. This should catch convoluted mutual 5055 recursions. (Some simple cases are caught at compile time.) */ 5056 5057 if (Fcurrent_recurse != RECURSE_UNSET) 5058 { 5059 offset = Flast_group_offset; 5060 while (offset != PCRE2_UNSET) 5061 { 5062 N = (heapframe *)((char *)mb->match_frames + offset); 5063 P = (heapframe *)((char *)N - frame_size); 5064 if (N->group_frame_type == (GF_RECURSE | number)) 5065 { 5066 if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP; 5067 break; 5068 } 5069 offset = P->last_group_offset; 5070 } 5071 } 5072 5073 /* Now run the recursion, branch by branch. */ 5074 5075 Lstart_branch = bracode; 5076 Lframe_type = GF_RECURSE | number; 5077 5078 for (;;) 5079 { 5080 PCRE2_SPTR next_ecode; 5081 5082 group_frame_type = Lframe_type; 5083 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11); 5084 next_ecode = Lstart_branch + GET(Lstart_branch,1); 5085 5086 /* Handle backtracking verbs, which are defined in a range that can 5087 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to 5088 escape beyond a recursion; they cause a NOMATCH for the entire recursion. 5089 5090 When one of these verbs triggers, the current recursion group number is 5091 recorded. If it matches the recursion we are processing, the verb 5092 happened within the recursion and we must deal with it. Otherwise it must 5093 have happened after the recursion completed, and so has to be passed 5094 back. See comment above about handling THEN. 
*/ 5095 5096 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX && 5097 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE)) 5098 { 5099 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode && 5100 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT)) 5101 rrc = MATCH_NOMATCH; 5102 else RRETURN(MATCH_NOMATCH); 5103 } 5104 5105 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the 5106 OP_ACCEPT code. Nothing needs to be done here. */ 5107 5108 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5109 Lstart_branch = next_ecode; 5110 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH); 5111 } 5112 /* Control never reaches here. */ 5113 5114 #undef Lframe_type 5115 #undef Lstart_branch 5116 5117 5118 /* ===================================================================== */ 5119 /* Positive assertions are like other groups except that PCRE doesn't allow 5120 the effect of (*THEN) to escape beyond an assertion; it is therefore 5121 treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its 5122 captures and mark retained. Any other return is an error. 
*/ 5123 5124 #define Lframe_type F->temp_32[0] 5125 5126 case OP_ASSERT: 5127 case OP_ASSERTBACK: 5128 Lframe_type = GF_NOCAPTURE | Fop; 5129 for (;;) 5130 { 5131 group_frame_type = Lframe_type; 5132 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); 5133 if (rrc == MATCH_ACCEPT) 5134 { 5135 memcpy(Fovector, 5136 (char *)assert_accept_frame + offsetof(heapframe, ovector), 5137 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); 5138 Foffset_top = assert_accept_frame->offset_top; 5139 Fmark = assert_accept_frame->mark; 5140 break; 5141 } 5142 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 5143 Fecode += GET(Fecode, 1); 5144 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); 5145 } 5146 5147 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); 5148 Fecode += 1 + LINK_SIZE; 5149 break; 5150 5151 #undef Lframe_type 5152 5153 5154 /* ===================================================================== */ 5155 /* Handle negative assertions. Loop for each non-matching branch as for 5156 positive assertions. */ 5157 5158 #define Lframe_type F->temp_32[0] 5159 5160 case OP_ASSERT_NOT: 5161 case OP_ASSERTBACK_NOT: 5162 Lframe_type = GF_NOCAPTURE | Fop; 5163 5164 for (;;) 5165 { 5166 group_frame_type = Lframe_type; 5167 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4); 5168 switch(rrc) 5169 { 5170 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */ 5171 case MATCH_MATCH: 5172 RRETURN (MATCH_NOMATCH); 5173 5174 case MATCH_NOMATCH: /* Branch failed, try next if present. */ 5175 case MATCH_THEN: 5176 Fecode += GET(Fecode, 1); 5177 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED; 5178 break; 5179 5180 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. 
*/ 5181 case MATCH_SKIP: 5182 case MATCH_PRUNE: 5183 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); 5184 goto ASSERT_NOT_FAILED; 5185 5186 default: /* Pass back any other return */ 5187 RRETURN(rrc); 5188 } 5189 } 5190 5191 /* None of the branches have matched or there was a backtrack to (*COMMIT), 5192 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a 5193 negative assertion, so carry on. */ 5194 5195 ASSERT_NOT_FAILED: 5196 Fecode += 1 + LINK_SIZE; 5197 break; 5198 5199 #undef Lframe_type 5200 5201 5202 /* ===================================================================== */ 5203 /* The callout item calls an external function, if one is provided, passing 5204 details of the match so far. This is mainly for debugging, though the 5205 function is able to force a failure. */ 5206 5207 case OP_CALLOUT: 5208 case OP_CALLOUT_STR: 5209 rrc = do_callout(F, mb, &length); 5210 if (rrc > 0) RRETURN(MATCH_NOMATCH); 5211 if (rrc < 0) RRETURN(rrc); 5212 Fecode += length; 5213 break; 5214 5215 5216 /* ===================================================================== */ 5217 /* Conditional group: compilation checked that there are no more than two 5218 branches. If the condition is false, skipping the first branch takes us 5219 past the end of the item if there is only one branch, but that's exactly 5220 what we want. */ 5221 5222 case OP_COND: 5223 case OP_SCOND: 5224 5225 /* The variable Flength will be added to Fecode when the condition is 5226 false, to get to the second branch. Setting it to the offset to the ALT or 5227 KET, then incrementing Fecode achieves this effect. However, if the second 5228 branch is non-existent, we must point to the KET so that the end of the 5229 group is correctly processed. We now have Fecode pointing to the condition 5230 or callout. 
*/ 5231 5232 Flength = GET(Fecode, 1); /* Offset to the second branch */ 5233 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE; 5234 Fecode += 1 + LINK_SIZE; /* From this opcode */ 5235 5236 /* Because of the way auto-callout works during compile, a callout item is 5237 inserted between OP_COND and an assertion condition. Such a callout can 5238 also be inserted manually. */ 5239 5240 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR) 5241 { 5242 rrc = do_callout(F, mb, &length); 5243 if (rrc > 0) RRETURN(MATCH_NOMATCH); 5244 if (rrc < 0) RRETURN(rrc); 5245 5246 /* Advance Fecode past the callout, so it now points to the condition. We 5247 must adjust Flength so that the value of Fecode+Flength is unchanged. */ 5248 5249 Fecode += length; 5250 Flength -= length; 5251 } 5252 5253 /* Test the various possible conditions */ 5254 5255 condition = FALSE; 5256 switch(*Fecode) 5257 { 5258 case OP_RREF: /* Group recursion test */ 5259 if (Fcurrent_recurse != RECURSE_UNSET) 5260 { 5261 number = GET2(Fecode, 1); 5262 condition = (number == RREF_ANY || number == Fcurrent_recurse); 5263 } 5264 break; 5265 5266 case OP_DNRREF: /* Duplicate named group recursion test */ 5267 if (Fcurrent_recurse != RECURSE_UNSET) 5268 { 5269 int count = GET2(Fecode, 1 + IMM2_SIZE); 5270 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; 5271 while (count-- > 0) 5272 { 5273 number = GET2(slot, 0); 5274 condition = number == Fcurrent_recurse; 5275 if (condition) break; 5276 slot += mb->name_entry_size; 5277 } 5278 } 5279 break; 5280 5281 case OP_CREF: /* Numbered group used test */ 5282 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */ 5283 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; 5284 break; 5285 5286 case OP_DNCREF: /* Duplicate named group used test */ 5287 { 5288 int count = GET2(Fecode, 1 + IMM2_SIZE); 5289 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; 5290 while (count-- > 0) 5291 { 5292 
offset = (GET2(slot, 0) << 1) - 2; 5293 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; 5294 if (condition) break; 5295 slot += mb->name_entry_size; 5296 } 5297 } 5298 break; 5299 5300 case OP_FALSE: 5301 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ 5302 break; 5303 5304 case OP_TRUE: 5305 condition = TRUE; 5306 break; 5307 5308 /* The condition is an assertion. Run code similar to the assertion code 5309 above. */ 5310 5311 #define Lpositive F->temp_32[0] 5312 #define Lstart_branch F->temp_sptr[0] 5313 5314 default: 5315 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK); 5316 Lstart_branch = Fecode; 5317 5318 for (;;) 5319 { 5320 group_frame_type = GF_CONDASSERT | *Fecode; 5321 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5); 5322 5323 switch(rrc) 5324 { 5325 case MATCH_ACCEPT: /* Save captures */ 5326 memcpy(Fovector, 5327 (char *)assert_accept_frame + offsetof(heapframe, ovector), 5328 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); 5329 Foffset_top = assert_accept_frame->offset_top; 5330 5331 /* Fall through */ 5332 /* In the case of a match, the captures have already been put into 5333 the current frame. */ 5334 5335 case MATCH_MATCH: 5336 condition = Lpositive; /* TRUE for positive assertion */ 5337 break; 5338 5339 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an 5340 assertion; it is therefore always treated as NOMATCH. */ 5341 5342 case MATCH_NOMATCH: 5343 case MATCH_THEN: 5344 Lstart_branch += GET(Lstart_branch, 1); 5345 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */ 5346 condition = !Lpositive; /* TRUE for negative assertion */ 5347 break; 5348 5349 /* These force no match without checking other branches. 
*/ 5350 5351 case MATCH_COMMIT: 5352 case MATCH_SKIP: 5353 case MATCH_PRUNE: 5354 condition = !Lpositive; 5355 break; 5356 5357 default: 5358 RRETURN(rrc); 5359 } 5360 break; /* Out of the branch loop */ 5361 } 5362 5363 /* If the condition is true, find the end of the assertion so that 5364 advancing past it gets us to the start of the first branch. */ 5365 5366 if (condition) 5367 { 5368 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); 5369 } 5370 break; /* End of assertion condition */ 5371 } 5372 5373 #undef Lpositive 5374 #undef Lstart_branch 5375 5376 /* Choose branch according to the condition. */ 5377 5378 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength; 5379 5380 /* If the opcode is OP_SCOND it means we are at a repeated conditional 5381 group that might match an empty string. We must therefore descend a level 5382 so that the start is remembered for checking. For OP_COND we can just 5383 continue at this level. */ 5384 5385 if (Fop == OP_SCOND) 5386 { 5387 group_frame_type = GF_NOCAPTURE | Fop; 5388 RMATCH(Fecode, RM35); 5389 RRETURN(rrc); 5390 } 5391 break; 5392 5393 5394 5395 /* ========================================================================= */ 5396 /* End of start of parenthesis opcodes */ 5397 /* ========================================================================= */ 5398 5399 5400 /* ===================================================================== */ 5401 /* Move the subject pointer back. This occurs only at the start of each 5402 branch of a lookbehind assertion. If we are too close to the start to move 5403 back, fail. When working with UTF-8 we move back a number of characters, 5404 not bytes. 
*/ 5405 5406 case OP_REVERSE: 5407 number = GET(Fecode, 1); 5408 #ifdef SUPPORT_UNICODE 5409 if (utf) 5410 { 5411 while (number-- > 0) 5412 { 5413 if (Feptr <= mb->start_subject) RRETURN(MATCH_NOMATCH); 5414 Feptr--; 5415 BACKCHAR(Feptr); 5416 } 5417 } 5418 else 5419 #endif 5420 5421 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 5422 5423 { 5424 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); 5425 Feptr -= number; 5426 } 5427 5428 /* Save the earliest consulted character, then skip to next opcode */ 5429 5430 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; 5431 Fecode += 1 + LINK_SIZE; 5432 break; 5433 5434 5435 /* ===================================================================== */ 5436 /* An alternation is the end of a branch; scan along to find the end of the 5437 bracketed group. */ 5438 5439 case OP_ALT: 5440 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); 5441 break; 5442 5443 5444 /* ===================================================================== */ 5445 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the 5446 starting frame was added to the chained frames in order to remember the 5447 starting subject position for the group. */ 5448 5449 case OP_KET: 5450 case OP_KETRMIN: 5451 case OP_KETRMAX: 5452 case OP_KETRPOS: 5453 5454 bracode = Fecode - GET(Fecode, 1); 5455 5456 /* Point N to the frame at the start of the most recent group. 5457 Remember the subject pointer at the start of the group. 
*/ 5458 5459 if (*bracode != OP_BRA && *bracode != OP_COND) 5460 { 5461 N = (heapframe *)((char *)mb->match_frames + Flast_group_offset); 5462 P = (heapframe *)((char *)N - frame_size); 5463 Flast_group_offset = P->last_group_offset; 5464 5465 #ifdef DEBUG_SHOW_RMATCH 5466 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n", 5467 N->rdepth, N->group_frame_type, 5468 (char *)P->eptr - (char *)mb->start_subject); 5469 #endif 5470 5471 /* If we are at the end of an assertion that is a condition, return a 5472 match, discarding any intermediate backtracking points. Copy back the 5473 captures into the frame before N so that they are set on return. Doing 5474 this for all assertions, both positive and negative, seems to match what 5475 Perl does. */ 5476 5477 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) 5478 { 5479 memcpy((char *)P + offsetof(heapframe, ovector), Fovector, 5480 Foffset_top * sizeof(PCRE2_SIZE)); 5481 P->offset_top = Foffset_top; 5482 Fback_frame = (char *)F - (char *)P; 5483 RRETURN(MATCH_MATCH); 5484 } 5485 } 5486 else P = NULL; /* Indicates starting frame not recorded */ 5487 5488 /* The group was not a conditional assertion. */ 5489 5490 switch (*bracode) 5491 { 5492 case OP_BRA: /* No need to do anything for these */ 5493 case OP_COND: 5494 case OP_SCOND: 5495 break; 5496 5497 /* Positive assertions are like OP_ONCE, except that in addition the 5498 subject pointer must be put back to where it was at the start of the 5499 assertion. */ 5500 5501 case OP_ASSERT: 5502 case OP_ASSERTBACK: 5503 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; 5504 Feptr = P->eptr; 5505 /* Fall through */ 5506 5507 /* For an atomic group, discard internal backtracking points. We must 5508 also ensure that any remaining branches within the top-level of the group 5509 are not tried. Do this by adjusting the code pointer within the backtrack 5510 frame so that it points to the final branch. 
*/ 5511 5512 case OP_ONCE: 5513 Fback_frame = ((char *)F - (char *)P); 5514 for (;;) 5515 { 5516 uint32_t y = GET(P->ecode,1); 5517 if ((P->ecode)[y] != OP_ALT) break; 5518 P->ecode += y; 5519 } 5520 break; 5521 5522 /* A matching negative assertion returns MATCH, which is turned into 5523 NOMATCH at the assertion level. */ 5524 5525 case OP_ASSERT_NOT: 5526 case OP_ASSERTBACK_NOT: 5527 RRETURN(MATCH_MATCH); 5528 5529 /* Whole-pattern recursion is coded as a recurse into group 0, so it 5530 won't be picked up here. Instead, we catch it when the OP_END is reached. 5531 Other recursion is handled here. */ 5532 5533 case OP_CBRA: 5534 case OP_CBRAPOS: 5535 case OP_SCBRA: 5536 case OP_SCBRAPOS: 5537 number = GET2(bracode, 1+LINK_SIZE); 5538 5539 /* Handle a recursively called group. We reinstate the previous set of 5540 captures and then carry on after the recursion call. */ 5541 5542 if (Fcurrent_recurse == number) 5543 { 5544 P = (heapframe *)((char *)N - frame_size); 5545 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector, 5546 P->offset_top * sizeof(PCRE2_SIZE)); 5547 Foffset_top = P->offset_top; 5548 Fcapture_last = P->capture_last; 5549 Fcurrent_recurse = P->current_recurse; 5550 Fecode = P->ecode + 1 + LINK_SIZE; 5551 continue; /* With next opcode */ 5552 } 5553 5554 /* Deal with actual capturing. */ 5555 5556 offset = (number << 1) - 2; 5557 Fcapture_last = number; 5558 Fovector[offset] = P->eptr - mb->start_subject; 5559 Fovector[offset+1] = Feptr - mb->start_subject; 5560 if (offset >= Foffset_top) Foffset_top = offset + 2; 5561 break; 5562 } /* End actions relating to the starting opcode */ 5563 5564 /* OP_KETRPOS is a possessive repeating ket. Remember the current position, 5565 and return the MATCH_KETRPOS. This makes it possible to do the repeats one 5566 at a time from the outer level. This must precede the empty string test - 5567 in this case that test is done at the outer level. 
*/ 5568 5569 if (*Fecode == OP_KETRPOS) 5570 { 5571 memcpy((char *)P + offsetof(heapframe, eptr), 5572 (char *)F + offsetof(heapframe, eptr), 5573 frame_copy_size); 5574 RRETURN(MATCH_KETRPOS); 5575 } 5576 5577 /* Handle the different kinds of closing brackets. A non-repeating ket 5578 needs no special action, just continuing at this level. This also happens 5579 for the repeating kets if the group matched no characters, in order to 5580 forcibly break infinite loops. Otherwise, the repeating kets try the rest 5581 of the pattern or restart from the preceding bracket, in the appropriate 5582 order. */ 5583 5584 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr)) 5585 { 5586 if (Fop == OP_KETRMIN) 5587 { 5588 RMATCH(Fecode + 1 + LINK_SIZE, RM6); 5589 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5590 Fecode -= GET(Fecode, 1); 5591 break; /* End of ket processing */ 5592 } 5593 5594 /* Repeat the maximum number of times (KETRMAX) */ 5595 5596 RMATCH(bracode, RM7); 5597 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5598 } 5599 5600 /* Carry on at this level for a non-repeating ket, or after matching an 5601 empty string, or after repeating for a maximum number of times. */ 5602 5603 Fecode += 1 + LINK_SIZE; 5604 break; 5605 5606 5607 /* ===================================================================== */ 5608 /* Start and end of line assertions, not multiline mode. */ 5609 5610 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */ 5611 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0) 5612 RRETURN(MATCH_NOMATCH); 5613 Fecode++; 5614 break; 5615 5616 case OP_SOD: /* Unconditional start of subject */ 5617 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH); 5618 Fecode++; 5619 break; 5620 5621 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a 5622 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. 
*/ 5623 5624 case OP_DOLL: 5625 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); 5626 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; 5627 5628 /* Fall through */ 5629 /* Unconditional end of subject assertion (\z) */ 5630 5631 case OP_EOD: 5632 if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH); 5633 SCHECK_PARTIAL(); 5634 Fecode++; 5635 break; 5636 5637 /* End of subject or ending \n assertion (\Z) */ 5638 5639 case OP_EODN: 5640 ASSERT_NL_OR_EOS: 5641 if (Feptr < mb->end_subject && 5642 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen)) 5643 { 5644 if (mb->partial != 0 && 5645 Feptr + 1 >= mb->end_subject && 5646 NLBLOCK->nltype == NLTYPE_FIXED && 5647 NLBLOCK->nllen == 2 && 5648 UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) 5649 { 5650 mb->hitend = TRUE; 5651 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 5652 } 5653 RRETURN(MATCH_NOMATCH); 5654 } 5655 5656 /* Either at end of string or \n before end. */ 5657 5658 SCHECK_PARTIAL(); 5659 Fecode++; 5660 break; 5661 5662 5663 /* ===================================================================== */ 5664 /* Start and end of line assertions, multiline mode. */ 5665 5666 /* Start of subject unless notbol, or after any newline except for one at 5667 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ 5668 5669 case OP_CIRCM: 5670 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject) 5671 RRETURN(MATCH_NOMATCH); 5672 if (Feptr != mb->start_subject && 5673 ((Feptr == mb->end_subject && 5674 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || 5675 !WAS_NEWLINE(Feptr))) 5676 RRETURN(MATCH_NOMATCH); 5677 Fecode++; 5678 break; 5679 5680 /* Assert before any newline, or before end of subject unless noteol is 5681 set. 
*/ 5682 5683 case OP_DOLLM: 5684 if (Feptr < mb->end_subject) 5685 { 5686 if (!IS_NEWLINE(Feptr)) 5687 { 5688 if (mb->partial != 0 && 5689 Feptr + 1 >= mb->end_subject && 5690 NLBLOCK->nltype == NLTYPE_FIXED && 5691 NLBLOCK->nllen == 2 && 5692 UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) 5693 { 5694 mb->hitend = TRUE; 5695 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; 5696 } 5697 RRETURN(MATCH_NOMATCH); 5698 } 5699 } 5700 else 5701 { 5702 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); 5703 SCHECK_PARTIAL(); 5704 } 5705 Fecode++; 5706 break; 5707 5708 5709 /* ===================================================================== */ 5710 /* Start of match assertion */ 5711 5712 case OP_SOM: 5713 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); 5714 Fecode++; 5715 break; 5716 5717 5718 /* ===================================================================== */ 5719 /* Reset the start of match point */ 5720 5721 case OP_SET_SOM: 5722 Fstart_match = Feptr; 5723 Fecode++; 5724 break; 5725 5726 5727 /* ===================================================================== */ 5728 /* Word boundary assertions. Find out if the previous and current 5729 characters are "word" characters. It takes a bit more work in UTF mode. 5730 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is 5731 not set. When it is set, use Unicode properties if available, even when not 5732 in UTF mode. Remember the earliest and latest consulted characters. 
*/ 5733 5734 case OP_NOT_WORD_BOUNDARY: 5735 case OP_WORD_BOUNDARY: 5736 if (Feptr == mb->start_subject) prev_is_word = FALSE; else 5737 { 5738 PCRE2_SPTR lastptr = Feptr - 1; 5739 #ifdef SUPPORT_UNICODE 5740 if (utf) 5741 { 5742 BACKCHAR(lastptr); 5743 GETCHAR(fc, lastptr); 5744 } 5745 else 5746 #endif /* SUPPORT_UNICODE */ 5747 fc = *lastptr; 5748 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; 5749 #ifdef SUPPORT_UNICODE 5750 if ((mb->poptions & PCRE2_UCP) != 0) 5751 { 5752 if (fc == '_') prev_is_word = TRUE; else 5753 { 5754 int cat = UCD_CATEGORY(fc); 5755 prev_is_word = (cat == ucp_L || cat == ucp_N); 5756 } 5757 } 5758 else 5759 #endif /* SUPPORT_UNICODE */ 5760 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; 5761 } 5762 5763 /* Get status of next character */ 5764 5765 if (Feptr >= mb->end_subject) 5766 { 5767 SCHECK_PARTIAL(); 5768 cur_is_word = FALSE; 5769 } 5770 else 5771 { 5772 PCRE2_SPTR nextptr = Feptr + 1; 5773 #ifdef SUPPORT_UNICODE 5774 if (utf) 5775 { 5776 FORWARDCHARTEST(nextptr, mb->end_subject); 5777 GETCHAR(fc, Feptr); 5778 } 5779 else 5780 #endif /* SUPPORT_UNICODE */ 5781 fc = *Feptr; 5782 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; 5783 #ifdef SUPPORT_UNICODE 5784 if ((mb->poptions & PCRE2_UCP) != 0) 5785 { 5786 if (fc == '_') cur_is_word = TRUE; else 5787 { 5788 int cat = UCD_CATEGORY(fc); 5789 cur_is_word = (cat == ucp_L || cat == ucp_N); 5790 } 5791 } 5792 else 5793 #endif /* SUPPORT_UNICODE */ 5794 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; 5795 } 5796 5797 /* Now see if the situation is what we want */ 5798 5799 if ((*Fecode++ == OP_WORD_BOUNDARY)? 5800 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 5801 RRETURN(MATCH_NOMATCH); 5802 break; 5803 5804 5805 /* ===================================================================== */ 5806 /* Backtracking (*VERB)s, with and without arguments. 
Note that if the 5807 pattern is successfully matched, we do not come back from RMATCH. */ 5808 5809 case OP_MARK: 5810 Fmark = mb->nomatch_mark = Fecode + 2; 5811 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12); 5812 5813 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an 5814 argument, and we must check whether that argument matches this MARK's 5815 argument. It is passed back in mb->verb_skip_ptr. If it does match, we 5816 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject 5817 position that corresponds to this mark. Otherwise, pass back the return 5818 code unaltered. */ 5819 5820 if (rrc == MATCH_SKIP_ARG && 5821 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0) 5822 { 5823 mb->verb_skip_ptr = Feptr; /* Pass back current position */ 5824 RRETURN(MATCH_SKIP); 5825 } 5826 RRETURN(rrc); 5827 5828 case OP_FAIL: 5829 RRETURN(MATCH_NOMATCH); 5830 5831 /* Record the current recursing group number in mb->verb_current_recurse 5832 when a backtracking return such as MATCH_COMMIT is given. This enables the 5833 recurse processing to catch verbs from within the recursion. 
*/ 5834 5835 case OP_COMMIT: 5836 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13); 5837 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5838 mb->verb_current_recurse = Fcurrent_recurse; 5839 RRETURN(MATCH_COMMIT); 5840 5841 case OP_COMMIT_ARG: 5842 Fmark = mb->nomatch_mark = Fecode + 2; 5843 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); 5844 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5845 mb->verb_current_recurse = Fcurrent_recurse; 5846 RRETURN(MATCH_COMMIT); 5847 5848 case OP_PRUNE: 5849 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); 5850 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5851 mb->verb_current_recurse = Fcurrent_recurse; 5852 RRETURN(MATCH_PRUNE); 5853 5854 case OP_PRUNE_ARG: 5855 Fmark = mb->nomatch_mark = Fecode + 2; 5856 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15); 5857 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5858 mb->verb_current_recurse = Fcurrent_recurse; 5859 RRETURN(MATCH_PRUNE); 5860 5861 case OP_SKIP: 5862 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16); 5863 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5864 mb->verb_skip_ptr = Feptr; /* Pass back current position */ 5865 mb->verb_current_recurse = Fcurrent_recurse; 5866 RRETURN(MATCH_SKIP); 5867 5868 /* Note that, for Perl compatibility, SKIP with an argument does NOT set 5869 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was 5870 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG 5871 that failed and any that precede it (either they also failed, or were not 5872 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a 5873 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg 5874 set to the count of the one that failed. 
*/ 5875 5876 case OP_SKIP_ARG: 5877 mb->skip_arg_count++; 5878 if (mb->skip_arg_count <= mb->ignore_skip_arg) 5879 { 5880 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1]; 5881 break; 5882 } 5883 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17); 5884 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5885 5886 /* Pass back the current skip name and return the special MATCH_SKIP_ARG 5887 return code. This will either be caught by a matching MARK, or get to the 5888 top, where it causes a rematch with mb->ignore_skip_arg set to the value of 5889 mb->skip_arg_count. */ 5890 5891 mb->verb_skip_ptr = Fecode + 2; 5892 mb->verb_current_recurse = Fcurrent_recurse; 5893 RRETURN(MATCH_SKIP_ARG); 5894 5895 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that 5896 the branch in which it occurs can be determined. */ 5897 5898 case OP_THEN: 5899 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18); 5900 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5901 mb->verb_ecode_ptr = Fecode; 5902 mb->verb_current_recurse = Fcurrent_recurse; 5903 RRETURN(MATCH_THEN); 5904 5905 case OP_THEN_ARG: 5906 Fmark = mb->nomatch_mark = Fecode + 2; 5907 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19); 5908 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5909 mb->verb_ecode_ptr = Fecode; 5910 mb->verb_current_recurse = Fcurrent_recurse; 5911 RRETURN(MATCH_THEN); 5912 5913 5914 /* ===================================================================== */ 5915 /* There's been some horrible disaster. Arrival here can only mean there is 5916 something seriously wrong in the code above or the OP_xxx definitions. */ 5917 5918 default: 5919 return PCRE2_ERROR_INTERNAL; 5920 } 5921 5922 /* Do not insert any code in here without much thought; it is assumed 5923 that "continue" in the code above comes out to here to repeat the main 5924 loop. 
*/ 5925 5926 } /* End of main loop */ 5927 /* Control never reaches here */ 5928 5929 5930 /* ========================================================================= */ 5931 /* The RRETURN() macro jumps here. The number that is saved in Freturn_id 5932 indicates which label we actually want to return to. The value in Frdepth is 5933 the index number of the frame in the vector. The return value has been placed 5934 in rrc. */ 5935 5936 #define LBL(val) case val: goto L_RM##val; 5937 5938 RETURN_SWITCH: 5939 if (Frdepth == 0) return rrc; /* Exit from the top level */ 5940 F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ 5941 mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ 5942 5943 #ifdef DEBUG_SHOW_RMATCH 5944 fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id); 5945 #endif 5946 5947 switch (Freturn_id) 5948 { 5949 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 5950 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) 5951 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) 5952 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) 5953 LBL(33) LBL(34) LBL(35) LBL(36) 5954 5955 #ifdef SUPPORT_WIDE_CHARS 5956 LBL(100) LBL(101) 5957 #endif 5958 5959 #ifdef SUPPORT_UNICODE 5960 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) 5961 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) 5962 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) 5963 LBL(221) LBL(222) 5964 #endif 5965 5966 default: 5967 return PCRE2_ERROR_INTERNAL; 5968 } 5969 #undef LBL 5970 } 5971 5972 5973 /************************************************* 5974 * Match a Regular Expression * 5975 *************************************************/ 5976 5977 /* This function applies a compiled pattern to a subject string and picks out 5978 portions of the string if it matches. 
Two elements in the vector are set for 5979 each substring: the offsets to the start and end of the substring. 5980 5981 Arguments: 5982 code points to the compiled expression 5983 subject points to the subject string 5984 length length of subject string (may contain binary zeros) 5985 start_offset where to start in the subject string 5986 options option bits 5987 match_data points to a match_data block 5988 mcontext points a PCRE2 context 5989 5990 Returns: > 0 => success; value is the number of ovector pairs filled 5991 = 0 => success, but ovector is not big enough 5992 -1 => failed to match (PCRE2_ERROR_NOMATCH) 5993 -2 => partial match (PCRE2_ERROR_PARTIAL) 5994 < -2 => some kind of unexpected problem 5995 */ 5996 5997 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 5998 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, 5999 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, 6000 pcre2_match_context *mcontext) 6001 { 6002 int rc; 6003 const uint8_t *start_bits = NULL; 6004 6005 const pcre2_real_code *re = (const pcre2_real_code *)code; 6006 6007 BOOL anchored; 6008 BOOL firstline; 6009 BOOL has_first_cu = FALSE; 6010 BOOL has_req_cu = FALSE; 6011 BOOL startline; 6012 BOOL utf; 6013 6014 PCRE2_UCHAR first_cu = 0; 6015 PCRE2_UCHAR first_cu2 = 0; 6016 PCRE2_UCHAR req_cu = 0; 6017 PCRE2_UCHAR req_cu2 = 0; 6018 6019 PCRE2_SPTR bumpalong_limit; 6020 PCRE2_SPTR end_subject; 6021 PCRE2_SPTR start_match = subject + start_offset; 6022 PCRE2_SPTR req_cu_ptr = start_match - 1; 6023 PCRE2_SPTR start_partial = NULL; 6024 PCRE2_SPTR match_partial = NULL; 6025 6026 PCRE2_SIZE frame_size; 6027 6028 /* We need to have mb as a pointer to a match block, because the IS_NEWLINE 6029 macro is used below, and it expects NLBLOCK to be defined as a pointer. 
*/ 6030 6031 pcre2_callout_block cb; 6032 match_block actual_match_block; 6033 match_block *mb = &actual_match_block; 6034 6035 /* Allocate an initial vector of backtracking frames on the stack. If this 6036 proves to be too small, it is replaced by a larger one on the heap. To get a 6037 vector of the size required that is aligned for pointers, allocate it as a 6038 vector of pointers. */ 6039 6040 PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]; 6041 mb->stack_frames = (heapframe *)stack_frames_vector; 6042 6043 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated 6044 subject string. */ 6045 6046 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); 6047 end_subject = subject + length; 6048 6049 /* Plausibility checks */ 6050 6051 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; 6052 if (code == NULL || subject == NULL || match_data == NULL) 6053 return PCRE2_ERROR_NULL; 6054 if (start_offset > length) return PCRE2_ERROR_BADOFFSET; 6055 6056 /* Check that the first field in the block is the magic number. */ 6057 6058 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; 6059 6060 /* Check the code unit width. */ 6061 6062 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) 6063 return PCRE2_ERROR_BADMODE; 6064 6065 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the 6066 options variable for this function. Users of PCRE2 who are not calling the 6067 function directly would like to have a way of setting these flags, in the same 6068 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with 6069 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and 6070 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which we now 6071 transfer to the options for this function. The bits are guaranteed to be 6072 adjacent, but do not have the same values. 
This bit of Boolean trickery assumes 6073 that the match-time bits are not more significant than the flag bits. If by 6074 accident this is not the case, a compile-time division by zero error will 6075 occur. */ 6076 6077 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) 6078 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) 6079 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); 6080 #undef FF 6081 #undef OO 6082 6083 /* These two settings are used in the code for checking a UTF string that 6084 follows immediately afterwards. Other values in the mb block are used only 6085 during interpretive processing, not when the JIT support is in use, so they are 6086 set up later. */ 6087 6088 utf = (re->overall_options & PCRE2_UTF) != 0; 6089 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : 6090 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; 6091 6092 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same 6093 time. */ 6094 6095 if (mb->partial != 0 && 6096 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) 6097 return PCRE2_ERROR_BADOPTION; 6098 6099 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, 6100 we must also check that a starting offset does not point into the middle of a 6101 multiunit character. We check only the portion of the subject that is going to 6102 be inspected during matching - from the offset minus the maximum back reference 6103 to the given length. This saves time when a small part of a large subject is 6104 being matched by the use of a starting offset. Note that the maximum lookbehind 6105 is a number of characters, not code units. 
*/ 6106 6107 #ifdef SUPPORT_UNICODE 6108 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) 6109 { 6110 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ 6111 6112 if (start_offset > 0) 6113 { 6114 #if PCRE2_CODE_UNIT_WIDTH != 32 6115 unsigned int i; 6116 if (start_match < end_subject && NOT_FIRSTCU(*start_match)) 6117 return PCRE2_ERROR_BADUTFOFFSET; 6118 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) 6119 { 6120 check_subject--; 6121 while (check_subject > subject && 6122 #if PCRE2_CODE_UNIT_WIDTH == 8 6123 (*check_subject & 0xc0) == 0x80) 6124 #else /* 16-bit */ 6125 (*check_subject & 0xfc00) == 0xdc00) 6126 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ 6127 check_subject--; 6128 } 6129 #else 6130 /* In the 32-bit library, one code unit equals one character. However, 6131 we cannot just subtract the lookbehind and then compare pointers, because 6132 a very large lookbehind could create an invalid pointer. */ 6133 6134 if (start_offset >= re->max_lookbehind) 6135 check_subject -= re->max_lookbehind; 6136 else 6137 check_subject = subject; 6138 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ 6139 } 6140 6141 /* Validate the relevant portion of the subject. After an error, adjust the 6142 offset to be an absolute offset in the whole string. */ 6143 6144 match_data->rc = PRIV(valid_utf)(check_subject, 6145 length - (check_subject - subject), &(match_data->startchar)); 6146 if (match_data->rc != 0) 6147 { 6148 match_data->startchar += check_subject - subject; 6149 return match_data->rc; 6150 } 6151 } 6152 #endif /* SUPPORT_UNICODE */ 6153 6154 /* It is an error to set an offset limit without setting the flag at compile 6155 time. */ 6156 6157 if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && 6158 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) 6159 return PCRE2_ERROR_BADOFFSETLIMIT; 6160 6161 /* If the pattern was successfully studied with JIT support, run the JIT 6162 executable instead of the rest of this function. 
Most options must be set at 6163 compile time for the JIT code to be usable. Fallback to the normal code path if 6164 an unsupported option is set or if JIT returns BADOPTION (which means that the 6165 selected normal or partial matching mode was not compiled). */ 6166 6167 #ifdef SUPPORT_JIT 6168 if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0) 6169 { 6170 rc = pcre2_jit_match(code, subject, length, start_offset, options, 6171 match_data, mcontext); 6172 if (rc != PCRE2_ERROR_JIT_BADOPTION) return rc; 6173 } 6174 #endif 6175 6176 /* Carry on with non-JIT matching. A NULL match context means "use a default 6177 context", but we take the memory control functions from the pattern. */ 6178 6179 if (mcontext == NULL) 6180 { 6181 mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); 6182 mb->memctl = re->memctl; 6183 } 6184 else mb->memctl = mcontext->memctl; 6185 6186 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; 6187 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; 6188 startline = (re->flags & PCRE2_STARTLINE) != 0; 6189 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? 6190 end_subject : subject + mcontext->offset_limit; 6191 6192 /* Initialize and set up the fixed fields in the callout block, with a pointer 6193 in the match block. */ 6194 6195 mb->cb = &cb; 6196 cb.version = 2; 6197 cb.subject = subject; 6198 cb.subject_length = (PCRE2_SIZE)(end_subject - subject); 6199 cb.callout_flags = 0; 6200 6201 /* Fill in the remaining fields in the match block. 
*/ 6202 6203 mb->callout = mcontext->callout; 6204 mb->callout_data = mcontext->callout_data; 6205 6206 mb->start_subject = subject; 6207 mb->start_offset = start_offset; 6208 mb->end_subject = end_subject; 6209 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; 6210 6211 mb->moptions = options; /* Match options */ 6212 mb->poptions = re->overall_options; /* Pattern options */ 6213 6214 mb->ignore_skip_arg = 0; 6215 mb->mark = mb->nomatch_mark = NULL; /* In case never set */ 6216 mb->hitend = FALSE; 6217 6218 /* The name table is needed for finding all the numbers associated with a 6219 given name, for condition testing. The code follows the name table. */ 6220 6221 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); 6222 mb->name_count = re->name_count; 6223 mb->name_entry_size = re->name_entry_size; 6224 mb->start_code = mb->name_table + re->name_count * re->name_entry_size; 6225 6226 /* Process the \R and newline settings. */ 6227 6228 mb->bsr_convention = re->bsr_convention; 6229 mb->nltype = NLTYPE_FIXED; 6230 switch(re->newline_convention) 6231 { 6232 case PCRE2_NEWLINE_CR: 6233 mb->nllen = 1; 6234 mb->nl[0] = CHAR_CR; 6235 break; 6236 6237 case PCRE2_NEWLINE_LF: 6238 mb->nllen = 1; 6239 mb->nl[0] = CHAR_NL; 6240 break; 6241 6242 case PCRE2_NEWLINE_NUL: 6243 mb->nllen = 1; 6244 mb->nl[0] = CHAR_NUL; 6245 break; 6246 6247 case PCRE2_NEWLINE_CRLF: 6248 mb->nllen = 2; 6249 mb->nl[0] = CHAR_CR; 6250 mb->nl[1] = CHAR_NL; 6251 break; 6252 6253 case PCRE2_NEWLINE_ANY: 6254 mb->nltype = NLTYPE_ANY; 6255 break; 6256 6257 case PCRE2_NEWLINE_ANYCRLF: 6258 mb->nltype = NLTYPE_ANYCRLF; 6259 break; 6260 6261 default: return PCRE2_ERROR_INTERNAL; 6262 } 6263 6264 /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE 6265 vector at the end, whose size depends on the number of capturing parentheses in 6266 the pattern. It is not used at all if there are no capturing parentheses. 
6267 6268 frame_size is the total size of each frame 6269 mb->frame_vector_size is the total usable size of the vector (rounded down 6270 to a whole number of frames) 6271 6272 The last of these is changed within the match() function if the frame vector 6273 has to be expanded. We therefore put it into the match block so that it is 6274 correct when calling match() more than once for non-anchored patterns. */ 6275 6276 frame_size = offsetof(heapframe, ovector) + 6277 re->top_bracket * 2 * sizeof(PCRE2_SIZE); 6278 6279 /* Limits set in the pattern override the match context only if they are 6280 smaller. */ 6281 6282 mb->heap_limit = (mcontext->heap_limit < re->limit_heap)? 6283 mcontext->heap_limit : re->limit_heap; 6284 6285 mb->match_limit = (mcontext->match_limit < re->limit_match)? 6286 mcontext->match_limit : re->limit_match; 6287 6288 mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? 6289 mcontext->depth_limit : re->limit_depth; 6290 6291 /* If a pattern has very many capturing parentheses, the frame size may be very 6292 large. Ensure that there are at least 10 available frames by getting an initial 6293 vector on the heap if necessary, except when the heap limit prevents this. Get 6294 fewer if possible. (The heap limit is in kibibytes.) 
*/ 6295 6296 if (frame_size <= START_FRAMES_SIZE/10) 6297 { 6298 mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */ 6299 mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size); 6300 } 6301 else 6302 { 6303 mb->frame_vector_size = frame_size * 10; 6304 if ((mb->frame_vector_size / 1024) > mb->heap_limit) 6305 { 6306 if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT; 6307 mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size; 6308 } 6309 mb->match_frames = mb->memctl.malloc(mb->frame_vector_size, 6310 mb->memctl.memory_data); 6311 if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY; 6312 } 6313 6314 mb->match_frames_top = 6315 (heapframe *)((char *)mb->match_frames + mb->frame_vector_size); 6316 6317 /* Write to the ovector within the first frame to mark every capture unset and 6318 to avoid uninitialized memory read errors when it is copied to a new frame. */ 6319 6320 memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff, 6321 re->top_bracket * 2 * sizeof(PCRE2_SIZE)); 6322 6323 /* Pointers to the individual character tables */ 6324 6325 mb->lcc = re->tables + lcc_offset; 6326 mb->fcc = re->tables + fcc_offset; 6327 mb->ctypes = re->tables + ctypes_offset; 6328 6329 /* Set up the first code unit to match, if available. If there's no first code 6330 unit there may be a bitmap of possible first characters. 
*/ 6331 6332 if ((re->flags & PCRE2_FIRSTSET) != 0) 6333 { 6334 has_first_cu = TRUE; 6335 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); 6336 if ((re->flags & PCRE2_FIRSTCASELESS) != 0) 6337 { 6338 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); 6339 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 6340 if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); 6341 #endif 6342 } 6343 } 6344 else 6345 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) 6346 start_bits = re->start_bitmap; 6347 6348 /* There may also be a "last known required character" set. */ 6349 6350 if ((re->flags & PCRE2_LASTSET) != 0) 6351 { 6352 has_req_cu = TRUE; 6353 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); 6354 if ((re->flags & PCRE2_LASTCASELESS) != 0) 6355 { 6356 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); 6357 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 6358 if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); 6359 #endif 6360 } 6361 } 6362 6363 6364 /* ==========================================================================*/ 6365 6366 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 6367 the loop runs just once. */ 6368 6369 for(;;) 6370 { 6371 PCRE2_SPTR new_start_match; 6372 6373 /* ----------------- Start of match optimizations ---------------- */ 6374 6375 /* There are some optimizations that avoid running the match if a known 6376 starting point is not found, or if a known later code unit is not present. 6377 However, there is an option (settable at compile time) that disables these, 6378 for testing and for ensuring that all callouts do actually occur. */ 6379 6380 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) 6381 { 6382 /* If firstline is TRUE, the start of the match is constrained to the first 6383 line of a multiline string. That is, the match must be before or at the 6384 first newline following the start of matching. 
Temporarily adjust 6385 end_subject so that we stop the scans for a first code unit at a newline. 6386 If the match fails at the newline, later code breaks the loop. */ 6387 6388 if (firstline) 6389 { 6390 PCRE2_SPTR t = start_match; 6391 #ifdef SUPPORT_UNICODE 6392 if (utf) 6393 { 6394 while (t < end_subject && !IS_NEWLINE(t)) 6395 { 6396 t++; 6397 ACROSSCHAR(t < end_subject, t, t++); 6398 } 6399 } 6400 else 6401 #endif 6402 while (t < end_subject && !IS_NEWLINE(t)) t++; 6403 end_subject = t; 6404 } 6405 6406 /* Anchored: check the first code unit if one is recorded. This may seem 6407 pointless but it can help in detecting a no match case without scanning for 6408 the required code unit. */ 6409 6410 if (anchored) 6411 { 6412 if (has_first_cu || start_bits != NULL) 6413 { 6414 BOOL ok = start_match < end_subject; 6415 if (ok) 6416 { 6417 PCRE2_UCHAR c = UCHAR21TEST(start_match); 6418 ok = has_first_cu && (c == first_cu || c == first_cu2); 6419 if (!ok && start_bits != NULL) 6420 { 6421 #if PCRE2_CODE_UNIT_WIDTH != 8 6422 if (c > 255) c = 255; 6423 #endif 6424 ok = (start_bits[c/8] & (1 << (c&7))) != 0; 6425 } 6426 } 6427 if (!ok) 6428 { 6429 rc = MATCH_NOMATCH; 6430 break; 6431 } 6432 } 6433 } 6434 6435 /* Not anchored. Advance to a unique first code unit if there is one. In 6436 8-bit mode, the use of memchr() gives a big speed up, even though we have 6437 to call it twice in caseless mode, in order to find the earliest occurrence 6438 of the character in either of its cases. 
*/ 6439 6440 else 6441 { 6442 if (has_first_cu) 6443 { 6444 if (first_cu != first_cu2) /* Caseless */ 6445 { 6446 #if PCRE2_CODE_UNIT_WIDTH != 8 6447 PCRE2_UCHAR smc; 6448 while (start_match < end_subject && 6449 (smc = UCHAR21TEST(start_match)) != first_cu && 6450 smc != first_cu2) 6451 start_match++; 6452 #else /* 8-bit code units */ 6453 PCRE2_SPTR pp1 = 6454 memchr(start_match, first_cu, end_subject-start_match); 6455 PCRE2_SPTR pp2 = 6456 memchr(start_match, first_cu2, end_subject-start_match); 6457 if (pp1 == NULL) 6458 start_match = (pp2 == NULL)? end_subject : pp2; 6459 else 6460 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; 6461 #endif 6462 } 6463 6464 /* The caseful case */ 6465 6466 else 6467 { 6468 #if PCRE2_CODE_UNIT_WIDTH != 8 6469 while (start_match < end_subject && UCHAR21TEST(start_match) != 6470 first_cu) 6471 start_match++; 6472 #else 6473 start_match = memchr(start_match, first_cu, end_subject - start_match); 6474 if (start_match == NULL) start_match = end_subject; 6475 #endif 6476 } 6477 6478 /* If we can't find the required code unit, having reached the true end 6479 of the subject, break the bumpalong loop, to force a match failure, 6480 except when doing partial matching, when we let the next cycle run at 6481 the end of the subject. To see why, consider the pattern /(?<=abc)def/, 6482 which partially matches "abc", even though the string does not contain 6483 the starting character "d". If we have not reached the true end of the 6484 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) 6485 we also let the cycle run, because the matching string is legitimately 6486 allowed to start with the first code unit of a newline. */ 6487 6488 if (!mb->partial && start_match >= mb->end_subject) 6489 { 6490 rc = MATCH_NOMATCH; 6491 break; 6492 } 6493 } 6494 6495 /* If there's no first code unit, advance to just after a linebreak for a 6496 multiline match if required. 
*/ 6497 6498 else if (startline) 6499 { 6500 if (start_match > mb->start_subject + start_offset) 6501 { 6502 #ifdef SUPPORT_UNICODE 6503 if (utf) 6504 { 6505 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6506 { 6507 start_match++; 6508 ACROSSCHAR(start_match < end_subject, start_match, start_match++); 6509 } 6510 } 6511 else 6512 #endif 6513 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6514 start_match++; 6515 6516 /* If we have just passed a CR and the newline option is ANY or 6517 ANYCRLF, and we are now at a LF, advance the match position by one 6518 more code unit. */ 6519 6520 if (start_match[-1] == CHAR_CR && 6521 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && 6522 start_match < end_subject && 6523 UCHAR21TEST(start_match) == CHAR_NL) 6524 start_match++; 6525 } 6526 } 6527 6528 /* If there's no first code unit or a requirement for a multiline line 6529 start, advance to a non-unique first code unit if any have been 6530 identified. The bitmap contains only 256 bits. When code units are 16 or 6531 32 bits wide, all code units greater than 254 set the 255 bit. */ 6532 6533 else if (start_bits != NULL) 6534 { 6535 while (start_match < end_subject) 6536 { 6537 uint32_t c = UCHAR21TEST(start_match); 6538 #if PCRE2_CODE_UNIT_WIDTH != 8 6539 if (c > 255) c = 255; 6540 #endif 6541 if ((start_bits[c/8] & (1 << (c&7))) != 0) break; 6542 start_match++; 6543 } 6544 6545 /* See comment above in first_cu checking about the next few lines. */ 6546 6547 if (!mb->partial && start_match >= mb->end_subject) 6548 { 6549 rc = MATCH_NOMATCH; 6550 break; 6551 } 6552 } 6553 } /* End first code unit handling */ 6554 6555 /* Restore fudged end_subject */ 6556 6557 end_subject = mb->end_subject; 6558 6559 /* The following two optimizations must be disabled for partial matching. */ 6560 6561 if (!mb->partial) 6562 { 6563 /* The minimum matching length is a lower bound; no string of that length 6564 may actually match the pattern. 
Although the value is, strictly, in 6565 characters, we treat it as code units to avoid spending too much time in 6566 this optimization. */ 6567 6568 if (end_subject - start_match < re->minlength) 6569 { 6570 rc = MATCH_NOMATCH; 6571 break; 6572 } 6573 6574 /* If req_cu is set, we know that that code unit must appear in the 6575 subject for the (non-partial) match to succeed. If the first code unit is 6576 set, req_cu must be later in the subject; otherwise the test starts at 6577 the match point. This optimization can save a huge amount of backtracking 6578 in patterns with nested unlimited repeats that aren't going to match. 6579 Writing separate code for caseful/caseless versions makes it go faster, 6580 as does using an autoincrement and backing off on a match. As in the case 6581 of the first code unit, using memchr() in the 8-bit library gives a big 6582 speed up. Unlike the first_cu check above, we do not need to call 6583 memchr() twice in the caseless case because we only need to check for the 6584 presence of the character in either case, not find the first occurrence. 6585 6586 HOWEVER: when the subject string is very, very long, searching to its end 6587 can take a long time, and give bad performance on quite ordinary 6588 patterns. This showed up when somebody was matching something like 6589 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is 6590 sufficiently long. */ 6591 6592 if (has_req_cu && end_subject - start_match < REQ_CU_MAX) 6593 { 6594 PCRE2_SPTR p = start_match + (has_first_cu? 1:0); 6595 6596 /* We don't need to repeat the search if we haven't yet reached the 6597 place we found it last time round the bumpalong loop. 
*/ 6598 6599 if (p > req_cu_ptr) 6600 { 6601 if (p < end_subject) 6602 { 6603 if (req_cu != req_cu2) /* Caseless */ 6604 { 6605 #if PCRE2_CODE_UNIT_WIDTH != 8 6606 do 6607 { 6608 uint32_t pp = UCHAR21INCTEST(p); 6609 if (pp == req_cu || pp == req_cu2) { p--; break; } 6610 } 6611 while (p < end_subject); 6612 6613 #else /* 8-bit code units */ 6614 PCRE2_SPTR pp = p; 6615 p = memchr(pp, req_cu, end_subject - pp); 6616 if (p == NULL) 6617 { 6618 p = memchr(pp, req_cu2, end_subject - pp); 6619 if (p == NULL) p = end_subject; 6620 } 6621 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ 6622 } 6623 6624 /* The caseful case */ 6625 6626 else 6627 { 6628 #if PCRE2_CODE_UNIT_WIDTH != 8 6629 do 6630 { 6631 if (UCHAR21INCTEST(p) == req_cu) { p--; break; } 6632 } 6633 while (p < end_subject); 6634 6635 #else /* 8-bit code units */ 6636 p = memchr(p, req_cu, end_subject - p); 6637 if (p == NULL) p = end_subject; 6638 #endif 6639 } 6640 } 6641 6642 /* If we can't find the required code unit, break the bumpalong loop, 6643 forcing a match failure. */ 6644 6645 if (p >= end_subject) 6646 { 6647 rc = MATCH_NOMATCH; 6648 break; 6649 } 6650 6651 /* If we have found the required code unit, save the point where we 6652 found it, so that we don't search again next time round the bumpalong 6653 loop if the start hasn't yet passed this code unit. */ 6654 6655 req_cu_ptr = p; 6656 } 6657 } 6658 } 6659 } 6660 6661 /* ------------ End of start of match optimizations ------------ */ 6662 6663 /* Give no match if we have passed the bumpalong limit. */ 6664 6665 if (start_match > bumpalong_limit) 6666 { 6667 rc = MATCH_NOMATCH; 6668 break; 6669 } 6670 6671 /* OK, we can now run the match. If "hitend" is set afterwards, remember the 6672 first starting point for which a partial match was found. 
*/ 6673 6674 cb.start_match = (PCRE2_SIZE)(start_match - subject); 6675 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; 6676 6677 mb->start_used_ptr = start_match; 6678 mb->last_used_ptr = start_match; 6679 mb->match_call_count = 0; 6680 mb->end_offset_top = 0; 6681 mb->skip_arg_count = 0; 6682 6683 rc = match(start_match, mb->start_code, match_data->ovector, 6684 match_data->oveccount, re->top_bracket, frame_size, mb); 6685 6686 if (mb->hitend && start_partial == NULL) 6687 { 6688 start_partial = mb->start_used_ptr; 6689 match_partial = start_match; 6690 } 6691 6692 switch(rc) 6693 { 6694 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched 6695 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP 6696 entirely. The only way we can do that is to re-do the match at the same 6697 point, with a flag to force SKIP with an argument to be ignored. Just 6698 treating this case as NOMATCH does not work because it does not check other 6699 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ 6700 6701 case MATCH_SKIP_ARG: 6702 new_start_match = start_match; 6703 mb->ignore_skip_arg = mb->skip_arg_count; 6704 break; 6705 6706 /* SKIP passes back the next starting point explicitly, but if it is no 6707 greater than the match we have just done, treat it as NOMATCH. */ 6708 6709 case MATCH_SKIP: 6710 if (mb->verb_skip_ptr > start_match) 6711 { 6712 new_start_match = mb->verb_skip_ptr; 6713 break; 6714 } 6715 /* Fall through */ 6716 6717 /* NOMATCH and PRUNE advance by one character. THEN at this level acts 6718 exactly like PRUNE. Unset ignore SKIP-with-argument. 
*/ 6719 6720 case MATCH_NOMATCH: 6721 case MATCH_PRUNE: 6722 case MATCH_THEN: 6723 mb->ignore_skip_arg = 0; 6724 new_start_match = start_match + 1; 6725 #ifdef SUPPORT_UNICODE 6726 if (utf) 6727 ACROSSCHAR(new_start_match < end_subject, new_start_match, 6728 new_start_match++); 6729 #endif 6730 break; 6731 6732 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ 6733 6734 case MATCH_COMMIT: 6735 rc = MATCH_NOMATCH; 6736 goto ENDLOOP; 6737 6738 /* Any other return is either a match, or some kind of error. */ 6739 6740 default: 6741 goto ENDLOOP; 6742 } 6743 6744 /* Control reaches here for the various types of "no match at this point" 6745 result. Reset the code to MATCH_NOMATCH for subsequent checking. */ 6746 6747 rc = MATCH_NOMATCH; 6748 6749 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first 6750 newline in the subject (though it may continue over the newline). Therefore, 6751 if we have just failed to match, starting at a newline, do not continue. */ 6752 6753 if (firstline && IS_NEWLINE(start_match)) break; 6754 6755 /* Advance to new matching position */ 6756 6757 start_match = new_start_match; 6758 6759 /* Break the loop if the pattern is anchored or if we have passed the end of 6760 the subject. */ 6761 6762 if (anchored || start_match > end_subject) break; 6763 6764 /* If we have just passed a CR and we are now at a LF, and the pattern does 6765 not contain any explicit matches for \r or \n, and the newline option is CRLF 6766 or ANY or ANYCRLF, advance the match position by one more code unit. In 6767 normal matching start_match will aways be greater than the first position at 6768 this stage, but a failed *SKIP can cause a return at the same point, which is 6769 why the first test exists. 
*/ 6770 6771 if (start_match > subject + start_offset && 6772 start_match[-1] == CHAR_CR && 6773 start_match < end_subject && 6774 *start_match == CHAR_NL && 6775 (re->flags & PCRE2_HASCRORLF) == 0 && 6776 (mb->nltype == NLTYPE_ANY || 6777 mb->nltype == NLTYPE_ANYCRLF || 6778 mb->nllen == 2)) 6779 start_match++; 6780 6781 mb->mark = NULL; /* Reset for start of next match attempt */ 6782 } /* End of for(;;) "bumpalong" loop */ 6783 6784 /* ==========================================================================*/ 6785 6786 /* When we reach here, one of the following stopping conditions is true: 6787 6788 (1) The match succeeded, either completely, or partially; 6789 6790 (2) The pattern is anchored or the match was failed after (*COMMIT); 6791 6792 (3) We are past the end of the subject or the bumpalong limit; 6793 6794 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because 6795 this option requests that a match occur at or before the first newline in 6796 the subject. 6797 6798 (5) Some kind of error occurred. 6799 6800 */ 6801 6802 ENDLOOP: 6803 6804 /* Release an enlarged frame vector that is on the heap. */ 6805 6806 if (mb->match_frames != mb->stack_frames) 6807 mb->memctl.free(mb->match_frames, mb->memctl.memory_data); 6808 6809 /* Fill in fields that are always returned in the match data. */ 6810 6811 match_data->code = re; 6812 match_data->subject = subject; 6813 match_data->mark = mb->mark; 6814 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; 6815 6816 /* Handle a fully successful match. Set the return code to the number of 6817 captured strings, or 0 if there were too many to fit into the ovector, and then 6818 set the remaining returned values before returning. */ 6819 6820 if (rc == MATCH_MATCH) 6821 { 6822 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)? 
6823 0 : (int)mb->end_offset_top/2 + 1; 6824 match_data->startchar = start_match - subject; 6825 match_data->leftchar = mb->start_used_ptr - subject; 6826 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? 6827 mb->last_used_ptr : mb->end_match_ptr) - subject; 6828 return match_data->rc; 6829 } 6830 6831 /* Control gets here if there has been a partial match, an error, or if the 6832 overall match attempt has failed at all permitted starting positions. Any mark 6833 data is in the nomatch_mark field. */ 6834 6835 match_data->mark = mb->nomatch_mark; 6836 6837 /* For anything other than nomatch or partial match, just return the code. */ 6838 6839 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; 6840 6841 /* Handle a partial match. */ 6842 6843 else if (match_partial != NULL) 6844 { 6845 match_data->ovector[0] = match_partial - subject; 6846 match_data->ovector[1] = end_subject - subject; 6847 match_data->startchar = match_partial - subject; 6848 match_data->leftchar = start_partial - subject; 6849 match_data->rightchar = end_subject - subject; 6850 match_data->rc = PCRE2_ERROR_PARTIAL; 6851 } 6852 6853 /* Else this is the classic nomatch case. */ 6854 6855 else match_data->rc = PCRE2_ERROR_NOMATCH; 6856 6857 return match_data->rc; 6858 } 6859 6860 /* End of pcre2_match.c */ 6861