1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Copyright (c) 1997-2010 University of Cambridge 10 11 ----------------------------------------------------------------------------- 12 Redistribution and use in source and binary forms, with or without 13 modification, are permitted provided that the following conditions are met: 14 15 * Redistributions of source code must retain the above copyright notice, 16 this list of conditions and the following disclaimer. 17 18 * Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in the 20 documentation and/or other materials provided with the distribution. 21 22 * Neither the name of the University of Cambridge nor the names of its 23 contributors may be used to endorse or promote products derived from 24 this software without specific prior written permission. 25 26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 POSSIBILITY OF SUCH DAMAGE. 
-----------------------------------------------------------------------------
*/


/* This module contains pcre_exec(), the externally visible function that does
pattern matching using an NFA algorithm, trying to mimic Perl as closely as
possible. There are also some static supporting functions. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* These three names are defined before pcre_internal.h is included. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */

#include "pcre_internal.h"

/* Undefine some potentially clashing cpp symbols. Both names are used further
down in this file as ordinary identifiers (local variables, or macros that map
onto heap frame fields). */

#undef min
#undef max

/* Flag bits for the match() function. They are passed in its "flags"
argument. */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_PRUNE        (-997)
#define MATCH_SKIP         (-996)
#define MATCH_SKIP_ARG     (-995)
#define MATCH_THEN         (-994)

/* This is a convenience macro for code that occurs many times. It expands to
a braced block that first copies the local markptr into md->mark and then
invokes RRETURN(ra). It therefore relies on md, markptr and the RRETURN macro
(defined further down) being usable at the point of expansion. */

#define MRRETURN(ra) \
  { \
  md->mark = markptr; \
  RRETURN(ra); \
  }

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long.
*/ 92 93 #define REC_STACK_SAVE_MAX 30 94 95 /* Min and max values for the common repeats; for the maxima, 0 => infinity */ 96 97 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; 98 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; 99 100 101 102 #ifdef PCRE_DEBUG 103 /************************************************* 104 * Debugging function to print chars * 105 *************************************************/ 106 107 /* Print a sequence of chars in printable format, stopping at the end of the 108 subject if the requested. 109 110 Arguments: 111 p points to characters 112 length number to print 113 is_subject TRUE if printing from within md->start_subject 114 md pointer to matching data block, if is_subject is TRUE 115 116 Returns: nothing 117 */ 118 119 static void 120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md) 121 { 122 unsigned int c; 123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; 124 while (length-- > 0) 125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); 126 } 127 #endif 128 129 130 131 /************************************************* 132 * Match a back-reference * 133 *************************************************/ 134 135 /* If a back reference hasn't been set, the length that is passed is greater 136 than the number of characters left in the string, so the match fails. 
137 138 Arguments: 139 offset index into the offset vector 140 eptr points into the subject 141 length length to be matched 142 md points to match data block 143 ims the ims flags 144 145 Returns: TRUE if matched 146 */ 147 148 static BOOL 149 match_ref(int offset, register USPTR eptr, int length, match_data *md, 150 unsigned long int ims) 151 { 152 USPTR p = md->start_subject + md->offset_vector[offset]; 153 154 #ifdef PCRE_DEBUG 155 if (eptr >= md->end_subject) 156 printf("matching subject <null>"); 157 else 158 { 159 printf("matching subject "); 160 pchars(eptr, length, TRUE, md); 161 } 162 printf(" against backref "); 163 pchars(p, length, FALSE, md); 164 printf("\n"); 165 #endif 166 167 /* Always fail if not enough characters left */ 168 169 if (length > md->end_subject - eptr) return FALSE; 170 171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this 172 properly if Unicode properties are supported. Otherwise, we can check only 173 ASCII characters. */ 174 175 if ((ims & PCRE_CASELESS) != 0) 176 { 177 #ifdef SUPPORT_UTF8 178 #ifdef SUPPORT_UCP 179 if (md->utf8) 180 { 181 USPTR endptr = eptr + length; 182 while (eptr < endptr) 183 { 184 int c, d; 185 GETCHARINC(c, eptr); 186 GETCHARINC(d, p); 187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE; 188 } 189 } 190 else 191 #endif 192 #endif 193 194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there 195 is no UCP support. */ 196 197 while (length-- > 0) 198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } 199 } 200 201 /* In the caseful case, we can just compare the bytes, whether or not we 202 are in UTF-8 mode. 
*/ 203 204 else 205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } 206 207 return TRUE; 208 } 209 210 211 212 /*************************************************************************** 213 **************************************************************************** 214 RECURSION IN THE match() FUNCTION 215 216 The match() function is highly recursive, though not every recursive call 217 increases the recursive depth. Nevertheless, some regular expressions can cause 218 it to recurse to a great depth. I was writing for Unix, so I just let it call 219 itself recursively. This uses the stack for saving everything that has to be 220 saved for a recursive call. On Unix, the stack can be large, and this works 221 fine. 222 223 It turns out that on some non-Unix-like systems there are problems with 224 programs that use a lot of stack. (This despite the fact that every last chip 225 has oodles of memory these days, and techniques for extending the stack have 226 been known for decades.) So.... 227 228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive 229 calls by keeping local variables that need to be preserved in blocks of memory 230 obtained from malloc() instead instead of on the stack. Macros are used to 231 achieve this so that the actual code doesn't look very different to what it 232 always used to. 233 234 The original heap-recursive code used longjmp(). However, it seems that this 235 can be very slow on some operating systems. Following a suggestion from Stan 236 Switzer, the use of longjmp() has been abolished, at the cost of having to 237 provide a unique number for each call to RMATCH. There is no way of generating 238 a sequence of numbers at compile time in C. I have given them names, to make 239 them stand out more clearly. 240 241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on 242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard 243 tests. 
Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/

/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync. Each RMATCH call site passes one of these names
as its final ("rw") argument. The values start at 1 and run consecutively. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62 };

/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rf,rg,rw) calls match() and stores the result in rrc.
The macro arguments supply eptr, ecode, offset_top, md, ims, eptrb and flags in
that order; mstart, markptr and rdepth are not macro arguments but are picked
up from the scope in which the macro is expanded, with rdepth+1 being passed
on. RRETURN(ra) returns ra from the enclosing function.

The debugging versions expand to braced blocks that also print trace lines, and
the debugging RRETURN names its argument twice (once for printf, once for the
return). The production RMATCH expands to a single assignment expression with
no braces and no trailing semicolon, and the production RRETURN to a bare
return, so the statement terminator comes from the point of use. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition.
It's the md 291 argument of match(), which never changes. */ 292 293 #define REGISTER 294 295 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\ 296 {\ 297 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\ 298 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ 299 frame->Xwhere = rw; \ 300 newframe->Xeptr = ra;\ 301 newframe->Xecode = rb;\ 302 newframe->Xmstart = mstart;\ 303 newframe->Xmarkptr = markptr;\ 304 newframe->Xoffset_top = rc;\ 305 newframe->Xims = re;\ 306 newframe->Xeptrb = rf;\ 307 newframe->Xflags = rg;\ 308 newframe->Xrdepth = frame->Xrdepth + 1;\ 309 newframe->Xprevframe = frame;\ 310 frame = newframe;\ 311 DPRINTF(("restarting from line %d\n", __LINE__));\ 312 goto HEAP_RECURSE;\ 313 L_##rw:\ 314 DPRINTF(("jumped back to line %d\n", __LINE__));\ 315 } 316 317 #define RRETURN(ra)\ 318 {\ 319 heapframe *oldframe = frame;\ 320 frame = oldframe->Xprevframe;\ 321 (pcre_stack_free)(oldframe);\ 322 if (frame != NULL)\ 323 {\ 324 rrc = ra;\ 325 goto HEAP_RETURN;\ 326 }\ 327 return ra;\ 328 } 329 330 331 /* Structure for remembering the local variables in a private frame */ 332 333 typedef struct heapframe { 334 struct heapframe *Xprevframe; 335 336 /* Function arguments that may change */ 337 338 USPTR Xeptr; 339 const uschar *Xecode; 340 USPTR Xmstart; 341 USPTR Xmarkptr; 342 int Xoffset_top; 343 long int Xims; 344 eptrblock *Xeptrb; 345 int Xflags; 346 unsigned int Xrdepth; 347 348 /* Function local variables */ 349 350 USPTR Xcallpat; 351 #ifdef SUPPORT_UTF8 352 USPTR Xcharptr; 353 #endif 354 USPTR Xdata; 355 USPTR Xnext; 356 USPTR Xpp; 357 USPTR Xprev; 358 USPTR Xsaved_eptr; 359 360 recursion_info Xnew_recursive; 361 362 BOOL Xcur_is_word; 363 BOOL Xcondition; 364 BOOL Xprev_is_word; 365 366 unsigned long int Xoriginal_ims; 367 368 #ifdef SUPPORT_UCP 369 int Xprop_type; 370 int Xprop_value; 371 int Xprop_fail_result; 372 int Xprop_category; 373 int Xprop_chartype; 374 int Xprop_script; 375 int Xoclength; 376 uschar Xocchars[8]; 377 
#endif 378 379 int Xcodelink; 380 int Xctype; 381 unsigned int Xfc; 382 int Xfi; 383 int Xlength; 384 int Xmax; 385 int Xmin; 386 int Xnumber; 387 int Xoffset; 388 int Xop; 389 int Xsave_capture_last; 390 int Xsave_offset1, Xsave_offset2, Xsave_offset3; 391 int Xstacksave[REC_STACK_SAVE_MAX]; 392 393 eptrblock Xnewptrb; 394 395 /* Where to jump back to */ 396 397 int Xwhere; 398 399 } heapframe; 400 401 #endif 402 403 404 /*************************************************************************** 405 ***************************************************************************/ 406 407 408 409 /************************************************* 410 * Match from current position * 411 *************************************************/ 412 413 /* This function is called recursively in many circumstances. Whenever it 414 returns a negative (error) response, the outer incarnation must also return the 415 same response. */ 416 417 /* These macros pack up tests that are used for partial matching, and which 418 appears several times in the code. We set the "hit end" flag if the pointer is 419 at the end of the subject and also past the start of the subject (i.e. 420 something has been matched). For hard partial matching, we then return 421 immediately. The second one is used when we already know we are past the end of 422 the subject. */ 423 424 #define CHECK_PARTIAL()\ 425 if (md->partial != 0 && eptr >= md->end_subject && \ 426 eptr > md->start_used_ptr) \ 427 { \ 428 md->hitend = TRUE; \ 429 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \ 430 } 431 432 #define SCHECK_PARTIAL()\ 433 if (md->partial != 0 && eptr > md->start_used_ptr) \ 434 { \ 435 md->hitend = TRUE; \ 436 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \ 437 } 438 439 440 /* Performance note: It might be tempting to extract commonly used fields from 441 the md structure (e.g. utf8, end_subject) into individual variables to improve 442 performance. 
Tests using gcc on a SPARC disproved this; in the first case, it 443 made performance worse. 444 445 Arguments: 446 eptr pointer to current character in subject 447 ecode pointer to current position in compiled code 448 mstart pointer to the current match start position (can be modified 449 by encountering \K) 450 markptr pointer to the most recent MARK name, or NULL 451 offset_top current top pointer 452 md pointer to "static" info for the match 453 ims current /i, /m, and /s options 454 eptrb pointer to chain of blocks containing eptr at start of 455 brackets - for testing for empty matches 456 flags can contain 457 match_condassert - this is an assertion condition 458 match_cbegroup - this is the start of an unlimited repeat 459 group that can match an empty string 460 rdepth the recursion depth 461 462 Returns: MATCH_MATCH if matched ) these values are >= 0 463 MATCH_NOMATCH if failed to match ) 464 a negative MATCH_xxx value for PRUNE, SKIP, etc 465 a negative PCRE_ERROR_xxx value if aborted by an error condition 466 (e.g. stopped by repeated call or recursion limit) 467 */ 468 469 static int 470 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, 471 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims, 472 eptrblock *eptrb, int flags, unsigned int rdepth) 473 { 474 /* These variables do not need to be preserved over recursion in this function, 475 so they can be ordinary variables in all cases. Mark some of them with 476 "register" because they are used a lot in loops. 
*/ 477 478 register int rrc; /* Returns from recursive calls */ 479 register int i; /* Used for loops not involving calls to RMATCH() */ 480 register unsigned int c; /* Character values not kept over RMATCH() calls */ 481 register BOOL utf8; /* Local copy of UTF-8 flag for speed */ 482 483 BOOL minimize, possessive; /* Quantifier options */ 484 int condcode; 485 486 /* When recursion is not being used, all "local" variables that have to be 487 preserved over calls to RMATCH() are part of a "frame" which is obtained from 488 heap storage. Set up the top-level frame here; others are obtained from the 489 heap whenever RMATCH() does a "recursion". See the macro definitions above. */ 490 491 #ifdef NO_RECURSE 492 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe)); 493 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 494 frame->Xprevframe = NULL; /* Marks the top level */ 495 496 /* Copy in the original argument variables */ 497 498 frame->Xeptr = eptr; 499 frame->Xecode = ecode; 500 frame->Xmstart = mstart; 501 frame->Xmarkptr = markptr; 502 frame->Xoffset_top = offset_top; 503 frame->Xims = ims; 504 frame->Xeptrb = eptrb; 505 frame->Xflags = flags; 506 frame->Xrdepth = rdepth; 507 508 /* This is where control jumps back to to effect "recursion" */ 509 510 HEAP_RECURSE: 511 512 /* Macros make the argument variables come from the current frame */ 513 514 #define eptr frame->Xeptr 515 #define ecode frame->Xecode 516 #define mstart frame->Xmstart 517 #define markptr frame->Xmarkptr 518 #define offset_top frame->Xoffset_top 519 #define ims frame->Xims 520 #define eptrb frame->Xeptrb 521 #define flags frame->Xflags 522 #define rdepth frame->Xrdepth 523 524 /* Ditto for the local variables */ 525 526 #ifdef SUPPORT_UTF8 527 #define charptr frame->Xcharptr 528 #endif 529 #define callpat frame->Xcallpat 530 #define codelink frame->Xcodelink 531 #define data frame->Xdata 532 #define next frame->Xnext 533 #define pp frame->Xpp 534 #define prev frame->Xprev 
535 #define saved_eptr frame->Xsaved_eptr 536 537 #define new_recursive frame->Xnew_recursive 538 539 #define cur_is_word frame->Xcur_is_word 540 #define condition frame->Xcondition 541 #define prev_is_word frame->Xprev_is_word 542 543 #define original_ims frame->Xoriginal_ims 544 545 #ifdef SUPPORT_UCP 546 #define prop_type frame->Xprop_type 547 #define prop_value frame->Xprop_value 548 #define prop_fail_result frame->Xprop_fail_result 549 #define prop_category frame->Xprop_category 550 #define prop_chartype frame->Xprop_chartype 551 #define prop_script frame->Xprop_script 552 #define oclength frame->Xoclength 553 #define occhars frame->Xocchars 554 #endif 555 556 #define ctype frame->Xctype 557 #define fc frame->Xfc 558 #define fi frame->Xfi 559 #define length frame->Xlength 560 #define max frame->Xmax 561 #define min frame->Xmin 562 #define number frame->Xnumber 563 #define offset frame->Xoffset 564 #define op frame->Xop 565 #define save_capture_last frame->Xsave_capture_last 566 #define save_offset1 frame->Xsave_offset1 567 #define save_offset2 frame->Xsave_offset2 568 #define save_offset3 frame->Xsave_offset3 569 #define stacksave frame->Xstacksave 570 571 #define newptrb frame->Xnewptrb 572 573 /* When recursion is being used, local variables are allocated on the stack and 574 get preserved during recursion in the normal way. In this environment, fi and 575 i, and fc and c, can be the same variables. */ 576 577 #else /* NO_RECURSE not defined */ 578 #define fi i 579 #define fc c 580 581 582 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */ 583 const uschar *charptr; /* in small blocks of the code. My normal */ 584 #endif /* style of coding would have declared */ 585 const uschar *callpat; /* them within each of those blocks. 
*/ 586 const uschar *data; /* However, in order to accommodate the */ 587 const uschar *next; /* version of this code that uses an */ 588 USPTR pp; /* external "stack" implemented on the */ 589 const uschar *prev; /* heap, it is easier to declare them all */ 590 USPTR saved_eptr; /* here, so the declarations can be cut */ 591 /* out in a block. The only declarations */ 592 recursion_info new_recursive; /* within blocks below are for variables */ 593 /* that do not have to be preserved over */ 594 BOOL cur_is_word; /* a recursive call to RMATCH(). */ 595 BOOL condition; 596 BOOL prev_is_word; 597 598 unsigned long int original_ims; 599 600 #ifdef SUPPORT_UCP 601 int prop_type; 602 int prop_value; 603 int prop_fail_result; 604 int prop_category; 605 int prop_chartype; 606 int prop_script; 607 int oclength; 608 uschar occhars[8]; 609 #endif 610 611 int codelink; 612 int ctype; 613 int length; 614 int max; 615 int min; 616 int number; 617 int offset; 618 int op; 619 int save_capture_last; 620 int save_offset1, save_offset2, save_offset3; 621 int stacksave[REC_STACK_SAVE_MAX]; 622 623 eptrblock newptrb; 624 #endif /* NO_RECURSE */ 625 626 /* These statements are here to stop the compiler complaining about unitialized 627 variables. */ 628 629 #ifdef SUPPORT_UCP 630 prop_value = 0; 631 prop_fail_result = 0; 632 #endif 633 634 635 /* This label is used for tail recursion, which is used in a few cases even 636 when NO_RECURSE is not defined, in order to reduce the amount of stack that is 637 used. Thanks to Ian Taylor for noticing this possibility and sending the 638 original patch. */ 639 640 TAIL_RECURSE: 641 642 /* OK, now we can get on with the real code of the function. Recursive calls 643 are specified by the macro RMATCH and RRETURN is used to return. When 644 NO_RECURSE is *not* defined, these just turn into a recursive call to match() 645 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is 646 defined). 
However, RMATCH isn't like a function call because it's quite a 647 complicated macro. It has to be used in one particular way. This shouldn't, 648 however, impact performance when true recursion is being used. */ 649 650 #ifdef SUPPORT_UTF8 651 utf8 = md->utf8; /* Local copy of the flag */ 652 #else 653 utf8 = FALSE; 654 #endif 655 656 /* First check that we haven't called match() too many times, or that we 657 haven't exceeded the recursive call limit. */ 658 659 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 660 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); 661 662 original_ims = ims; /* Save for resetting on ')' */ 663 664 /* At the start of a group with an unlimited repeat that may match an empty 665 string, the match_cbegroup flag is set. When this is the case, add the current 666 subject pointer to the chain of such remembered pointers, to be checked when we 667 hit the closing ket, in order to break infinite loops that match no characters. 668 When match() is called in other circumstances, don't add to the chain. The 669 match_cbegroup flag must NOT be used with tail recursion, because the memory 670 block that is used is on the stack, so a new one may be required for each 671 match(). */ 672 673 if ((flags & match_cbegroup) != 0) 674 { 675 newptrb.epb_saved_eptr = eptr; 676 newptrb.epb_prev = eptrb; 677 eptrb = &newptrb; 678 } 679 680 /* Now start processing the opcodes. */ 681 682 for (;;) 683 { 684 minimize = possessive = FALSE; 685 op = *ecode; 686 687 switch(op) 688 { 689 case OP_MARK: 690 markptr = ecode + 2; 691 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, 692 ims, eptrb, flags, RM55); 693 694 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an 695 argument, and we must check whether that argument matches this MARK's 696 argument. It is passed back in md->start_match_ptr (an overloading of that 697 variable). 
If it does match, we reset that variable to the current subject 698 position and return MATCH_SKIP. Otherwise, pass back the return code 699 unaltered. */ 700 701 if (rrc == MATCH_SKIP_ARG && 702 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0) 703 { 704 md->start_match_ptr = eptr; 705 RRETURN(MATCH_SKIP); 706 } 707 708 if (md->mark == NULL) md->mark = markptr; 709 RRETURN(rrc); 710 711 case OP_FAIL: 712 MRRETURN(MATCH_NOMATCH); 713 714 /* COMMIT overrides PRUNE, SKIP, and THEN */ 715 716 case OP_COMMIT: 717 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 718 ims, eptrb, flags, RM52); 719 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && 720 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG && 721 rrc != MATCH_THEN) 722 RRETURN(rrc); 723 MRRETURN(MATCH_COMMIT); 724 725 /* PRUNE overrides THEN */ 726 727 case OP_PRUNE: 728 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 729 ims, eptrb, flags, RM51); 730 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 731 MRRETURN(MATCH_PRUNE); 732 733 case OP_PRUNE_ARG: 734 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, 735 ims, eptrb, flags, RM56); 736 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 737 md->mark = ecode + 2; 738 RRETURN(MATCH_PRUNE); 739 740 /* SKIP overrides PRUNE and THEN */ 741 742 case OP_SKIP: 743 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 744 ims, eptrb, flags, RM53); 745 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) 746 RRETURN(rrc); 747 md->start_match_ptr = eptr; /* Pass back current position */ 748 MRRETURN(MATCH_SKIP); 749 750 case OP_SKIP_ARG: 751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, 752 ims, eptrb, flags, RM57); 753 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) 754 RRETURN(rrc); 755 756 /* Pass back the current skip name by overloading md->start_match_ptr and 757 returning the special MATCH_SKIP_ARG return code. 
This will either be 758 caught by a matching MARK, or get to the top, where it is treated the same 759 as PRUNE. */ 760 761 md->start_match_ptr = ecode + 2; 762 RRETURN(MATCH_SKIP_ARG); 763 764 /* For THEN (and THEN_ARG) we pass back the address of the bracket or 765 the alt that is at the start of the current branch. This makes it possible 766 to skip back past alternatives that precede the THEN within the current 767 branch. */ 768 769 case OP_THEN: 770 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 771 ims, eptrb, flags, RM54); 772 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 773 md->start_match_ptr = ecode - GET(ecode, 1); 774 MRRETURN(MATCH_THEN); 775 776 case OP_THEN_ARG: 777 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE], 778 offset_top, md, ims, eptrb, flags, RM58); 779 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 780 md->start_match_ptr = ecode - GET(ecode, 1); 781 md->mark = ecode + LINK_SIZE + 2; 782 RRETURN(MATCH_THEN); 783 784 /* Handle a capturing bracket. If there is space in the offset vector, save 785 the current subject position in the working slot at the top of the vector. 786 We mustn't change the current values of the data slot, because they may be 787 set from a previous iteration of this group, and be referred to by a 788 reference inside the group. 789 790 If the bracket fails to match, we need to restore this value and also the 791 values of the final offsets, in case they were set by a previous iteration 792 of the same bracket. 793 794 If there isn't enough space in the offset vector, treat this as if it were 795 a non-capturing bracket. Don't worry about setting the flag for the error 796 case here; that is handled in the code for KET. 
*/ 797 798 case OP_CBRA: 799 case OP_SCBRA: 800 number = GET2(ecode, 1+LINK_SIZE); 801 offset = number << 1; 802 803 #ifdef PCRE_DEBUG 804 printf("start bracket %d\n", number); 805 printf("subject="); 806 pchars(eptr, 16, TRUE, md); 807 printf("\n"); 808 #endif 809 810 if (offset < md->offset_max) 811 { 812 save_offset1 = md->offset_vector[offset]; 813 save_offset2 = md->offset_vector[offset+1]; 814 save_offset3 = md->offset_vector[md->offset_end - number]; 815 save_capture_last = md->capture_last; 816 817 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 818 md->offset_vector[md->offset_end - number] = 819 (int)(eptr - md->start_subject); 820 821 flags = (op == OP_SCBRA)? match_cbegroup : 0; 822 do 823 { 824 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 825 ims, eptrb, flags, RM1); 826 if (rrc != MATCH_NOMATCH && 827 (rrc != MATCH_THEN || md->start_match_ptr != ecode)) 828 RRETURN(rrc); 829 md->capture_last = save_capture_last; 830 ecode += GET(ecode, 1); 831 } 832 while (*ecode == OP_ALT); 833 834 DPRINTF(("bracket %d failed\n", number)); 835 836 md->offset_vector[offset] = save_offset1; 837 md->offset_vector[offset+1] = save_offset2; 838 md->offset_vector[md->offset_end - number] = save_offset3; 839 840 if (rrc != MATCH_THEN) md->mark = markptr; 841 RRETURN(MATCH_NOMATCH); 842 } 843 844 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 845 as a non-capturing bracket. */ 846 847 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 848 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 849 850 DPRINTF(("insufficient capture room: treat as non-capturing\n")); 851 852 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 853 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 854 855 /* Non-capturing bracket. Loop for all the alternatives. When we get to the 856 final alternative within the brackets, we would return the result of a 857 recursive call to match() whatever happened. 
We can reduce stack usage by 858 turning this into a tail recursion, except in the case when match_cbegroup 859 is set.*/ 860 861 case OP_BRA: 862 case OP_SBRA: 863 DPRINTF(("start non-capturing bracket\n")); 864 flags = (op >= OP_SBRA)? match_cbegroup : 0; 865 for (;;) 866 { 867 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ 868 { 869 if (flags == 0) /* Not a possibly empty group */ 870 { 871 ecode += _pcre_OP_lengths[*ecode]; 872 DPRINTF(("bracket 0 tail recursion\n")); 873 goto TAIL_RECURSE; 874 } 875 876 /* Possibly empty group; can't use tail recursion. */ 877 878 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, 879 eptrb, flags, RM48); 880 if (rrc == MATCH_NOMATCH) md->mark = markptr; 881 RRETURN(rrc); 882 } 883 884 /* For non-final alternatives, continue the loop for a NOMATCH result; 885 otherwise return. */ 886 887 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, 888 eptrb, flags, RM2); 889 if (rrc != MATCH_NOMATCH && 890 (rrc != MATCH_THEN || md->start_match_ptr != ecode)) 891 RRETURN(rrc); 892 ecode += GET(ecode, 1); 893 } 894 /* Control never reaches here. */ 895 896 /* Conditional group: compilation checked that there are no more than 897 two branches. If the condition is false, skipping the first branch takes us 898 past the end if there is only one branch, but that's OK because that is 899 exactly what going to the ket would do. As there is only one branch to be 900 obeyed, we can use tail recursion to avoid using another stack frame. */ 901 902 case OP_COND: 903 case OP_SCOND: 904 codelink= GET(ecode, 1); 905 906 /* Because of the way auto-callout works during compile, a callout item is 907 inserted between OP_COND and an assertion condition. 
*/ 908 909 if (ecode[LINK_SIZE+1] == OP_CALLOUT) 910 { 911 if (pcre_callout != NULL) 912 { 913 pcre_callout_block cb; 914 cb.version = 1; /* Version 1 of the callout block */ 915 cb.callout_number = ecode[LINK_SIZE+2]; 916 cb.offset_vector = md->offset_vector; 917 cb.subject = (PCRE_SPTR)md->start_subject; 918 cb.subject_length = (int)(md->end_subject - md->start_subject); 919 cb.start_match = (int)(mstart - md->start_subject); 920 cb.current_position = (int)(eptr - md->start_subject); 921 cb.pattern_position = GET(ecode, LINK_SIZE + 3); 922 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); 923 cb.capture_top = offset_top/2; 924 cb.capture_last = md->capture_last; 925 cb.callout_data = md->callout_data; 926 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH); 927 if (rrc < 0) RRETURN(rrc); 928 } 929 ecode += _pcre_OP_lengths[OP_CALLOUT]; 930 } 931 932 condcode = ecode[LINK_SIZE+1]; 933 934 /* Now see what the actual condition is */ 935 936 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */ 937 { 938 if (md->recursive == NULL) /* Not recursing => FALSE */ 939 { 940 condition = FALSE; 941 ecode += GET(ecode, 1); 942 } 943 else 944 { 945 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ 946 condition = (recno == RREF_ANY || recno == md->recursive->group_num); 947 948 /* If the test is for recursion into a specific subpattern, and it is 949 false, but the test was set up by name, scan the table to see if the 950 name refers to any other numbers, and test them. The condition is true 951 if any one is set. */ 952 953 if (!condition && condcode == OP_NRREF && recno != RREF_ANY) 954 { 955 uschar *slotA = md->name_table; 956 for (i = 0; i < md->name_count; i++) 957 { 958 if (GET2(slotA, 0) == recno) break; 959 slotA += md->name_entry_size; 960 } 961 962 /* Found a name for the number - there can be only one; duplicate 963 names for different numbers are allowed, but not vice versa. First 964 scan down for duplicates. 
*/ 965 966 if (i < md->name_count) 967 { 968 uschar *slotB = slotA; 969 while (slotB > md->name_table) 970 { 971 slotB -= md->name_entry_size; 972 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 973 { 974 condition = GET2(slotB, 0) == md->recursive->group_num; 975 if (condition) break; 976 } 977 else break; 978 } 979 980 /* Scan up for duplicates */ 981 982 if (!condition) 983 { 984 slotB = slotA; 985 for (i++; i < md->name_count; i++) 986 { 987 slotB += md->name_entry_size; 988 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 989 { 990 condition = GET2(slotB, 0) == md->recursive->group_num; 991 if (condition) break; 992 } 993 else break; 994 } 995 } 996 } 997 } 998 999 /* Chose branch according to the condition */ 1000 1001 ecode += condition? 3 : GET(ecode, 1); 1002 } 1003 } 1004 1005 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */ 1006 { 1007 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ 1008 condition = offset < offset_top && md->offset_vector[offset] >= 0; 1009 1010 /* If the numbered capture is unset, but the reference was by name, 1011 scan the table to see if the name refers to any other numbers, and test 1012 them. The condition is true if any one is set. This is tediously similar 1013 to the code above, but not close enough to try to amalgamate. */ 1014 1015 if (!condition && condcode == OP_NCREF) 1016 { 1017 int refno = offset >> 1; 1018 uschar *slotA = md->name_table; 1019 1020 for (i = 0; i < md->name_count; i++) 1021 { 1022 if (GET2(slotA, 0) == refno) break; 1023 slotA += md->name_entry_size; 1024 } 1025 1026 /* Found a name for the number - there can be only one; duplicate names 1027 for different numbers are allowed, but not vice versa. First scan down 1028 for duplicates. 
*/ 1029 1030 if (i < md->name_count) 1031 { 1032 uschar *slotB = slotA; 1033 while (slotB > md->name_table) 1034 { 1035 slotB -= md->name_entry_size; 1036 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 1037 { 1038 offset = GET2(slotB, 0) << 1; 1039 condition = offset < offset_top && 1040 md->offset_vector[offset] >= 0; 1041 if (condition) break; 1042 } 1043 else break; 1044 } 1045 1046 /* Scan up for duplicates */ 1047 1048 if (!condition) 1049 { 1050 slotB = slotA; 1051 for (i++; i < md->name_count; i++) 1052 { 1053 slotB += md->name_entry_size; 1054 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 1055 { 1056 offset = GET2(slotB, 0) << 1; 1057 condition = offset < offset_top && 1058 md->offset_vector[offset] >= 0; 1059 if (condition) break; 1060 } 1061 else break; 1062 } 1063 } 1064 } 1065 } 1066 1067 /* Chose branch according to the condition */ 1068 1069 ecode += condition? 3 : GET(ecode, 1); 1070 } 1071 1072 else if (condcode == OP_DEF) /* DEFINE - always false */ 1073 { 1074 condition = FALSE; 1075 ecode += GET(ecode, 1); 1076 } 1077 1078 /* The condition is an assertion. Call match() to evaluate it - setting 1079 the final argument match_condassert causes it to stop at the end of an 1080 assertion. */ 1081 1082 else 1083 { 1084 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 1085 match_condassert, RM3); 1086 if (rrc == MATCH_MATCH) 1087 { 1088 condition = TRUE; 1089 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); 1090 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 1091 } 1092 else if (rrc != MATCH_NOMATCH && 1093 (rrc != MATCH_THEN || md->start_match_ptr != ecode)) 1094 { 1095 RRETURN(rrc); /* Need braces because of following else */ 1096 } 1097 else 1098 { 1099 condition = FALSE; 1100 ecode += codelink; 1101 } 1102 } 1103 1104 /* We are now at the branch that is to be obeyed. 
As there is only one, 1105 we can use tail recursion to avoid using another stack frame, except when 1106 match_cbegroup is required for an unlimited repeat of a possibly empty 1107 group. If the second alternative doesn't exist, we can just plough on. */ 1108 1109 if (condition || *ecode == OP_ALT) 1110 { 1111 ecode += 1 + LINK_SIZE; 1112 if (op == OP_SCOND) /* Possibly empty group */ 1113 { 1114 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); 1115 RRETURN(rrc); 1116 } 1117 else /* Group must match something */ 1118 { 1119 flags = 0; 1120 goto TAIL_RECURSE; 1121 } 1122 } 1123 else /* Condition false & no alternative */ 1124 { 1125 ecode += 1 + LINK_SIZE; 1126 } 1127 break; 1128 1129 1130 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, 1131 to close any currently open capturing brackets. */ 1132 1133 case OP_CLOSE: 1134 number = GET2(ecode, 1); 1135 offset = number << 1; 1136 1137 #ifdef PCRE_DEBUG 1138 printf("end bracket %d at *ACCEPT", number); 1139 printf("\n"); 1140 #endif 1141 1142 md->capture_last = number; 1143 if (offset >= md->offset_max) md->offset_overflow = TRUE; else 1144 { 1145 md->offset_vector[offset] = 1146 md->offset_vector[md->offset_end - number]; 1147 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1148 if (offset_top <= offset) offset_top = offset + 2; 1149 } 1150 ecode += 3; 1151 break; 1152 1153 1154 /* End of the pattern, either real or forced. If we are in a top-level 1155 recursion, we should restore the offsets appropriately and continue from 1156 after the call. 
*/ 1157 1158 case OP_ACCEPT: 1159 case OP_END: 1160 if (md->recursive != NULL && md->recursive->group_num == 0) 1161 { 1162 recursion_info *rec = md->recursive; 1163 DPRINTF(("End of pattern in a (?0) recursion\n")); 1164 md->recursive = rec->prevrec; 1165 memmove(md->offset_vector, rec->offset_save, 1166 rec->saved_max * sizeof(int)); 1167 offset_top = rec->save_offset_top; 1168 ims = original_ims; 1169 ecode = rec->after_call; 1170 break; 1171 } 1172 1173 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is 1174 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of 1175 the subject. In both cases, backtracking will then try other alternatives, 1176 if any. */ 1177 1178 if (eptr == mstart && 1179 (md->notempty || 1180 (md->notempty_atstart && 1181 mstart == md->start_subject + md->start_offset))) 1182 MRRETURN(MATCH_NOMATCH); 1183 1184 /* Otherwise, we have a match. */ 1185 1186 md->end_match_ptr = eptr; /* Record where we ended */ 1187 md->end_offset_top = offset_top; /* and how many extracts were taken */ 1188 md->start_match_ptr = mstart; /* and the start (\K can modify) */ 1189 1190 /* For some reason, the macros don't work properly if an expression is 1191 given as the argument to MRRETURN when the heap is in use. */ 1192 1193 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; 1194 MRRETURN(rrc); 1195 1196 /* Change option settings */ 1197 1198 case OP_OPT: 1199 ims = ecode[1]; 1200 ecode += 2; 1201 DPRINTF(("ims set to %02lx\n", ims)); 1202 break; 1203 1204 /* Assertion brackets. Check the alternative branches in turn - the 1205 matching won't pass the KET for an assertion. If any one branch matches, 1206 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 1207 start of each branch to move the current point backwards, so the code at 1208 this level is identical to the lookahead case. 
*/ 1209 1210 case OP_ASSERT: 1211 case OP_ASSERTBACK: 1212 do 1213 { 1214 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, 1215 RM4); 1216 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1217 { 1218 mstart = md->start_match_ptr; /* In case \K reset it */ 1219 break; 1220 } 1221 if (rrc != MATCH_NOMATCH && 1222 (rrc != MATCH_THEN || md->start_match_ptr != ecode)) 1223 RRETURN(rrc); 1224 ecode += GET(ecode, 1); 1225 } 1226 while (*ecode == OP_ALT); 1227 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH); 1228 1229 /* If checking an assertion for a condition, return MATCH_MATCH. */ 1230 1231 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 1232 1233 /* Continue from after the assertion, updating the offsets high water 1234 mark, since extracts may have been taken during the assertion. */ 1235 1236 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1237 ecode += 1 + LINK_SIZE; 1238 offset_top = md->end_offset_top; 1239 continue; 1240 1241 /* Negative assertion: all branches must fail to match. Encountering SKIP, 1242 PRUNE, or COMMIT means we must assume failure without checking subsequent 1243 branches. */ 1244 1245 case OP_ASSERT_NOT: 1246 case OP_ASSERTBACK_NOT: 1247 do 1248 { 1249 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, 1250 RM5); 1251 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH); 1252 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) 1253 { 1254 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1255 break; 1256 } 1257 if (rrc != MATCH_NOMATCH && 1258 (rrc != MATCH_THEN || md->start_match_ptr != ecode)) 1259 RRETURN(rrc); 1260 ecode += GET(ecode,1); 1261 } 1262 while (*ecode == OP_ALT); 1263 1264 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 1265 1266 ecode += 1 + LINK_SIZE; 1267 continue; 1268 1269 /* Move the subject pointer back. This occurs only at the start of 1270 each branch of a lookbehind assertion. 
If we are too close to the start to 1271 move back, this match function fails. When working with UTF-8 we move 1272 back a number of characters, not bytes. */ 1273 1274 case OP_REVERSE: 1275 #ifdef SUPPORT_UTF8 1276 if (utf8) 1277 { 1278 i = GET(ecode, 1); 1279 while (i-- > 0) 1280 { 1281 eptr--; 1282 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH); 1283 BACKCHAR(eptr); 1284 } 1285 } 1286 else 1287 #endif 1288 1289 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 1290 1291 { 1292 eptr -= GET(ecode, 1); 1293 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH); 1294 } 1295 1296 /* Save the earliest consulted character, then skip to next op code */ 1297 1298 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; 1299 ecode += 1 + LINK_SIZE; 1300 break; 1301 1302 /* The callout item calls an external function, if one is provided, passing 1303 details of the match so far. This is mainly for debugging, though the 1304 function is able to force a failure. */ 1305 1306 case OP_CALLOUT: 1307 if (pcre_callout != NULL) 1308 { 1309 pcre_callout_block cb; 1310 cb.version = 1; /* Version 1 of the callout block */ 1311 cb.callout_number = ecode[1]; 1312 cb.offset_vector = md->offset_vector; 1313 cb.subject = (PCRE_SPTR)md->start_subject; 1314 cb.subject_length = (int)(md->end_subject - md->start_subject); 1315 cb.start_match = (int)(mstart - md->start_subject); 1316 cb.current_position = (int)(eptr - md->start_subject); 1317 cb.pattern_position = GET(ecode, 2); 1318 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1319 cb.capture_top = offset_top/2; 1320 cb.capture_last = md->capture_last; 1321 cb.callout_data = md->callout_data; 1322 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH); 1323 if (rrc < 0) RRETURN(rrc); 1324 } 1325 ecode += 2 + 2*LINK_SIZE; 1326 break; 1327 1328 /* Recursion either matches the current regex, or some subexpression. 
The 1329 offset data is the offset to the starting bracket from the start of the 1330 whole pattern. (This is so that it works from duplicated subpatterns.) 1331 1332 If there are any capturing brackets started but not finished, we have to 1333 save their starting points and reinstate them after the recursion. However, 1334 we don't know how many such there are (offset_top records the completed 1335 total) so we just have to save all the potential data. There may be up to 1336 65535 such values, which is too large to put on the stack, but using malloc 1337 for small numbers seems expensive. As a compromise, the stack is used when 1338 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc 1339 is used. A problem is what to do if the malloc fails ... there is no way of 1340 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX 1341 values on the stack, and accept that the rest may be wrong. 1342 1343 There are also other values that have to be saved. We use a chained 1344 sequence of blocks that actually live on the stack. Thanks to Robin Houston 1345 for the original version of this logic. */ 1346 1347 case OP_RECURSE: 1348 { 1349 callpat = md->start_code + GET(ecode, 1); 1350 new_recursive.group_num = (callpat == md->start_code)? 0 : 1351 GET2(callpat, 1 + LINK_SIZE); 1352 1353 /* Add to "recursing stack" */ 1354 1355 new_recursive.prevrec = md->recursive; 1356 md->recursive = &new_recursive; 1357 1358 /* Find where to continue from afterwards */ 1359 1360 ecode += 1 + LINK_SIZE; 1361 new_recursive.after_call = ecode; 1362 1363 /* Now save the offset data. 
*/ 1364 1365 new_recursive.saved_max = md->offset_end; 1366 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 1367 new_recursive.offset_save = stacksave; 1368 else 1369 { 1370 new_recursive.offset_save = 1371 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); 1372 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 1373 } 1374 1375 memcpy(new_recursive.offset_save, md->offset_vector, 1376 new_recursive.saved_max * sizeof(int)); 1377 new_recursive.save_offset_top = offset_top; 1378 1379 /* OK, now we can do the recursion. For each top-level alternative we 1380 restore the offset and recursion data. */ 1381 1382 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 1383 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0; 1384 do 1385 { 1386 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, 1387 md, ims, eptrb, flags, RM6); 1388 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1389 { 1390 DPRINTF(("Recursion matched\n")); 1391 md->recursive = new_recursive.prevrec; 1392 if (new_recursive.offset_save != stacksave) 1393 (pcre_free)(new_recursive.offset_save); 1394 MRRETURN(MATCH_MATCH); 1395 } 1396 else if (rrc != MATCH_NOMATCH && 1397 (rrc != MATCH_THEN || md->start_match_ptr != ecode)) 1398 { 1399 DPRINTF(("Recursion gave error %d\n", rrc)); 1400 if (new_recursive.offset_save != stacksave) 1401 (pcre_free)(new_recursive.offset_save); 1402 RRETURN(rrc); 1403 } 1404 1405 md->recursive = &new_recursive; 1406 memcpy(md->offset_vector, new_recursive.offset_save, 1407 new_recursive.saved_max * sizeof(int)); 1408 callpat += GET(callpat, 1); 1409 } 1410 while (*callpat == OP_ALT); 1411 1412 DPRINTF(("Recursion didn't match\n")); 1413 md->recursive = new_recursive.prevrec; 1414 if (new_recursive.offset_save != stacksave) 1415 (pcre_free)(new_recursive.offset_save); 1416 MRRETURN(MATCH_NOMATCH); 1417 } 1418 /* Control never reaches here */ 1419 1420 /* "Once" brackets are like assertion brackets except that after a match, 1421 
the point in the subject string is not moved back. Thus there can never be 1422 a move back into the brackets. Friedl calls these "atomic" subpatterns. 1423 Check the alternative branches in turn - the matching won't pass the KET 1424 for this kind of subpattern. If any one branch matches, we carry on as at 1425 the end of a normal bracket, leaving the subject pointer, but resetting 1426 the start-of-match value in case it was changed by \K. */ 1427 1428 case OP_ONCE: 1429 prev = ecode; 1430 saved_eptr = eptr; 1431 1432 do 1433 { 1434 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); 1435 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ 1436 { 1437 mstart = md->start_match_ptr; 1438 break; 1439 } 1440 if (rrc != MATCH_NOMATCH && 1441 (rrc != MATCH_THEN || md->start_match_ptr != ecode)) 1442 RRETURN(rrc); 1443 ecode += GET(ecode,1); 1444 } 1445 while (*ecode == OP_ALT); 1446 1447 /* If hit the end of the group (which could be repeated), fail */ 1448 1449 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 1450 1451 /* Continue as from after the assertion, updating the offsets high water 1452 mark, since extracts may have been taken. */ 1453 1454 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1455 1456 offset_top = md->end_offset_top; 1457 eptr = md->end_match_ptr; 1458 1459 /* For a non-repeating ket, just continue at this level. This also 1460 happens for a repeating ket if no characters were matched in the group. 1461 This is the forcible breaking of infinite loops as implemented in Perl 1462 5.005. If there is an options reset, it will get obeyed in the normal 1463 course of events. */ 1464 1465 if (*ecode == OP_KET || eptr == saved_eptr) 1466 { 1467 ecode += 1+LINK_SIZE; 1468 break; 1469 } 1470 1471 /* The repeating kets try the rest of the pattern or restart from the 1472 preceding bracket, in the appropriate order. The second "call" of match() 1473 uses tail recursion, to avoid using another stack frame. 
We need to reset 1474 any options that changed within the bracket before re-running it, so 1475 check the next opcode. */ 1476 1477 if (ecode[1+LINK_SIZE] == OP_OPT) 1478 { 1479 ims = (ims & ~PCRE_IMS) | ecode[4]; 1480 DPRINTF(("ims set to %02lx at group repeat\n", ims)); 1481 } 1482 1483 if (*ecode == OP_KETRMIN) 1484 { 1485 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); 1486 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1487 ecode = prev; 1488 flags = 0; 1489 goto TAIL_RECURSE; 1490 } 1491 else /* OP_KETRMAX */ 1492 { 1493 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); 1494 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1495 ecode += 1 + LINK_SIZE; 1496 flags = 0; 1497 goto TAIL_RECURSE; 1498 } 1499 /* Control never gets here */ 1500 1501 /* An alternation is the end of a branch; scan along to find the end of the 1502 bracketed group and go to there. */ 1503 1504 case OP_ALT: 1505 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1506 break; 1507 1508 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, 1509 indicating that it may occur zero times. It may repeat infinitely, or not 1510 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets 1511 with fixed upper repeat limits are compiled as a number of copies, with the 1512 optional ones preceded by BRAZERO or BRAMINZERO. 
*/ 1513 1514 case OP_BRAZERO: 1515 { 1516 next = ecode+1; 1517 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); 1518 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1519 do next += GET(next,1); while (*next == OP_ALT); 1520 ecode = next + 1 + LINK_SIZE; 1521 } 1522 break; 1523 1524 case OP_BRAMINZERO: 1525 { 1526 next = ecode+1; 1527 do next += GET(next, 1); while (*next == OP_ALT); 1528 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11); 1529 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1530 ecode++; 1531 } 1532 break; 1533 1534 case OP_SKIPZERO: 1535 { 1536 next = ecode+1; 1537 do next += GET(next,1); while (*next == OP_ALT); 1538 ecode = next + 1 + LINK_SIZE; 1539 } 1540 break; 1541 1542 /* End of a group, repeated or non-repeating. */ 1543 1544 case OP_KET: 1545 case OP_KETRMIN: 1546 case OP_KETRMAX: 1547 prev = ecode - GET(ecode, 1); 1548 1549 /* If this was a group that remembered the subject start, in order to break 1550 infinite repeats of empty string matches, retrieve the subject start from 1551 the chain. Otherwise, set it NULL. */ 1552 1553 if (*prev >= OP_SBRA) 1554 { 1555 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ 1556 eptrb = eptrb->epb_prev; /* Backup to previous group */ 1557 } 1558 else saved_eptr = NULL; 1559 1560 /* If we are at the end of an assertion group or an atomic group, stop 1561 matching and return MATCH_MATCH, but record the current high water mark for 1562 use by positive assertions. We also need to record the match start in case 1563 it was changed by \K. 
*/ 1564 1565 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || 1566 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || 1567 *prev == OP_ONCE) 1568 { 1569 md->end_match_ptr = eptr; /* For ONCE */ 1570 md->end_offset_top = offset_top; 1571 md->start_match_ptr = mstart; 1572 MRRETURN(MATCH_MATCH); 1573 } 1574 1575 /* For capturing groups we have to check the group number back at the start 1576 and if necessary complete handling an extraction by setting the offsets and 1577 bumping the high water mark. Note that whole-pattern recursion is coded as 1578 a recurse into group 0, so it won't be picked up here. Instead, we catch it 1579 when the OP_END is reached. Other recursion is handled here. */ 1580 1581 if (*prev == OP_CBRA || *prev == OP_SCBRA) 1582 { 1583 number = GET2(prev, 1+LINK_SIZE); 1584 offset = number << 1; 1585 1586 #ifdef PCRE_DEBUG 1587 printf("end bracket %d", number); 1588 printf("\n"); 1589 #endif 1590 1591 md->capture_last = number; 1592 if (offset >= md->offset_max) md->offset_overflow = TRUE; else 1593 { 1594 md->offset_vector[offset] = 1595 md->offset_vector[md->offset_end - number]; 1596 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1597 if (offset_top <= offset) offset_top = offset + 2; 1598 } 1599 1600 /* Handle a recursively called group. Restore the offsets 1601 appropriately and continue from after the call. */ 1602 1603 if (md->recursive != NULL && md->recursive->group_num == number) 1604 { 1605 recursion_info *rec = md->recursive; 1606 DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); 1607 md->recursive = rec->prevrec; 1608 memcpy(md->offset_vector, rec->offset_save, 1609 rec->saved_max * sizeof(int)); 1610 offset_top = rec->save_offset_top; 1611 ecode = rec->after_call; 1612 ims = original_ims; 1613 break; 1614 } 1615 } 1616 1617 /* For both capturing and non-capturing groups, reset the value of the ims 1618 flags, in case they got changed during the group. 
*/ 1619 1620 ims = original_ims; 1621 DPRINTF(("ims reset to %02lx\n", ims)); 1622 1623 /* For a non-repeating ket, just continue at this level. This also 1624 happens for a repeating ket if no characters were matched in the group. 1625 This is the forcible breaking of infinite loops as implemented in Perl 1626 5.005. If there is an options reset, it will get obeyed in the normal 1627 course of events. */ 1628 1629 if (*ecode == OP_KET || eptr == saved_eptr) 1630 { 1631 ecode += 1 + LINK_SIZE; 1632 break; 1633 } 1634 1635 /* The repeating kets try the rest of the pattern or restart from the 1636 preceding bracket, in the appropriate order. In the second case, we can use 1637 tail recursion to avoid using another stack frame, unless we have an 1638 unlimited repeat of a group that can match an empty string. */ 1639 1640 flags = (*prev >= OP_SBRA)? match_cbegroup : 0; 1641 1642 if (*ecode == OP_KETRMIN) 1643 { 1644 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12); 1645 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1646 if (flags != 0) /* Could match an empty string */ 1647 { 1648 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50); 1649 RRETURN(rrc); 1650 } 1651 ecode = prev; 1652 goto TAIL_RECURSE; 1653 } 1654 else /* OP_KETRMAX */ 1655 { 1656 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); 1657 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1658 ecode += 1 + LINK_SIZE; 1659 flags = 0; 1660 goto TAIL_RECURSE; 1661 } 1662 /* Control never gets here */ 1663 1664 /* Start of subject unless notbol, or after internal newline if multiline */ 1665 1666 case OP_CIRC: 1667 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH); 1668 if ((ims & PCRE_MULTILINE) != 0) 1669 { 1670 if (eptr != md->start_subject && 1671 (eptr == md->end_subject || !WAS_NEWLINE(eptr))) 1672 MRRETURN(MATCH_NOMATCH); 1673 ecode++; 1674 break; 1675 } 1676 /* ... 
else fall through */ 1677 1678 /* Start of subject assertion */ 1679 1680 case OP_SOD: 1681 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH); 1682 ecode++; 1683 break; 1684 1685 /* Start of match assertion */ 1686 1687 case OP_SOM: 1688 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH); 1689 ecode++; 1690 break; 1691 1692 /* Reset the start of match point */ 1693 1694 case OP_SET_SOM: 1695 mstart = eptr; 1696 ecode++; 1697 break; 1698 1699 /* Assert before internal newline if multiline, or before a terminating 1700 newline unless endonly is set, else end of subject unless noteol is set. */ 1701 1702 case OP_DOLL: 1703 if ((ims & PCRE_MULTILINE) != 0) 1704 { 1705 if (eptr < md->end_subject) 1706 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); } 1707 else 1708 { 1709 if (md->noteol) MRRETURN(MATCH_NOMATCH); 1710 SCHECK_PARTIAL(); 1711 } 1712 ecode++; 1713 break; 1714 } 1715 else /* Not multiline */ 1716 { 1717 if (md->noteol) MRRETURN(MATCH_NOMATCH); 1718 if (!md->endonly) goto ASSERT_NL_OR_EOS; 1719 } 1720 1721 /* ... else fall through for endonly */ 1722 1723 /* End of subject assertion (\z) */ 1724 1725 case OP_EOD: 1726 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH); 1727 SCHECK_PARTIAL(); 1728 ecode++; 1729 break; 1730 1731 /* End of subject or ending \n assertion (\Z) */ 1732 1733 case OP_EODN: 1734 ASSERT_NL_OR_EOS: 1735 if (eptr < md->end_subject && 1736 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) 1737 MRRETURN(MATCH_NOMATCH); 1738 1739 /* Either at end of string or \n before end. */ 1740 1741 SCHECK_PARTIAL(); 1742 ecode++; 1743 break; 1744 1745 /* Word boundary assertions */ 1746 1747 case OP_NOT_WORD_BOUNDARY: 1748 case OP_WORD_BOUNDARY: 1749 { 1750 1751 /* Find out if the previous and current characters are "word" characters. 1752 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 1753 be "non-word" characters. Remember the earliest consulted character for 1754 partial matching. 
*/ 1755 1756 #ifdef SUPPORT_UTF8 1757 if (utf8) 1758 { 1759 /* Get status of previous character */ 1760 1761 if (eptr == md->start_subject) prev_is_word = FALSE; else 1762 { 1763 USPTR lastptr = eptr - 1; 1764 while((*lastptr & 0xc0) == 0x80) lastptr--; 1765 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; 1766 GETCHAR(c, lastptr); 1767 #ifdef SUPPORT_UCP 1768 if (md->use_ucp) 1769 { 1770 if (c == '_') prev_is_word = TRUE; else 1771 { 1772 int cat = UCD_CATEGORY(c); 1773 prev_is_word = (cat == ucp_L || cat == ucp_N); 1774 } 1775 } 1776 else 1777 #endif 1778 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 1779 } 1780 1781 /* Get status of next character */ 1782 1783 if (eptr >= md->end_subject) 1784 { 1785 SCHECK_PARTIAL(); 1786 cur_is_word = FALSE; 1787 } 1788 else 1789 { 1790 GETCHAR(c, eptr); 1791 #ifdef SUPPORT_UCP 1792 if (md->use_ucp) 1793 { 1794 if (c == '_') cur_is_word = TRUE; else 1795 { 1796 int cat = UCD_CATEGORY(c); 1797 cur_is_word = (cat == ucp_L || cat == ucp_N); 1798 } 1799 } 1800 else 1801 #endif 1802 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 1803 } 1804 } 1805 else 1806 #endif 1807 1808 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for 1809 consistency with the behaviour of \w we do use it in this case. 
*/ 1810 1811 { 1812 /* Get status of previous character */ 1813 1814 if (eptr == md->start_subject) prev_is_word = FALSE; else 1815 { 1816 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; 1817 #ifdef SUPPORT_UCP 1818 if (md->use_ucp) 1819 { 1820 c = eptr[-1]; 1821 if (c == '_') prev_is_word = TRUE; else 1822 { 1823 int cat = UCD_CATEGORY(c); 1824 prev_is_word = (cat == ucp_L || cat == ucp_N); 1825 } 1826 } 1827 else 1828 #endif 1829 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0); 1830 } 1831 1832 /* Get status of next character */ 1833 1834 if (eptr >= md->end_subject) 1835 { 1836 SCHECK_PARTIAL(); 1837 cur_is_word = FALSE; 1838 } 1839 else 1840 #ifdef SUPPORT_UCP 1841 if (md->use_ucp) 1842 { 1843 c = *eptr; 1844 if (c == '_') cur_is_word = TRUE; else 1845 { 1846 int cat = UCD_CATEGORY(c); 1847 cur_is_word = (cat == ucp_L || cat == ucp_N); 1848 } 1849 } 1850 else 1851 #endif 1852 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); 1853 } 1854 1855 /* Now see if the situation is what we want */ 1856 1857 if ((*ecode++ == OP_WORD_BOUNDARY)? 1858 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 1859 MRRETURN(MATCH_NOMATCH); 1860 } 1861 break; 1862 1863 /* Match a single character type; inline for speed */ 1864 1865 case OP_ANY: 1866 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); 1867 /* Fall through */ 1868 1869 case OP_ALLANY: 1870 if (eptr++ >= md->end_subject) 1871 { 1872 SCHECK_PARTIAL(); 1873 MRRETURN(MATCH_NOMATCH); 1874 } 1875 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 1876 ecode++; 1877 break; 1878 1879 /* Match a single byte, even in UTF-8 mode. This opcode really does match 1880 any byte, even newline, independent of the setting of PCRE_DOTALL. 
*/ 1881 1882 case OP_ANYBYTE: 1883 if (eptr++ >= md->end_subject) 1884 { 1885 SCHECK_PARTIAL(); 1886 MRRETURN(MATCH_NOMATCH); 1887 } 1888 ecode++; 1889 break; 1890 1891 case OP_NOT_DIGIT: 1892 if (eptr >= md->end_subject) 1893 { 1894 SCHECK_PARTIAL(); 1895 MRRETURN(MATCH_NOMATCH); 1896 } 1897 GETCHARINCTEST(c, eptr); 1898 if ( 1899 #ifdef SUPPORT_UTF8 1900 c < 256 && 1901 #endif 1902 (md->ctypes[c] & ctype_digit) != 0 1903 ) 1904 MRRETURN(MATCH_NOMATCH); 1905 ecode++; 1906 break; 1907 1908 case OP_DIGIT: 1909 if (eptr >= md->end_subject) 1910 { 1911 SCHECK_PARTIAL(); 1912 MRRETURN(MATCH_NOMATCH); 1913 } 1914 GETCHARINCTEST(c, eptr); 1915 if ( 1916 #ifdef SUPPORT_UTF8 1917 c >= 256 || 1918 #endif 1919 (md->ctypes[c] & ctype_digit) == 0 1920 ) 1921 MRRETURN(MATCH_NOMATCH); 1922 ecode++; 1923 break; 1924 1925 case OP_NOT_WHITESPACE: 1926 if (eptr >= md->end_subject) 1927 { 1928 SCHECK_PARTIAL(); 1929 MRRETURN(MATCH_NOMATCH); 1930 } 1931 GETCHARINCTEST(c, eptr); 1932 if ( 1933 #ifdef SUPPORT_UTF8 1934 c < 256 && 1935 #endif 1936 (md->ctypes[c] & ctype_space) != 0 1937 ) 1938 MRRETURN(MATCH_NOMATCH); 1939 ecode++; 1940 break; 1941 1942 case OP_WHITESPACE: 1943 if (eptr >= md->end_subject) 1944 { 1945 SCHECK_PARTIAL(); 1946 MRRETURN(MATCH_NOMATCH); 1947 } 1948 GETCHARINCTEST(c, eptr); 1949 if ( 1950 #ifdef SUPPORT_UTF8 1951 c >= 256 || 1952 #endif 1953 (md->ctypes[c] & ctype_space) == 0 1954 ) 1955 MRRETURN(MATCH_NOMATCH); 1956 ecode++; 1957 break; 1958 1959 case OP_NOT_WORDCHAR: 1960 if (eptr >= md->end_subject) 1961 { 1962 SCHECK_PARTIAL(); 1963 MRRETURN(MATCH_NOMATCH); 1964 } 1965 GETCHARINCTEST(c, eptr); 1966 if ( 1967 #ifdef SUPPORT_UTF8 1968 c < 256 && 1969 #endif 1970 (md->ctypes[c] & ctype_word) != 0 1971 ) 1972 MRRETURN(MATCH_NOMATCH); 1973 ecode++; 1974 break; 1975 1976 case OP_WORDCHAR: 1977 if (eptr >= md->end_subject) 1978 { 1979 SCHECK_PARTIAL(); 1980 MRRETURN(MATCH_NOMATCH); 1981 } 1982 GETCHARINCTEST(c, eptr); 1983 if ( 1984 #ifdef SUPPORT_UTF8 1985 c >= 
256 || 1986 #endif 1987 (md->ctypes[c] & ctype_word) == 0 1988 ) 1989 MRRETURN(MATCH_NOMATCH); 1990 ecode++; 1991 break; 1992 1993 case OP_ANYNL: 1994 if (eptr >= md->end_subject) 1995 { 1996 SCHECK_PARTIAL(); 1997 MRRETURN(MATCH_NOMATCH); 1998 } 1999 GETCHARINCTEST(c, eptr); 2000 switch(c) 2001 { 2002 default: MRRETURN(MATCH_NOMATCH); 2003 case 0x000d: 2004 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 2005 break; 2006 2007 case 0x000a: 2008 break; 2009 2010 case 0x000b: 2011 case 0x000c: 2012 case 0x0085: 2013 case 0x2028: 2014 case 0x2029: 2015 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); 2016 break; 2017 } 2018 ecode++; 2019 break; 2020 2021 case OP_NOT_HSPACE: 2022 if (eptr >= md->end_subject) 2023 { 2024 SCHECK_PARTIAL(); 2025 MRRETURN(MATCH_NOMATCH); 2026 } 2027 GETCHARINCTEST(c, eptr); 2028 switch(c) 2029 { 2030 default: break; 2031 case 0x09: /* HT */ 2032 case 0x20: /* SPACE */ 2033 case 0xa0: /* NBSP */ 2034 case 0x1680: /* OGHAM SPACE MARK */ 2035 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 2036 case 0x2000: /* EN QUAD */ 2037 case 0x2001: /* EM QUAD */ 2038 case 0x2002: /* EN SPACE */ 2039 case 0x2003: /* EM SPACE */ 2040 case 0x2004: /* THREE-PER-EM SPACE */ 2041 case 0x2005: /* FOUR-PER-EM SPACE */ 2042 case 0x2006: /* SIX-PER-EM SPACE */ 2043 case 0x2007: /* FIGURE SPACE */ 2044 case 0x2008: /* PUNCTUATION SPACE */ 2045 case 0x2009: /* THIN SPACE */ 2046 case 0x200A: /* HAIR SPACE */ 2047 case 0x202f: /* NARROW NO-BREAK SPACE */ 2048 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 2049 case 0x3000: /* IDEOGRAPHIC SPACE */ 2050 MRRETURN(MATCH_NOMATCH); 2051 } 2052 ecode++; 2053 break; 2054 2055 case OP_HSPACE: 2056 if (eptr >= md->end_subject) 2057 { 2058 SCHECK_PARTIAL(); 2059 MRRETURN(MATCH_NOMATCH); 2060 } 2061 GETCHARINCTEST(c, eptr); 2062 switch(c) 2063 { 2064 default: MRRETURN(MATCH_NOMATCH); 2065 case 0x09: /* HT */ 2066 case 0x20: /* SPACE */ 2067 case 0xa0: /* NBSP */ 2068 case 0x1680: /* OGHAM SPACE MARK */ 2069 case 0x180e: /* 
MONGOLIAN VOWEL SEPARATOR */ 2070 case 0x2000: /* EN QUAD */ 2071 case 0x2001: /* EM QUAD */ 2072 case 0x2002: /* EN SPACE */ 2073 case 0x2003: /* EM SPACE */ 2074 case 0x2004: /* THREE-PER-EM SPACE */ 2075 case 0x2005: /* FOUR-PER-EM SPACE */ 2076 case 0x2006: /* SIX-PER-EM SPACE */ 2077 case 0x2007: /* FIGURE SPACE */ 2078 case 0x2008: /* PUNCTUATION SPACE */ 2079 case 0x2009: /* THIN SPACE */ 2080 case 0x200A: /* HAIR SPACE */ 2081 case 0x202f: /* NARROW NO-BREAK SPACE */ 2082 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 2083 case 0x3000: /* IDEOGRAPHIC SPACE */ 2084 break; 2085 } 2086 ecode++; 2087 break; 2088 2089 case OP_NOT_VSPACE: 2090 if (eptr >= md->end_subject) 2091 { 2092 SCHECK_PARTIAL(); 2093 MRRETURN(MATCH_NOMATCH); 2094 } 2095 GETCHARINCTEST(c, eptr); 2096 switch(c) 2097 { 2098 default: break; 2099 case 0x0a: /* LF */ 2100 case 0x0b: /* VT */ 2101 case 0x0c: /* FF */ 2102 case 0x0d: /* CR */ 2103 case 0x85: /* NEL */ 2104 case 0x2028: /* LINE SEPARATOR */ 2105 case 0x2029: /* PARAGRAPH SEPARATOR */ 2106 MRRETURN(MATCH_NOMATCH); 2107 } 2108 ecode++; 2109 break; 2110 2111 case OP_VSPACE: 2112 if (eptr >= md->end_subject) 2113 { 2114 SCHECK_PARTIAL(); 2115 MRRETURN(MATCH_NOMATCH); 2116 } 2117 GETCHARINCTEST(c, eptr); 2118 switch(c) 2119 { 2120 default: MRRETURN(MATCH_NOMATCH); 2121 case 0x0a: /* LF */ 2122 case 0x0b: /* VT */ 2123 case 0x0c: /* FF */ 2124 case 0x0d: /* CR */ 2125 case 0x85: /* NEL */ 2126 case 0x2028: /* LINE SEPARATOR */ 2127 case 0x2029: /* PARAGRAPH SEPARATOR */ 2128 break; 2129 } 2130 ecode++; 2131 break; 2132 2133 #ifdef SUPPORT_UCP 2134 /* Check the next character by Unicode property. We will get here only 2135 if the support is in the binary; otherwise a compile-time error occurs. 
*/ 2136 2137 case OP_PROP: 2138 case OP_NOTPROP: 2139 if (eptr >= md->end_subject) 2140 { 2141 SCHECK_PARTIAL(); 2142 MRRETURN(MATCH_NOMATCH); 2143 } 2144 GETCHARINCTEST(c, eptr); 2145 { 2146 const ucd_record *prop = GET_UCD(c); 2147 2148 switch(ecode[1]) 2149 { 2150 case PT_ANY: 2151 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH); 2152 break; 2153 2154 case PT_LAMP: 2155 if ((prop->chartype == ucp_Lu || 2156 prop->chartype == ucp_Ll || 2157 prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) 2158 MRRETURN(MATCH_NOMATCH); 2159 break; 2160 2161 case PT_GC: 2162 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) 2163 MRRETURN(MATCH_NOMATCH); 2164 break; 2165 2166 case PT_PC: 2167 if ((ecode[2] != prop->chartype) == (op == OP_PROP)) 2168 MRRETURN(MATCH_NOMATCH); 2169 break; 2170 2171 case PT_SC: 2172 if ((ecode[2] != prop->script) == (op == OP_PROP)) 2173 MRRETURN(MATCH_NOMATCH); 2174 break; 2175 2176 /* These are specials */ 2177 2178 case PT_ALNUM: 2179 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || 2180 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) 2181 MRRETURN(MATCH_NOMATCH); 2182 break; 2183 2184 case PT_SPACE: /* Perl space */ 2185 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || 2186 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) 2187 == (op == OP_NOTPROP)) 2188 MRRETURN(MATCH_NOMATCH); 2189 break; 2190 2191 case PT_PXSPACE: /* POSIX space */ 2192 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || 2193 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 2194 c == CHAR_FF || c == CHAR_CR) 2195 == (op == OP_NOTPROP)) 2196 MRRETURN(MATCH_NOMATCH); 2197 break; 2198 2199 case PT_WORD: 2200 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || 2201 _pcre_ucp_gentype[prop->chartype] == ucp_N || 2202 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) 2203 MRRETURN(MATCH_NOMATCH); 2204 break; 2205 2206 /* This should never occur */ 2207 2208 default: 2209 RRETURN(PCRE_ERROR_INTERNAL); 2210 } 2211 2212 ecode += 3; 2213 } 
2214 break; 2215 2216 /* Match an extended Unicode sequence. We will get here only if the support 2217 is in the binary; otherwise a compile-time error occurs. */ 2218 2219 case OP_EXTUNI: 2220 if (eptr >= md->end_subject) 2221 { 2222 SCHECK_PARTIAL(); 2223 MRRETURN(MATCH_NOMATCH); 2224 } 2225 GETCHARINCTEST(c, eptr); 2226 { 2227 int category = UCD_CATEGORY(c); 2228 if (category == ucp_M) MRRETURN(MATCH_NOMATCH); 2229 while (eptr < md->end_subject) 2230 { 2231 int len = 1; 2232 if (!utf8) c = *eptr; else 2233 { 2234 GETCHARLEN(c, eptr, len); 2235 } 2236 category = UCD_CATEGORY(c); 2237 if (category != ucp_M) break; 2238 eptr += len; 2239 } 2240 } 2241 ecode++; 2242 break; 2243 #endif 2244 2245 2246 /* Match a back reference, possibly repeatedly. Look past the end of the 2247 item to see if there is repeat information following. The code is similar 2248 to that for character classes, but repeated for efficiency. Then obey 2249 similar code to character type repeats - written out again for speed. 2250 However, if the referenced string is the empty string, always treat 2251 it as matched, any number of times (otherwise there could be infinite 2252 loops). */ 2253 2254 case OP_REF: 2255 { 2256 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 2257 ecode += 3; 2258 2259 /* If the reference is unset, there are two possibilities: 2260 2261 (a) In the default, Perl-compatible state, set the length to be longer 2262 than the amount of subject left; this ensures that every attempt at a 2263 match fails. We can't just fail here, because of the possibility of 2264 quantifiers with zero minima. 2265 2266 (b) If the JavaScript compatibility flag is set, set the length to zero 2267 so that the back reference matches an empty string. 2268 2269 Otherwise, set the length to the length of what was matched by the 2270 referenced subpattern. */ 2271 2272 if (offset >= offset_top || md->offset_vector[offset] < 0) 2273 length = (md->jscript_compat)? 
0 : (int)(md->end_subject - eptr + 1); 2274 else 2275 length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2276 2277 /* Set up for repetition, or handle the non-repeated case */ 2278 2279 switch (*ecode) 2280 { 2281 case OP_CRSTAR: 2282 case OP_CRMINSTAR: 2283 case OP_CRPLUS: 2284 case OP_CRMINPLUS: 2285 case OP_CRQUERY: 2286 case OP_CRMINQUERY: 2287 c = *ecode++ - OP_CRSTAR; 2288 minimize = (c & 1) != 0; 2289 min = rep_min[c]; /* Pick up values from tables; */ 2290 max = rep_max[c]; /* zero for max => infinity */ 2291 if (max == 0) max = INT_MAX; 2292 break; 2293 2294 case OP_CRRANGE: 2295 case OP_CRMINRANGE: 2296 minimize = (*ecode == OP_CRMINRANGE); 2297 min = GET2(ecode, 1); 2298 max = GET2(ecode, 3); 2299 if (max == 0) max = INT_MAX; 2300 ecode += 5; 2301 break; 2302 2303 default: /* No repeat follows */ 2304 if (!match_ref(offset, eptr, length, md, ims)) 2305 { 2306 CHECK_PARTIAL(); 2307 MRRETURN(MATCH_NOMATCH); 2308 } 2309 eptr += length; 2310 continue; /* With the main loop */ 2311 } 2312 2313 /* If the length of the reference is zero, just continue with the 2314 main loop. */ 2315 2316 if (length == 0) continue; 2317 2318 /* First, ensure the minimum number of matches are present. We get back 2319 the length of the reference string explicitly rather than passing the 2320 address of eptr, so that eptr can be a register variable. */ 2321 2322 for (i = 1; i <= min; i++) 2323 { 2324 if (!match_ref(offset, eptr, length, md, ims)) 2325 { 2326 CHECK_PARTIAL(); 2327 MRRETURN(MATCH_NOMATCH); 2328 } 2329 eptr += length; 2330 } 2331 2332 /* If min = max, continue at the same level without recursion. 2333 They are not both allowed to be zero. 
*/ 2334 2335 if (min == max) continue; 2336 2337 /* If minimizing, keep trying and advancing the pointer */ 2338 2339 if (minimize) 2340 { 2341 for (fi = min;; fi++) 2342 { 2343 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); 2344 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2345 if (fi >= max) MRRETURN(MATCH_NOMATCH); 2346 if (!match_ref(offset, eptr, length, md, ims)) 2347 { 2348 CHECK_PARTIAL(); 2349 MRRETURN(MATCH_NOMATCH); 2350 } 2351 eptr += length; 2352 } 2353 /* Control never gets here */ 2354 } 2355 2356 /* If maximizing, find the longest string and work backwards */ 2357 2358 else 2359 { 2360 pp = eptr; 2361 for (i = min; i < max; i++) 2362 { 2363 if (!match_ref(offset, eptr, length, md, ims)) 2364 { 2365 CHECK_PARTIAL(); 2366 break; 2367 } 2368 eptr += length; 2369 } 2370 while (eptr >= pp) 2371 { 2372 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); 2373 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2374 eptr -= length; 2375 } 2376 MRRETURN(MATCH_NOMATCH); 2377 } 2378 } 2379 /* Control never gets here */ 2380 2381 /* Match a bit-mapped character class, possibly repeatedly. This op code is 2382 used when all the characters in the class have values in the range 0-255, 2383 and either the matching is caseful, or the characters are in the range 2384 0-127 when UTF-8 processing is enabled. The only difference between 2385 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 2386 encountered. 2387 2388 First, look past the end of the item to see if there is repeat information 2389 following. Then obey similar code to character type repeats - written out 2390 again for speed. 
*/ 2391 2392 case OP_NCLASS: 2393 case OP_CLASS: 2394 { 2395 data = ecode + 1; /* Save for matching */ 2396 ecode += 33; /* Advance past the item */ 2397 2398 switch (*ecode) 2399 { 2400 case OP_CRSTAR: 2401 case OP_CRMINSTAR: 2402 case OP_CRPLUS: 2403 case OP_CRMINPLUS: 2404 case OP_CRQUERY: 2405 case OP_CRMINQUERY: 2406 c = *ecode++ - OP_CRSTAR; 2407 minimize = (c & 1) != 0; 2408 min = rep_min[c]; /* Pick up values from tables; */ 2409 max = rep_max[c]; /* zero for max => infinity */ 2410 if (max == 0) max = INT_MAX; 2411 break; 2412 2413 case OP_CRRANGE: 2414 case OP_CRMINRANGE: 2415 minimize = (*ecode == OP_CRMINRANGE); 2416 min = GET2(ecode, 1); 2417 max = GET2(ecode, 3); 2418 if (max == 0) max = INT_MAX; 2419 ecode += 5; 2420 break; 2421 2422 default: /* No repeat follows */ 2423 min = max = 1; 2424 break; 2425 } 2426 2427 /* First, ensure the minimum number of matches are present. */ 2428 2429 #ifdef SUPPORT_UTF8 2430 /* UTF-8 mode */ 2431 if (utf8) 2432 { 2433 for (i = 1; i <= min; i++) 2434 { 2435 if (eptr >= md->end_subject) 2436 { 2437 SCHECK_PARTIAL(); 2438 MRRETURN(MATCH_NOMATCH); 2439 } 2440 GETCHARINC(c, eptr); 2441 if (c > 255) 2442 { 2443 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); 2444 } 2445 else 2446 { 2447 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); 2448 } 2449 } 2450 } 2451 else 2452 #endif 2453 /* Not UTF-8 mode */ 2454 { 2455 for (i = 1; i <= min; i++) 2456 { 2457 if (eptr >= md->end_subject) 2458 { 2459 SCHECK_PARTIAL(); 2460 MRRETURN(MATCH_NOMATCH); 2461 } 2462 c = *eptr++; 2463 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); 2464 } 2465 } 2466 2467 /* If max == min we can continue with the main loop without the 2468 need to recurse. */ 2469 2470 if (min == max) continue; 2471 2472 /* If minimizing, keep testing the rest of the expression and advancing 2473 the pointer while it matches the class. 
*/ 2474 2475 if (minimize) 2476 { 2477 #ifdef SUPPORT_UTF8 2478 /* UTF-8 mode */ 2479 if (utf8) 2480 { 2481 for (fi = min;; fi++) 2482 { 2483 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); 2484 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2485 if (fi >= max) MRRETURN(MATCH_NOMATCH); 2486 if (eptr >= md->end_subject) 2487 { 2488 SCHECK_PARTIAL(); 2489 MRRETURN(MATCH_NOMATCH); 2490 } 2491 GETCHARINC(c, eptr); 2492 if (c > 255) 2493 { 2494 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH); 2495 } 2496 else 2497 { 2498 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); 2499 } 2500 } 2501 } 2502 else 2503 #endif 2504 /* Not UTF-8 mode */ 2505 { 2506 for (fi = min;; fi++) 2507 { 2508 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); 2509 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2510 if (fi >= max) MRRETURN(MATCH_NOMATCH); 2511 if (eptr >= md->end_subject) 2512 { 2513 SCHECK_PARTIAL(); 2514 MRRETURN(MATCH_NOMATCH); 2515 } 2516 c = *eptr++; 2517 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH); 2518 } 2519 } 2520 /* Control never gets here */ 2521 } 2522 2523 /* If maximizing, find the longest possible run, then work backwards. 
*/ 2524 2525 else 2526 { 2527 pp = eptr; 2528 2529 #ifdef SUPPORT_UTF8 2530 /* UTF-8 mode */ 2531 if (utf8) 2532 { 2533 for (i = min; i < max; i++) 2534 { 2535 int len = 1; 2536 if (eptr >= md->end_subject) 2537 { 2538 SCHECK_PARTIAL(); 2539 break; 2540 } 2541 GETCHARLEN(c, eptr, len); 2542 if (c > 255) 2543 { 2544 if (op == OP_CLASS) break; 2545 } 2546 else 2547 { 2548 if ((data[c/8] & (1 << (c&7))) == 0) break; 2549 } 2550 eptr += len; 2551 } 2552 for (;;) 2553 { 2554 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18); 2555 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2556 if (eptr-- == pp) break; /* Stop if tried at original pos */ 2557 BACKCHAR(eptr); 2558 } 2559 } 2560 else 2561 #endif 2562 /* Not UTF-8 mode */ 2563 { 2564 for (i = min; i < max; i++) 2565 { 2566 if (eptr >= md->end_subject) 2567 { 2568 SCHECK_PARTIAL(); 2569 break; 2570 } 2571 c = *eptr; 2572 if ((data[c/8] & (1 << (c&7))) == 0) break; 2573 eptr++; 2574 } 2575 while (eptr >= pp) 2576 { 2577 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19); 2578 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2579 eptr--; 2580 } 2581 } 2582 2583 MRRETURN(MATCH_NOMATCH); 2584 } 2585 } 2586 /* Control never gets here */ 2587 2588 2589 /* Match an extended character class. This opcode is encountered only 2590 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8 2591 mode, because Unicode properties are supported in non-UTF-8 mode. 
*/ 2592 2593 #ifdef SUPPORT_UTF8 2594 case OP_XCLASS: 2595 { 2596 data = ecode + 1 + LINK_SIZE; /* Save for matching */ 2597 ecode += GET(ecode, 1); /* Advance past the item */ 2598 2599 switch (*ecode) 2600 { 2601 case OP_CRSTAR: 2602 case OP_CRMINSTAR: 2603 case OP_CRPLUS: 2604 case OP_CRMINPLUS: 2605 case OP_CRQUERY: 2606 case OP_CRMINQUERY: 2607 c = *ecode++ - OP_CRSTAR; 2608 minimize = (c & 1) != 0; 2609 min = rep_min[c]; /* Pick up values from tables; */ 2610 max = rep_max[c]; /* zero for max => infinity */ 2611 if (max == 0) max = INT_MAX; 2612 break; 2613 2614 case OP_CRRANGE: 2615 case OP_CRMINRANGE: 2616 minimize = (*ecode == OP_CRMINRANGE); 2617 min = GET2(ecode, 1); 2618 max = GET2(ecode, 3); 2619 if (max == 0) max = INT_MAX; 2620 ecode += 5; 2621 break; 2622 2623 default: /* No repeat follows */ 2624 min = max = 1; 2625 break; 2626 } 2627 2628 /* First, ensure the minimum number of matches are present. */ 2629 2630 for (i = 1; i <= min; i++) 2631 { 2632 if (eptr >= md->end_subject) 2633 { 2634 SCHECK_PARTIAL(); 2635 MRRETURN(MATCH_NOMATCH); 2636 } 2637 GETCHARINCTEST(c, eptr); 2638 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH); 2639 } 2640 2641 /* If max == min we can continue with the main loop without the 2642 need to recurse. */ 2643 2644 if (min == max) continue; 2645 2646 /* If minimizing, keep testing the rest of the expression and advancing 2647 the pointer while it matches the class. */ 2648 2649 if (minimize) 2650 { 2651 for (fi = min;; fi++) 2652 { 2653 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); 2654 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2655 if (fi >= max) MRRETURN(MATCH_NOMATCH); 2656 if (eptr >= md->end_subject) 2657 { 2658 SCHECK_PARTIAL(); 2659 MRRETURN(MATCH_NOMATCH); 2660 } 2661 GETCHARINCTEST(c, eptr); 2662 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH); 2663 } 2664 /* Control never gets here */ 2665 } 2666 2667 /* If maximizing, find the longest possible run, then work backwards. 
*/ 2668 2669 else 2670 { 2671 pp = eptr; 2672 for (i = min; i < max; i++) 2673 { 2674 int len = 1; 2675 if (eptr >= md->end_subject) 2676 { 2677 SCHECK_PARTIAL(); 2678 break; 2679 } 2680 GETCHARLENTEST(c, eptr, len); 2681 if (!_pcre_xclass(c, data)) break; 2682 eptr += len; 2683 } 2684 for(;;) 2685 { 2686 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21); 2687 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2688 if (eptr-- == pp) break; /* Stop if tried at original pos */ 2689 if (utf8) BACKCHAR(eptr); 2690 } 2691 MRRETURN(MATCH_NOMATCH); 2692 } 2693 2694 /* Control never gets here */ 2695 } 2696 #endif /* End of XCLASS */ 2697 2698 /* Match a single character, casefully */ 2699 2700 case OP_CHAR: 2701 #ifdef SUPPORT_UTF8 2702 if (utf8) 2703 { 2704 length = 1; 2705 ecode++; 2706 GETCHARLEN(fc, ecode, length); 2707 if (length > md->end_subject - eptr) 2708 { 2709 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 2710 MRRETURN(MATCH_NOMATCH); 2711 } 2712 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH); 2713 } 2714 else 2715 #endif 2716 2717 /* Non-UTF-8 mode */ 2718 { 2719 if (md->end_subject - eptr < 1) 2720 { 2721 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 2722 MRRETURN(MATCH_NOMATCH); 2723 } 2724 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH); 2725 ecode += 2; 2726 } 2727 break; 2728 2729 /* Match a single character, caselessly */ 2730 2731 case OP_CHARNC: 2732 #ifdef SUPPORT_UTF8 2733 if (utf8) 2734 { 2735 length = 1; 2736 ecode++; 2737 GETCHARLEN(fc, ecode, length); 2738 2739 if (length > md->end_subject - eptr) 2740 { 2741 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 2742 MRRETURN(MATCH_NOMATCH); 2743 } 2744 2745 /* If the pattern character's value is < 128, we have only one byte, and 2746 can use the fast lookup table. 
*/ 2747 2748 if (fc < 128) 2749 { 2750 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); 2751 } 2752 2753 /* Otherwise we must pick up the subject character */ 2754 2755 else 2756 { 2757 unsigned int dc; 2758 GETCHARINC(dc, eptr); 2759 ecode += length; 2760 2761 /* If we have Unicode property support, we can use it to test the other 2762 case of the character, if there is one. */ 2763 2764 if (fc != dc) 2765 { 2766 #ifdef SUPPORT_UCP 2767 if (dc != UCD_OTHERCASE(fc)) 2768 #endif 2769 MRRETURN(MATCH_NOMATCH); 2770 } 2771 } 2772 } 2773 else 2774 #endif /* SUPPORT_UTF8 */ 2775 2776 /* Non-UTF-8 mode */ 2777 { 2778 if (md->end_subject - eptr < 1) 2779 { 2780 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 2781 MRRETURN(MATCH_NOMATCH); 2782 } 2783 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); 2784 ecode += 2; 2785 } 2786 break; 2787 2788 /* Match a single character repeatedly. */ 2789 2790 case OP_EXACT: 2791 min = max = GET2(ecode, 1); 2792 ecode += 3; 2793 goto REPEATCHAR; 2794 2795 case OP_POSUPTO: 2796 possessive = TRUE; 2797 /* Fall through */ 2798 2799 case OP_UPTO: 2800 case OP_MINUPTO: 2801 min = 0; 2802 max = GET2(ecode, 1); 2803 minimize = *ecode == OP_MINUPTO; 2804 ecode += 3; 2805 goto REPEATCHAR; 2806 2807 case OP_POSSTAR: 2808 possessive = TRUE; 2809 min = 0; 2810 max = INT_MAX; 2811 ecode++; 2812 goto REPEATCHAR; 2813 2814 case OP_POSPLUS: 2815 possessive = TRUE; 2816 min = 1; 2817 max = INT_MAX; 2818 ecode++; 2819 goto REPEATCHAR; 2820 2821 case OP_POSQUERY: 2822 possessive = TRUE; 2823 min = 0; 2824 max = 1; 2825 ecode++; 2826 goto REPEATCHAR; 2827 2828 case OP_STAR: 2829 case OP_MINSTAR: 2830 case OP_PLUS: 2831 case OP_MINPLUS: 2832 case OP_QUERY: 2833 case OP_MINQUERY: 2834 c = *ecode++ - OP_STAR; 2835 minimize = (c & 1) != 0; 2836 2837 min = rep_min[c]; /* Pick up values from tables; */ 2838 max = rep_max[c]; /* zero for max => infinity */ 2839 if (max == 0) max = INT_MAX; 2840 2841 /* Common code for 
all repeated single-character matches. */ 2842 2843 REPEATCHAR: 2844 #ifdef SUPPORT_UTF8 2845 if (utf8) 2846 { 2847 length = 1; 2848 charptr = ecode; 2849 GETCHARLEN(fc, ecode, length); 2850 ecode += length; 2851 2852 /* Handle multibyte character matching specially here. There is 2853 support for caseless matching if UCP support is present. */ 2854 2855 if (length > 1) 2856 { 2857 #ifdef SUPPORT_UCP 2858 unsigned int othercase; 2859 if ((ims & PCRE_CASELESS) != 0 && 2860 (othercase = UCD_OTHERCASE(fc)) != fc) 2861 oclength = _pcre_ord2utf8(othercase, occhars); 2862 else oclength = 0; 2863 #endif /* SUPPORT_UCP */ 2864 2865 for (i = 1; i <= min; i++) 2866 { 2867 if (eptr <= md->end_subject - length && 2868 memcmp(eptr, charptr, length) == 0) eptr += length; 2869 #ifdef SUPPORT_UCP 2870 else if (oclength > 0 && 2871 eptr <= md->end_subject - oclength && 2872 memcmp(eptr, occhars, oclength) == 0) eptr += oclength; 2873 #endif /* SUPPORT_UCP */ 2874 else 2875 { 2876 CHECK_PARTIAL(); 2877 MRRETURN(MATCH_NOMATCH); 2878 } 2879 } 2880 2881 if (min == max) continue; 2882 2883 if (minimize) 2884 { 2885 for (fi = min;; fi++) 2886 { 2887 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); 2888 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2889 if (fi >= max) MRRETURN(MATCH_NOMATCH); 2890 if (eptr <= md->end_subject - length && 2891 memcmp(eptr, charptr, length) == 0) eptr += length; 2892 #ifdef SUPPORT_UCP 2893 else if (oclength > 0 && 2894 eptr <= md->end_subject - oclength && 2895 memcmp(eptr, occhars, oclength) == 0) eptr += oclength; 2896 #endif /* SUPPORT_UCP */ 2897 else 2898 { 2899 CHECK_PARTIAL(); 2900 MRRETURN(MATCH_NOMATCH); 2901 } 2902 } 2903 /* Control never gets here */ 2904 } 2905 2906 else /* Maximize */ 2907 { 2908 pp = eptr; 2909 for (i = min; i < max; i++) 2910 { 2911 if (eptr <= md->end_subject - length && 2912 memcmp(eptr, charptr, length) == 0) eptr += length; 2913 #ifdef SUPPORT_UCP 2914 else if (oclength > 0 && 2915 eptr <= md->end_subject - oclength && 
2916 memcmp(eptr, occhars, oclength) == 0) eptr += oclength; 2917 #endif /* SUPPORT_UCP */ 2918 else 2919 { 2920 CHECK_PARTIAL(); 2921 break; 2922 } 2923 } 2924 2925 if (possessive) continue; 2926 2927 for(;;) 2928 { 2929 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); 2930 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2931 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); } 2932 #ifdef SUPPORT_UCP 2933 eptr--; 2934 BACKCHAR(eptr); 2935 #else /* without SUPPORT_UCP */ 2936 eptr -= length; 2937 #endif /* SUPPORT_UCP */ 2938 } 2939 } 2940 /* Control never gets here */ 2941 } 2942 2943 /* If the length of a UTF-8 character is 1, we fall through here, and 2944 obey the code as for non-UTF-8 characters below, though in this case the 2945 value of fc will always be < 128. */ 2946 } 2947 else 2948 #endif /* SUPPORT_UTF8 */ 2949 2950 /* When not in UTF-8 mode, load a single-byte character. */ 2951 2952 fc = *ecode++; 2953 2954 /* The value of fc at this point is always less than 256, though we may or 2955 may not be in UTF-8 mode. The code is duplicated for the caseless and 2956 caseful cases, for speed, since matching characters is likely to be quite 2957 common. First, ensure the minimum number of matches are present. If min = 2958 max, continue at the same level without recursing. Otherwise, if 2959 minimizing, keep trying the rest of the expression and advancing one 2960 matching character if failing, up to the maximum. Alternatively, if 2961 maximizing, find the maximum number of characters and work backwards. 
*/ 2962 2963 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 2964 max, eptr)); 2965 2966 if ((ims & PCRE_CASELESS) != 0) 2967 { 2968 fc = md->lcc[fc]; 2969 for (i = 1; i <= min; i++) 2970 { 2971 if (eptr >= md->end_subject) 2972 { 2973 SCHECK_PARTIAL(); 2974 MRRETURN(MATCH_NOMATCH); 2975 } 2976 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); 2977 } 2978 if (min == max) continue; 2979 if (minimize) 2980 { 2981 for (fi = min;; fi++) 2982 { 2983 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); 2984 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2985 if (fi >= max) MRRETURN(MATCH_NOMATCH); 2986 if (eptr >= md->end_subject) 2987 { 2988 SCHECK_PARTIAL(); 2989 MRRETURN(MATCH_NOMATCH); 2990 } 2991 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); 2992 } 2993 /* Control never gets here */ 2994 } 2995 else /* Maximize */ 2996 { 2997 pp = eptr; 2998 for (i = min; i < max; i++) 2999 { 3000 if (eptr >= md->end_subject) 3001 { 3002 SCHECK_PARTIAL(); 3003 break; 3004 } 3005 if (fc != md->lcc[*eptr]) break; 3006 eptr++; 3007 } 3008 3009 if (possessive) continue; 3010 3011 while (eptr >= pp) 3012 { 3013 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25); 3014 eptr--; 3015 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3016 } 3017 MRRETURN(MATCH_NOMATCH); 3018 } 3019 /* Control never gets here */ 3020 } 3021 3022 /* Caseful comparisons (includes all multi-byte characters) */ 3023 3024 else 3025 { 3026 for (i = 1; i <= min; i++) 3027 { 3028 if (eptr >= md->end_subject) 3029 { 3030 SCHECK_PARTIAL(); 3031 MRRETURN(MATCH_NOMATCH); 3032 } 3033 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH); 3034 } 3035 3036 if (min == max) continue; 3037 3038 if (minimize) 3039 { 3040 for (fi = min;; fi++) 3041 { 3042 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); 3043 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3044 if (fi >= max) MRRETURN(MATCH_NOMATCH); 3045 if (eptr >= md->end_subject) 3046 { 3047 SCHECK_PARTIAL(); 3048 MRRETURN(MATCH_NOMATCH); 3049 } 3050 if (fc != 
*eptr++) MRRETURN(MATCH_NOMATCH); 3051 } 3052 /* Control never gets here */ 3053 } 3054 else /* Maximize */ 3055 { 3056 pp = eptr; 3057 for (i = min; i < max; i++) 3058 { 3059 if (eptr >= md->end_subject) 3060 { 3061 SCHECK_PARTIAL(); 3062 break; 3063 } 3064 if (fc != *eptr) break; 3065 eptr++; 3066 } 3067 if (possessive) continue; 3068 3069 while (eptr >= pp) 3070 { 3071 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27); 3072 eptr--; 3073 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3074 } 3075 MRRETURN(MATCH_NOMATCH); 3076 } 3077 } 3078 /* Control never gets here */ 3079 3080 /* Match a negated single one-byte character. The character we are 3081 checking can be multibyte. */ 3082 3083 case OP_NOT: 3084 if (eptr >= md->end_subject) 3085 { 3086 SCHECK_PARTIAL(); 3087 MRRETURN(MATCH_NOMATCH); 3088 } 3089 ecode++; 3090 GETCHARINCTEST(c, eptr); 3091 if ((ims & PCRE_CASELESS) != 0) 3092 { 3093 #ifdef SUPPORT_UTF8 3094 if (c < 256) 3095 #endif 3096 c = md->lcc[c]; 3097 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH); 3098 } 3099 else 3100 { 3101 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH); 3102 } 3103 break; 3104 3105 /* Match a negated single one-byte character repeatedly. This is almost a 3106 repeat of the code for a repeated single character, but I haven't found a 3107 nice way of commoning these up that doesn't require a test of the 3108 positive/negative option for each character match. Maybe that wouldn't add 3109 very much to the time taken, but character matching *is* what this is all 3110 about... 
*/ 3111 3112 case OP_NOTEXACT: 3113 min = max = GET2(ecode, 1); 3114 ecode += 3; 3115 goto REPEATNOTCHAR; 3116 3117 case OP_NOTUPTO: 3118 case OP_NOTMINUPTO: 3119 min = 0; 3120 max = GET2(ecode, 1); 3121 minimize = *ecode == OP_NOTMINUPTO; 3122 ecode += 3; 3123 goto REPEATNOTCHAR; 3124 3125 case OP_NOTPOSSTAR: 3126 possessive = TRUE; 3127 min = 0; 3128 max = INT_MAX; 3129 ecode++; 3130 goto REPEATNOTCHAR; 3131 3132 case OP_NOTPOSPLUS: 3133 possessive = TRUE; 3134 min = 1; 3135 max = INT_MAX; 3136 ecode++; 3137 goto REPEATNOTCHAR; 3138 3139 case OP_NOTPOSQUERY: 3140 possessive = TRUE; 3141 min = 0; 3142 max = 1; 3143 ecode++; 3144 goto REPEATNOTCHAR; 3145 3146 case OP_NOTPOSUPTO: 3147 possessive = TRUE; 3148 min = 0; 3149 max = GET2(ecode, 1); 3150 ecode += 3; 3151 goto REPEATNOTCHAR; 3152 3153 case OP_NOTSTAR: 3154 case OP_NOTMINSTAR: 3155 case OP_NOTPLUS: 3156 case OP_NOTMINPLUS: 3157 case OP_NOTQUERY: 3158 case OP_NOTMINQUERY: 3159 c = *ecode++ - OP_NOTSTAR; 3160 minimize = (c & 1) != 0; 3161 min = rep_min[c]; /* Pick up values from tables; */ 3162 max = rep_max[c]; /* zero for max => infinity */ 3163 if (max == 0) max = INT_MAX; 3164 3165 /* Common code for all repeated single-byte matches. */ 3166 3167 REPEATNOTCHAR: 3168 fc = *ecode++; 3169 3170 /* The code is duplicated for the caseless and caseful cases, for speed, 3171 since matching characters is likely to be quite common. First, ensure the 3172 minimum number of matches are present. If min = max, continue at the same 3173 level without recursing. Otherwise, if minimizing, keep trying the rest of 3174 the expression and advancing one matching character if failing, up to the 3175 maximum. Alternatively, if maximizing, find the maximum number of 3176 characters and work backwards. 
*/ 3177 3178 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3179 max, eptr)); 3180 3181 if ((ims & PCRE_CASELESS) != 0) 3182 { 3183 fc = md->lcc[fc]; 3184 3185 #ifdef SUPPORT_UTF8 3186 /* UTF-8 mode */ 3187 if (utf8) 3188 { 3189 register unsigned int d; 3190 for (i = 1; i <= min; i++) 3191 { 3192 if (eptr >= md->end_subject) 3193 { 3194 SCHECK_PARTIAL(); 3195 MRRETURN(MATCH_NOMATCH); 3196 } 3197 GETCHARINC(d, eptr); 3198 if (d < 256) d = md->lcc[d]; 3199 if (fc == d) MRRETURN(MATCH_NOMATCH); 3200 } 3201 } 3202 else 3203 #endif 3204 3205 /* Not UTF-8 mode */ 3206 { 3207 for (i = 1; i <= min; i++) 3208 { 3209 if (eptr >= md->end_subject) 3210 { 3211 SCHECK_PARTIAL(); 3212 MRRETURN(MATCH_NOMATCH); 3213 } 3214 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); 3215 } 3216 } 3217 3218 if (min == max) continue; 3219 3220 if (minimize) 3221 { 3222 #ifdef SUPPORT_UTF8 3223 /* UTF-8 mode */ 3224 if (utf8) 3225 { 3226 register unsigned int d; 3227 for (fi = min;; fi++) 3228 { 3229 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); 3230 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3231 if (fi >= max) MRRETURN(MATCH_NOMATCH); 3232 if (eptr >= md->end_subject) 3233 { 3234 SCHECK_PARTIAL(); 3235 MRRETURN(MATCH_NOMATCH); 3236 } 3237 GETCHARINC(d, eptr); 3238 if (d < 256) d = md->lcc[d]; 3239 if (fc == d) MRRETURN(MATCH_NOMATCH); 3240 } 3241 } 3242 else 3243 #endif 3244 /* Not UTF-8 mode */ 3245 { 3246 for (fi = min;; fi++) 3247 { 3248 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); 3249 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3250 if (fi >= max) MRRETURN(MATCH_NOMATCH); 3251 if (eptr >= md->end_subject) 3252 { 3253 SCHECK_PARTIAL(); 3254 MRRETURN(MATCH_NOMATCH); 3255 } 3256 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH); 3257 } 3258 } 3259 /* Control never gets here */ 3260 } 3261 3262 /* Maximize case */ 3263 3264 else 3265 { 3266 pp = eptr; 3267 3268 #ifdef SUPPORT_UTF8 3269 /* UTF-8 mode */ 3270 if (utf8) 3271 { 3272 register 
unsigned int d; 3273 for (i = min; i < max; i++) 3274 { 3275 int len = 1; 3276 if (eptr >= md->end_subject) 3277 { 3278 SCHECK_PARTIAL(); 3279 break; 3280 } 3281 GETCHARLEN(d, eptr, len); 3282 if (d < 256) d = md->lcc[d]; 3283 if (fc == d) break; 3284 eptr += len; 3285 } 3286 if (possessive) continue; 3287 for(;;) 3288 { 3289 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30); 3290 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3291 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3292 BACKCHAR(eptr); 3293 } 3294 } 3295 else 3296 #endif 3297 /* Not UTF-8 mode */ 3298 { 3299 for (i = min; i < max; i++) 3300 { 3301 if (eptr >= md->end_subject) 3302 { 3303 SCHECK_PARTIAL(); 3304 break; 3305 } 3306 if (fc == md->lcc[*eptr]) break; 3307 eptr++; 3308 } 3309 if (possessive) continue; 3310 while (eptr >= pp) 3311 { 3312 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31); 3313 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3314 eptr--; 3315 } 3316 } 3317 3318 MRRETURN(MATCH_NOMATCH); 3319 } 3320 /* Control never gets here */ 3321 } 3322 3323 /* Caseful comparisons */ 3324 3325 else 3326 { 3327 #ifdef SUPPORT_UTF8 3328 /* UTF-8 mode */ 3329 if (utf8) 3330 { 3331 register unsigned int d; 3332 for (i = 1; i <= min; i++) 3333 { 3334 if (eptr >= md->end_subject) 3335 { 3336 SCHECK_PARTIAL(); 3337 MRRETURN(MATCH_NOMATCH); 3338 } 3339 GETCHARINC(d, eptr); 3340 if (fc == d) MRRETURN(MATCH_NOMATCH); 3341 } 3342 } 3343 else 3344 #endif 3345 /* Not UTF-8 mode */ 3346 { 3347 for (i = 1; i <= min; i++) 3348 { 3349 if (eptr >= md->end_subject) 3350 { 3351 SCHECK_PARTIAL(); 3352 MRRETURN(MATCH_NOMATCH); 3353 } 3354 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH); 3355 } 3356 } 3357 3358 if (min == max) continue; 3359 3360 if (minimize) 3361 { 3362 #ifdef SUPPORT_UTF8 3363 /* UTF-8 mode */ 3364 if (utf8) 3365 { 3366 register unsigned int d; 3367 for (fi = min;; fi++) 3368 { 3369 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); 3370 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3371 
if (fi >= max) MRRETURN(MATCH_NOMATCH); 3372 if (eptr >= md->end_subject) 3373 { 3374 SCHECK_PARTIAL(); 3375 MRRETURN(MATCH_NOMATCH); 3376 } 3377 GETCHARINC(d, eptr); 3378 if (fc == d) MRRETURN(MATCH_NOMATCH); 3379 } 3380 } 3381 else 3382 #endif 3383 /* Not UTF-8 mode */ 3384 { 3385 for (fi = min;; fi++) 3386 { 3387 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); 3388 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3389 if (fi >= max) MRRETURN(MATCH_NOMATCH); 3390 if (eptr >= md->end_subject) 3391 { 3392 SCHECK_PARTIAL(); 3393 MRRETURN(MATCH_NOMATCH); 3394 } 3395 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH); 3396 } 3397 } 3398 /* Control never gets here */ 3399 } 3400 3401 /* Maximize case */ 3402 3403 else 3404 { 3405 pp = eptr; 3406 3407 #ifdef SUPPORT_UTF8 3408 /* UTF-8 mode */ 3409 if (utf8) 3410 { 3411 register unsigned int d; 3412 for (i = min; i < max; i++) 3413 { 3414 int len = 1; 3415 if (eptr >= md->end_subject) 3416 { 3417 SCHECK_PARTIAL(); 3418 break; 3419 } 3420 GETCHARLEN(d, eptr, len); 3421 if (fc == d) break; 3422 eptr += len; 3423 } 3424 if (possessive) continue; 3425 for(;;) 3426 { 3427 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34); 3428 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3429 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3430 BACKCHAR(eptr); 3431 } 3432 } 3433 else 3434 #endif 3435 /* Not UTF-8 mode */ 3436 { 3437 for (i = min; i < max; i++) 3438 { 3439 if (eptr >= md->end_subject) 3440 { 3441 SCHECK_PARTIAL(); 3442 break; 3443 } 3444 if (fc == *eptr) break; 3445 eptr++; 3446 } 3447 if (possessive) continue; 3448 while (eptr >= pp) 3449 { 3450 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35); 3451 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3452 eptr--; 3453 } 3454 } 3455 3456 MRRETURN(MATCH_NOMATCH); 3457 } 3458 } 3459 /* Control never gets here */ 3460 3461 /* Match a single character type repeatedly; several different opcodes 3462 share code. 
This is very similar to the code for single characters, but we 3463 repeat it in the interests of efficiency. */ 3464 3465 case OP_TYPEEXACT: 3466 min = max = GET2(ecode, 1); 3467 minimize = TRUE; 3468 ecode += 3; 3469 goto REPEATTYPE; 3470 3471 case OP_TYPEUPTO: 3472 case OP_TYPEMINUPTO: 3473 min = 0; 3474 max = GET2(ecode, 1); 3475 minimize = *ecode == OP_TYPEMINUPTO; 3476 ecode += 3; 3477 goto REPEATTYPE; 3478 3479 case OP_TYPEPOSSTAR: 3480 possessive = TRUE; 3481 min = 0; 3482 max = INT_MAX; 3483 ecode++; 3484 goto REPEATTYPE; 3485 3486 case OP_TYPEPOSPLUS: 3487 possessive = TRUE; 3488 min = 1; 3489 max = INT_MAX; 3490 ecode++; 3491 goto REPEATTYPE; 3492 3493 case OP_TYPEPOSQUERY: 3494 possessive = TRUE; 3495 min = 0; 3496 max = 1; 3497 ecode++; 3498 goto REPEATTYPE; 3499 3500 case OP_TYPEPOSUPTO: 3501 possessive = TRUE; 3502 min = 0; 3503 max = GET2(ecode, 1); 3504 ecode += 3; 3505 goto REPEATTYPE; 3506 3507 case OP_TYPESTAR: 3508 case OP_TYPEMINSTAR: 3509 case OP_TYPEPLUS: 3510 case OP_TYPEMINPLUS: 3511 case OP_TYPEQUERY: 3512 case OP_TYPEMINQUERY: 3513 c = *ecode++ - OP_TYPESTAR; 3514 minimize = (c & 1) != 0; 3515 min = rep_min[c]; /* Pick up values from tables; */ 3516 max = rep_max[c]; /* zero for max => infinity */ 3517 if (max == 0) max = INT_MAX; 3518 3519 /* Common code for all repeated single character type matches. Note that 3520 in UTF-8 mode, '.' matches a character of any length, but for the other 3521 character types, the valid characters are all one-byte long. */ 3522 3523 REPEATTYPE: 3524 ctype = *ecode++; /* Code for the character type */ 3525 3526 #ifdef SUPPORT_UCP 3527 if (ctype == OP_PROP || ctype == OP_NOTPROP) 3528 { 3529 prop_fail_result = ctype == OP_NOTPROP; 3530 prop_type = *ecode++; 3531 prop_value = *ecode++; 3532 } 3533 else prop_type = -1; 3534 #endif 3535 3536 /* First, ensure the minimum number of matches are present. Use inline 3537 code for maximizing the speed, and do the type test once at the start 3538 (i.e. 
keep it out of the loop). Separate the UTF-8 code completely as that 3539 is tidier. Also separate the UCP code, which can be the same for both UTF-8 3540 and single-bytes. */ 3541 3542 if (min > 0) 3543 { 3544 #ifdef SUPPORT_UCP 3545 if (prop_type >= 0) 3546 { 3547 switch(prop_type) 3548 { 3549 case PT_ANY: 3550 if (prop_fail_result) MRRETURN(MATCH_NOMATCH); 3551 for (i = 1; i <= min; i++) 3552 { 3553 if (eptr >= md->end_subject) 3554 { 3555 SCHECK_PARTIAL(); 3556 MRRETURN(MATCH_NOMATCH); 3557 } 3558 GETCHARINCTEST(c, eptr); 3559 } 3560 break; 3561 3562 case PT_LAMP: 3563 for (i = 1; i <= min; i++) 3564 { 3565 if (eptr >= md->end_subject) 3566 { 3567 SCHECK_PARTIAL(); 3568 MRRETURN(MATCH_NOMATCH); 3569 } 3570 GETCHARINCTEST(c, eptr); 3571 prop_chartype = UCD_CHARTYPE(c); 3572 if ((prop_chartype == ucp_Lu || 3573 prop_chartype == ucp_Ll || 3574 prop_chartype == ucp_Lt) == prop_fail_result) 3575 MRRETURN(MATCH_NOMATCH); 3576 } 3577 break; 3578 3579 case PT_GC: 3580 for (i = 1; i <= min; i++) 3581 { 3582 if (eptr >= md->end_subject) 3583 { 3584 SCHECK_PARTIAL(); 3585 MRRETURN(MATCH_NOMATCH); 3586 } 3587 GETCHARINCTEST(c, eptr); 3588 prop_category = UCD_CATEGORY(c); 3589 if ((prop_category == prop_value) == prop_fail_result) 3590 MRRETURN(MATCH_NOMATCH); 3591 } 3592 break; 3593 3594 case PT_PC: 3595 for (i = 1; i <= min; i++) 3596 { 3597 if (eptr >= md->end_subject) 3598 { 3599 SCHECK_PARTIAL(); 3600 MRRETURN(MATCH_NOMATCH); 3601 } 3602 GETCHARINCTEST(c, eptr); 3603 prop_chartype = UCD_CHARTYPE(c); 3604 if ((prop_chartype == prop_value) == prop_fail_result) 3605 MRRETURN(MATCH_NOMATCH); 3606 } 3607 break; 3608 3609 case PT_SC: 3610 for (i = 1; i <= min; i++) 3611 { 3612 if (eptr >= md->end_subject) 3613 { 3614 SCHECK_PARTIAL(); 3615 MRRETURN(MATCH_NOMATCH); 3616 } 3617 GETCHARINCTEST(c, eptr); 3618 prop_script = UCD_SCRIPT(c); 3619 if ((prop_script == prop_value) == prop_fail_result) 3620 MRRETURN(MATCH_NOMATCH); 3621 } 3622 break; 3623 3624 case PT_ALNUM: 3625 for (i 
= 1; i <= min; i++) 3626 { 3627 if (eptr >= md->end_subject) 3628 { 3629 SCHECK_PARTIAL(); 3630 MRRETURN(MATCH_NOMATCH); 3631 } 3632 GETCHARINCTEST(c, eptr); 3633 prop_category = UCD_CATEGORY(c); 3634 if ((prop_category == ucp_L || prop_category == ucp_N) 3635 == prop_fail_result) 3636 MRRETURN(MATCH_NOMATCH); 3637 } 3638 break; 3639 3640 case PT_SPACE: /* Perl space */ 3641 for (i = 1; i <= min; i++) 3642 { 3643 if (eptr >= md->end_subject) 3644 { 3645 SCHECK_PARTIAL(); 3646 MRRETURN(MATCH_NOMATCH); 3647 } 3648 GETCHARINCTEST(c, eptr); 3649 prop_category = UCD_CATEGORY(c); 3650 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || 3651 c == CHAR_FF || c == CHAR_CR) 3652 == prop_fail_result) 3653 MRRETURN(MATCH_NOMATCH); 3654 } 3655 break; 3656 3657 case PT_PXSPACE: /* POSIX space */ 3658 for (i = 1; i <= min; i++) 3659 { 3660 if (eptr >= md->end_subject) 3661 { 3662 SCHECK_PARTIAL(); 3663 MRRETURN(MATCH_NOMATCH); 3664 } 3665 GETCHARINCTEST(c, eptr); 3666 prop_category = UCD_CATEGORY(c); 3667 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || 3668 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) 3669 == prop_fail_result) 3670 MRRETURN(MATCH_NOMATCH); 3671 } 3672 break; 3673 3674 case PT_WORD: 3675 for (i = 1; i <= min; i++) 3676 { 3677 if (eptr >= md->end_subject) 3678 { 3679 SCHECK_PARTIAL(); 3680 MRRETURN(MATCH_NOMATCH); 3681 } 3682 GETCHARINCTEST(c, eptr); 3683 prop_category = UCD_CATEGORY(c); 3684 if ((prop_category == ucp_L || prop_category == ucp_N || 3685 c == CHAR_UNDERSCORE) 3686 == prop_fail_result) 3687 MRRETURN(MATCH_NOMATCH); 3688 } 3689 break; 3690 3691 /* This should not occur */ 3692 3693 default: 3694 RRETURN(PCRE_ERROR_INTERNAL); 3695 } 3696 } 3697 3698 /* Match extended Unicode sequences. We will get here only if the 3699 support is in the binary; otherwise a compile-time error occurs. 
*/ 3700 3701 else if (ctype == OP_EXTUNI) 3702 { 3703 for (i = 1; i <= min; i++) 3704 { 3705 if (eptr >= md->end_subject) 3706 { 3707 SCHECK_PARTIAL(); 3708 MRRETURN(MATCH_NOMATCH); 3709 } 3710 GETCHARINCTEST(c, eptr); 3711 prop_category = UCD_CATEGORY(c); 3712 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH); 3713 while (eptr < md->end_subject) 3714 { 3715 int len = 1; 3716 if (!utf8) c = *eptr; 3717 else { GETCHARLEN(c, eptr, len); } 3718 prop_category = UCD_CATEGORY(c); 3719 if (prop_category != ucp_M) break; 3720 eptr += len; 3721 } 3722 } 3723 } 3724 3725 else 3726 #endif /* SUPPORT_UCP */ 3727 3728 /* Handle all other cases when the coding is UTF-8 */ 3729 3730 #ifdef SUPPORT_UTF8 3731 if (utf8) switch(ctype) 3732 { 3733 case OP_ANY: 3734 for (i = 1; i <= min; i++) 3735 { 3736 if (eptr >= md->end_subject) 3737 { 3738 SCHECK_PARTIAL(); 3739 MRRETURN(MATCH_NOMATCH); 3740 } 3741 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); 3742 eptr++; 3743 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 3744 } 3745 break; 3746 3747 case OP_ALLANY: 3748 for (i = 1; i <= min; i++) 3749 { 3750 if (eptr >= md->end_subject) 3751 { 3752 SCHECK_PARTIAL(); 3753 MRRETURN(MATCH_NOMATCH); 3754 } 3755 eptr++; 3756 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 3757 } 3758 break; 3759 3760 case OP_ANYBYTE: 3761 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH); 3762 eptr += min; 3763 break; 3764 3765 case OP_ANYNL: 3766 for (i = 1; i <= min; i++) 3767 { 3768 if (eptr >= md->end_subject) 3769 { 3770 SCHECK_PARTIAL(); 3771 MRRETURN(MATCH_NOMATCH); 3772 } 3773 GETCHARINC(c, eptr); 3774 switch(c) 3775 { 3776 default: MRRETURN(MATCH_NOMATCH); 3777 case 0x000d: 3778 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 3779 break; 3780 3781 case 0x000a: 3782 break; 3783 3784 case 0x000b: 3785 case 0x000c: 3786 case 0x0085: 3787 case 0x2028: 3788 case 0x2029: 3789 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); 3790 break; 3791 } 3792 } 3793 break; 3794 
3795 case OP_NOT_HSPACE: 3796 for (i = 1; i <= min; i++) 3797 { 3798 if (eptr >= md->end_subject) 3799 { 3800 SCHECK_PARTIAL(); 3801 MRRETURN(MATCH_NOMATCH); 3802 } 3803 GETCHARINC(c, eptr); 3804 switch(c) 3805 { 3806 default: break; 3807 case 0x09: /* HT */ 3808 case 0x20: /* SPACE */ 3809 case 0xa0: /* NBSP */ 3810 case 0x1680: /* OGHAM SPACE MARK */ 3811 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 3812 case 0x2000: /* EN QUAD */ 3813 case 0x2001: /* EM QUAD */ 3814 case 0x2002: /* EN SPACE */ 3815 case 0x2003: /* EM SPACE */ 3816 case 0x2004: /* THREE-PER-EM SPACE */ 3817 case 0x2005: /* FOUR-PER-EM SPACE */ 3818 case 0x2006: /* SIX-PER-EM SPACE */ 3819 case 0x2007: /* FIGURE SPACE */ 3820 case 0x2008: /* PUNCTUATION SPACE */ 3821 case 0x2009: /* THIN SPACE */ 3822 case 0x200A: /* HAIR SPACE */ 3823 case 0x202f: /* NARROW NO-BREAK SPACE */ 3824 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 3825 case 0x3000: /* IDEOGRAPHIC SPACE */ 3826 MRRETURN(MATCH_NOMATCH); 3827 } 3828 } 3829 break; 3830 3831 case OP_HSPACE: 3832 for (i = 1; i <= min; i++) 3833 { 3834 if (eptr >= md->end_subject) 3835 { 3836 SCHECK_PARTIAL(); 3837 MRRETURN(MATCH_NOMATCH); 3838 } 3839 GETCHARINC(c, eptr); 3840 switch(c) 3841 { 3842 default: MRRETURN(MATCH_NOMATCH); 3843 case 0x09: /* HT */ 3844 case 0x20: /* SPACE */ 3845 case 0xa0: /* NBSP */ 3846 case 0x1680: /* OGHAM SPACE MARK */ 3847 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 3848 case 0x2000: /* EN QUAD */ 3849 case 0x2001: /* EM QUAD */ 3850 case 0x2002: /* EN SPACE */ 3851 case 0x2003: /* EM SPACE */ 3852 case 0x2004: /* THREE-PER-EM SPACE */ 3853 case 0x2005: /* FOUR-PER-EM SPACE */ 3854 case 0x2006: /* SIX-PER-EM SPACE */ 3855 case 0x2007: /* FIGURE SPACE */ 3856 case 0x2008: /* PUNCTUATION SPACE */ 3857 case 0x2009: /* THIN SPACE */ 3858 case 0x200A: /* HAIR SPACE */ 3859 case 0x202f: /* NARROW NO-BREAK SPACE */ 3860 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 3861 case 0x3000: /* IDEOGRAPHIC SPACE */ 3862 break; 3863 } 3864 } 
3865 break; 3866 3867 case OP_NOT_VSPACE: 3868 for (i = 1; i <= min; i++) 3869 { 3870 if (eptr >= md->end_subject) 3871 { 3872 SCHECK_PARTIAL(); 3873 MRRETURN(MATCH_NOMATCH); 3874 } 3875 GETCHARINC(c, eptr); 3876 switch(c) 3877 { 3878 default: break; 3879 case 0x0a: /* LF */ 3880 case 0x0b: /* VT */ 3881 case 0x0c: /* FF */ 3882 case 0x0d: /* CR */ 3883 case 0x85: /* NEL */ 3884 case 0x2028: /* LINE SEPARATOR */ 3885 case 0x2029: /* PARAGRAPH SEPARATOR */ 3886 MRRETURN(MATCH_NOMATCH); 3887 } 3888 } 3889 break; 3890 3891 case OP_VSPACE: 3892 for (i = 1; i <= min; i++) 3893 { 3894 if (eptr >= md->end_subject) 3895 { 3896 SCHECK_PARTIAL(); 3897 MRRETURN(MATCH_NOMATCH); 3898 } 3899 GETCHARINC(c, eptr); 3900 switch(c) 3901 { 3902 default: MRRETURN(MATCH_NOMATCH); 3903 case 0x0a: /* LF */ 3904 case 0x0b: /* VT */ 3905 case 0x0c: /* FF */ 3906 case 0x0d: /* CR */ 3907 case 0x85: /* NEL */ 3908 case 0x2028: /* LINE SEPARATOR */ 3909 case 0x2029: /* PARAGRAPH SEPARATOR */ 3910 break; 3911 } 3912 } 3913 break; 3914 3915 case OP_NOT_DIGIT: 3916 for (i = 1; i <= min; i++) 3917 { 3918 if (eptr >= md->end_subject) 3919 { 3920 SCHECK_PARTIAL(); 3921 MRRETURN(MATCH_NOMATCH); 3922 } 3923 GETCHARINC(c, eptr); 3924 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) 3925 MRRETURN(MATCH_NOMATCH); 3926 } 3927 break; 3928 3929 case OP_DIGIT: 3930 for (i = 1; i <= min; i++) 3931 { 3932 if (eptr >= md->end_subject) 3933 { 3934 SCHECK_PARTIAL(); 3935 MRRETURN(MATCH_NOMATCH); 3936 } 3937 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) 3938 MRRETURN(MATCH_NOMATCH); 3939 /* No need to skip more bytes - we know it's a 1-byte character */ 3940 } 3941 break; 3942 3943 case OP_NOT_WHITESPACE: 3944 for (i = 1; i <= min; i++) 3945 { 3946 if (eptr >= md->end_subject) 3947 { 3948 SCHECK_PARTIAL(); 3949 MRRETURN(MATCH_NOMATCH); 3950 } 3951 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) 3952 MRRETURN(MATCH_NOMATCH); 3953 while (++eptr < md->end_subject && (*eptr & 0xc0) == 
0x80); 3954 } 3955 break; 3956 3957 case OP_WHITESPACE: 3958 for (i = 1; i <= min; i++) 3959 { 3960 if (eptr >= md->end_subject) 3961 { 3962 SCHECK_PARTIAL(); 3963 MRRETURN(MATCH_NOMATCH); 3964 } 3965 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) 3966 MRRETURN(MATCH_NOMATCH); 3967 /* No need to skip more bytes - we know it's a 1-byte character */ 3968 } 3969 break; 3970 3971 case OP_NOT_WORDCHAR: 3972 for (i = 1; i <= min; i++) 3973 { 3974 if (eptr >= md->end_subject) 3975 { 3976 SCHECK_PARTIAL(); 3977 MRRETURN(MATCH_NOMATCH); 3978 } 3979 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) 3980 MRRETURN(MATCH_NOMATCH); 3981 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); 3982 } 3983 break; 3984 3985 case OP_WORDCHAR: 3986 for (i = 1; i <= min; i++) 3987 { 3988 if (eptr >= md->end_subject) 3989 { 3990 SCHECK_PARTIAL(); 3991 MRRETURN(MATCH_NOMATCH); 3992 } 3993 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) 3994 MRRETURN(MATCH_NOMATCH); 3995 /* No need to skip more bytes - we know it's a 1-byte character */ 3996 } 3997 break; 3998 3999 default: 4000 RRETURN(PCRE_ERROR_INTERNAL); 4001 } /* End switch(ctype) */ 4002 4003 else 4004 #endif /* SUPPORT_UTF8 */ 4005 4006 /* Code for the non-UTF-8 case for minimum matching of operators other 4007 than OP_PROP and OP_NOTPROP. 
*/ 4008 4009 switch(ctype) 4010 { 4011 case OP_ANY: 4012 for (i = 1; i <= min; i++) 4013 { 4014 if (eptr >= md->end_subject) 4015 { 4016 SCHECK_PARTIAL(); 4017 MRRETURN(MATCH_NOMATCH); 4018 } 4019 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); 4020 eptr++; 4021 } 4022 break; 4023 4024 case OP_ALLANY: 4025 if (eptr > md->end_subject - min) 4026 { 4027 SCHECK_PARTIAL(); 4028 MRRETURN(MATCH_NOMATCH); 4029 } 4030 eptr += min; 4031 break; 4032 4033 case OP_ANYBYTE: 4034 if (eptr > md->end_subject - min) 4035 { 4036 SCHECK_PARTIAL(); 4037 MRRETURN(MATCH_NOMATCH); 4038 } 4039 eptr += min; 4040 break; 4041 4042 case OP_ANYNL: 4043 for (i = 1; i <= min; i++) 4044 { 4045 if (eptr >= md->end_subject) 4046 { 4047 SCHECK_PARTIAL(); 4048 MRRETURN(MATCH_NOMATCH); 4049 } 4050 switch(*eptr++) 4051 { 4052 default: MRRETURN(MATCH_NOMATCH); 4053 case 0x000d: 4054 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 4055 break; 4056 case 0x000a: 4057 break; 4058 4059 case 0x000b: 4060 case 0x000c: 4061 case 0x0085: 4062 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); 4063 break; 4064 } 4065 } 4066 break; 4067 4068 case OP_NOT_HSPACE: 4069 for (i = 1; i <= min; i++) 4070 { 4071 if (eptr >= md->end_subject) 4072 { 4073 SCHECK_PARTIAL(); 4074 MRRETURN(MATCH_NOMATCH); 4075 } 4076 switch(*eptr++) 4077 { 4078 default: break; 4079 case 0x09: /* HT */ 4080 case 0x20: /* SPACE */ 4081 case 0xa0: /* NBSP */ 4082 MRRETURN(MATCH_NOMATCH); 4083 } 4084 } 4085 break; 4086 4087 case OP_HSPACE: 4088 for (i = 1; i <= min; i++) 4089 { 4090 if (eptr >= md->end_subject) 4091 { 4092 SCHECK_PARTIAL(); 4093 MRRETURN(MATCH_NOMATCH); 4094 } 4095 switch(*eptr++) 4096 { 4097 default: MRRETURN(MATCH_NOMATCH); 4098 case 0x09: /* HT */ 4099 case 0x20: /* SPACE */ 4100 case 0xa0: /* NBSP */ 4101 break; 4102 } 4103 } 4104 break; 4105 4106 case OP_NOT_VSPACE: 4107 for (i = 1; i <= min; i++) 4108 { 4109 if (eptr >= md->end_subject) 4110 { 4111 SCHECK_PARTIAL(); 4112 MRRETURN(MATCH_NOMATCH); 4113 } 4114 switch(*eptr++) 
4115 { 4116 default: break; 4117 case 0x0a: /* LF */ 4118 case 0x0b: /* VT */ 4119 case 0x0c: /* FF */ 4120 case 0x0d: /* CR */ 4121 case 0x85: /* NEL */ 4122 MRRETURN(MATCH_NOMATCH); 4123 } 4124 } 4125 break; 4126 4127 case OP_VSPACE: 4128 for (i = 1; i <= min; i++) 4129 { 4130 if (eptr >= md->end_subject) 4131 { 4132 SCHECK_PARTIAL(); 4133 MRRETURN(MATCH_NOMATCH); 4134 } 4135 switch(*eptr++) 4136 { 4137 default: MRRETURN(MATCH_NOMATCH); 4138 case 0x0a: /* LF */ 4139 case 0x0b: /* VT */ 4140 case 0x0c: /* FF */ 4141 case 0x0d: /* CR */ 4142 case 0x85: /* NEL */ 4143 break; 4144 } 4145 } 4146 break; 4147 4148 case OP_NOT_DIGIT: 4149 for (i = 1; i <= min; i++) 4150 { 4151 if (eptr >= md->end_subject) 4152 { 4153 SCHECK_PARTIAL(); 4154 MRRETURN(MATCH_NOMATCH); 4155 } 4156 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH); 4157 } 4158 break; 4159 4160 case OP_DIGIT: 4161 for (i = 1; i <= min; i++) 4162 { 4163 if (eptr >= md->end_subject) 4164 { 4165 SCHECK_PARTIAL(); 4166 MRRETURN(MATCH_NOMATCH); 4167 } 4168 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH); 4169 } 4170 break; 4171 4172 case OP_NOT_WHITESPACE: 4173 for (i = 1; i <= min; i++) 4174 { 4175 if (eptr >= md->end_subject) 4176 { 4177 SCHECK_PARTIAL(); 4178 MRRETURN(MATCH_NOMATCH); 4179 } 4180 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); 4181 } 4182 break; 4183 4184 case OP_WHITESPACE: 4185 for (i = 1; i <= min; i++) 4186 { 4187 if (eptr >= md->end_subject) 4188 { 4189 SCHECK_PARTIAL(); 4190 MRRETURN(MATCH_NOMATCH); 4191 } 4192 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH); 4193 } 4194 break; 4195 4196 case OP_NOT_WORDCHAR: 4197 for (i = 1; i <= min; i++) 4198 { 4199 if (eptr >= md->end_subject) 4200 { 4201 SCHECK_PARTIAL(); 4202 MRRETURN(MATCH_NOMATCH); 4203 } 4204 if ((md->ctypes[*eptr++] & ctype_word) != 0) 4205 MRRETURN(MATCH_NOMATCH); 4206 } 4207 break; 4208 4209 case OP_WORDCHAR: 4210 for (i = 1; i <= min; i++) 
4211 { 4212 if (eptr >= md->end_subject) 4213 { 4214 SCHECK_PARTIAL(); 4215 MRRETURN(MATCH_NOMATCH); 4216 } 4217 if ((md->ctypes[*eptr++] & ctype_word) == 0) 4218 MRRETURN(MATCH_NOMATCH); 4219 } 4220 break; 4221 4222 default: 4223 RRETURN(PCRE_ERROR_INTERNAL); 4224 } 4225 } 4226 4227 /* If min = max, continue at the same level without recursing */ 4228 4229 if (min == max) continue; 4230 4231 /* If minimizing, we have to test the rest of the pattern before each 4232 subsequent match. Again, separate the UTF-8 case for speed, and also 4233 separate the UCP cases. */ 4234 4235 if (minimize) 4236 { 4237 #ifdef SUPPORT_UCP 4238 if (prop_type >= 0) 4239 { 4240 switch(prop_type) 4241 { 4242 case PT_ANY: 4243 for (fi = min;; fi++) 4244 { 4245 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); 4246 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4247 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4248 if (eptr >= md->end_subject) 4249 { 4250 SCHECK_PARTIAL(); 4251 MRRETURN(MATCH_NOMATCH); 4252 } 4253 GETCHARINCTEST(c, eptr); 4254 if (prop_fail_result) MRRETURN(MATCH_NOMATCH); 4255 } 4256 /* Control never gets here */ 4257 4258 case PT_LAMP: 4259 for (fi = min;; fi++) 4260 { 4261 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); 4262 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4263 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4264 if (eptr >= md->end_subject) 4265 { 4266 SCHECK_PARTIAL(); 4267 MRRETURN(MATCH_NOMATCH); 4268 } 4269 GETCHARINCTEST(c, eptr); 4270 prop_chartype = UCD_CHARTYPE(c); 4271 if ((prop_chartype == ucp_Lu || 4272 prop_chartype == ucp_Ll || 4273 prop_chartype == ucp_Lt) == prop_fail_result) 4274 MRRETURN(MATCH_NOMATCH); 4275 } 4276 /* Control never gets here */ 4277 4278 case PT_GC: 4279 for (fi = min;; fi++) 4280 { 4281 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); 4282 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4283 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4284 if (eptr >= md->end_subject) 4285 { 4286 SCHECK_PARTIAL(); 4287 MRRETURN(MATCH_NOMATCH); 4288 
} 4289 GETCHARINCTEST(c, eptr); 4290 prop_category = UCD_CATEGORY(c); 4291 if ((prop_category == prop_value) == prop_fail_result) 4292 MRRETURN(MATCH_NOMATCH); 4293 } 4294 /* Control never gets here */ 4295 4296 case PT_PC: 4297 for (fi = min;; fi++) 4298 { 4299 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); 4300 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4301 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4302 if (eptr >= md->end_subject) 4303 { 4304 SCHECK_PARTIAL(); 4305 MRRETURN(MATCH_NOMATCH); 4306 } 4307 GETCHARINCTEST(c, eptr); 4308 prop_chartype = UCD_CHARTYPE(c); 4309 if ((prop_chartype == prop_value) == prop_fail_result) 4310 MRRETURN(MATCH_NOMATCH); 4311 } 4312 /* Control never gets here */ 4313 4314 case PT_SC: 4315 for (fi = min;; fi++) 4316 { 4317 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); 4318 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4319 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4320 if (eptr >= md->end_subject) 4321 { 4322 SCHECK_PARTIAL(); 4323 MRRETURN(MATCH_NOMATCH); 4324 } 4325 GETCHARINCTEST(c, eptr); 4326 prop_script = UCD_SCRIPT(c); 4327 if ((prop_script == prop_value) == prop_fail_result) 4328 MRRETURN(MATCH_NOMATCH); 4329 } 4330 /* Control never gets here */ 4331 4332 case PT_ALNUM: 4333 for (fi = min;; fi++) 4334 { 4335 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59); 4336 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4337 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4338 if (eptr >= md->end_subject) 4339 { 4340 SCHECK_PARTIAL(); 4341 MRRETURN(MATCH_NOMATCH); 4342 } 4343 GETCHARINCTEST(c, eptr); 4344 prop_category = UCD_CATEGORY(c); 4345 if ((prop_category == ucp_L || prop_category == ucp_N) 4346 == prop_fail_result) 4347 MRRETURN(MATCH_NOMATCH); 4348 } 4349 /* Control never gets here */ 4350 4351 case PT_SPACE: /* Perl space */ 4352 for (fi = min;; fi++) 4353 { 4354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60); 4355 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4356 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4357 if (eptr 
>= md->end_subject) 4358 { 4359 SCHECK_PARTIAL(); 4360 MRRETURN(MATCH_NOMATCH); 4361 } 4362 GETCHARINCTEST(c, eptr); 4363 prop_category = UCD_CATEGORY(c); 4364 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4365 c == CHAR_FF || c == CHAR_CR) 4366 == prop_fail_result) 4367 MRRETURN(MATCH_NOMATCH); 4368 } 4369 /* Control never gets here */ 4370 4371 case PT_PXSPACE: /* POSIX space */ 4372 for (fi = min;; fi++) 4373 { 4374 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61); 4375 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4376 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4377 if (eptr >= md->end_subject) 4378 { 4379 SCHECK_PARTIAL(); 4380 MRRETURN(MATCH_NOMATCH); 4381 } 4382 GETCHARINCTEST(c, eptr); 4383 prop_category = UCD_CATEGORY(c); 4384 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4385 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) 4386 == prop_fail_result) 4387 MRRETURN(MATCH_NOMATCH); 4388 } 4389 /* Control never gets here */ 4390 4391 case PT_WORD: 4392 for (fi = min;; fi++) 4393 { 4394 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62); 4395 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4396 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4397 if (eptr >= md->end_subject) 4398 { 4399 SCHECK_PARTIAL(); 4400 MRRETURN(MATCH_NOMATCH); 4401 } 4402 GETCHARINCTEST(c, eptr); 4403 prop_category = UCD_CATEGORY(c); 4404 if ((prop_category == ucp_L || 4405 prop_category == ucp_N || 4406 c == CHAR_UNDERSCORE) 4407 == prop_fail_result) 4408 MRRETURN(MATCH_NOMATCH); 4409 } 4410 /* Control never gets here */ 4411 4412 /* This should never occur */ 4413 4414 default: 4415 RRETURN(PCRE_ERROR_INTERNAL); 4416 } 4417 } 4418 4419 /* Match extended Unicode sequences. We will get here only if the 4420 support is in the binary; otherwise a compile-time error occurs. 
*/ 4421 4422 else if (ctype == OP_EXTUNI) 4423 { 4424 for (fi = min;; fi++) 4425 { 4426 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); 4427 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4428 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4429 if (eptr >= md->end_subject) 4430 { 4431 SCHECK_PARTIAL(); 4432 MRRETURN(MATCH_NOMATCH); 4433 } 4434 GETCHARINCTEST(c, eptr); 4435 prop_category = UCD_CATEGORY(c); 4436 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH); 4437 while (eptr < md->end_subject) 4438 { 4439 int len = 1; 4440 if (!utf8) c = *eptr; 4441 else { GETCHARLEN(c, eptr, len); } 4442 prop_category = UCD_CATEGORY(c); 4443 if (prop_category != ucp_M) break; 4444 eptr += len; 4445 } 4446 } 4447 } 4448 4449 else 4450 #endif /* SUPPORT_UCP */ 4451 4452 #ifdef SUPPORT_UTF8 4453 /* UTF-8 mode */ 4454 if (utf8) 4455 { 4456 for (fi = min;; fi++) 4457 { 4458 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); 4459 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4460 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4461 if (eptr >= md->end_subject) 4462 { 4463 SCHECK_PARTIAL(); 4464 MRRETURN(MATCH_NOMATCH); 4465 } 4466 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 4467 MRRETURN(MATCH_NOMATCH); 4468 GETCHARINC(c, eptr); 4469 switch(ctype) 4470 { 4471 case OP_ANY: /* This is the non-NL case */ 4472 case OP_ALLANY: 4473 case OP_ANYBYTE: 4474 break; 4475 4476 case OP_ANYNL: 4477 switch(c) 4478 { 4479 default: MRRETURN(MATCH_NOMATCH); 4480 case 0x000d: 4481 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 4482 break; 4483 case 0x000a: 4484 break; 4485 4486 case 0x000b: 4487 case 0x000c: 4488 case 0x0085: 4489 case 0x2028: 4490 case 0x2029: 4491 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); 4492 break; 4493 } 4494 break; 4495 4496 case OP_NOT_HSPACE: 4497 switch(c) 4498 { 4499 default: break; 4500 case 0x09: /* HT */ 4501 case 0x20: /* SPACE */ 4502 case 0xa0: /* NBSP */ 4503 case 0x1680: /* OGHAM SPACE MARK */ 4504 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4505 case 0x2000: /* EN QUAD 
*/ 4506 case 0x2001: /* EM QUAD */ 4507 case 0x2002: /* EN SPACE */ 4508 case 0x2003: /* EM SPACE */ 4509 case 0x2004: /* THREE-PER-EM SPACE */ 4510 case 0x2005: /* FOUR-PER-EM SPACE */ 4511 case 0x2006: /* SIX-PER-EM SPACE */ 4512 case 0x2007: /* FIGURE SPACE */ 4513 case 0x2008: /* PUNCTUATION SPACE */ 4514 case 0x2009: /* THIN SPACE */ 4515 case 0x200A: /* HAIR SPACE */ 4516 case 0x202f: /* NARROW NO-BREAK SPACE */ 4517 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4518 case 0x3000: /* IDEOGRAPHIC SPACE */ 4519 MRRETURN(MATCH_NOMATCH); 4520 } 4521 break; 4522 4523 case OP_HSPACE: 4524 switch(c) 4525 { 4526 default: MRRETURN(MATCH_NOMATCH); 4527 case 0x09: /* HT */ 4528 case 0x20: /* SPACE */ 4529 case 0xa0: /* NBSP */ 4530 case 0x1680: /* OGHAM SPACE MARK */ 4531 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4532 case 0x2000: /* EN QUAD */ 4533 case 0x2001: /* EM QUAD */ 4534 case 0x2002: /* EN SPACE */ 4535 case 0x2003: /* EM SPACE */ 4536 case 0x2004: /* THREE-PER-EM SPACE */ 4537 case 0x2005: /* FOUR-PER-EM SPACE */ 4538 case 0x2006: /* SIX-PER-EM SPACE */ 4539 case 0x2007: /* FIGURE SPACE */ 4540 case 0x2008: /* PUNCTUATION SPACE */ 4541 case 0x2009: /* THIN SPACE */ 4542 case 0x200A: /* HAIR SPACE */ 4543 case 0x202f: /* NARROW NO-BREAK SPACE */ 4544 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4545 case 0x3000: /* IDEOGRAPHIC SPACE */ 4546 break; 4547 } 4548 break; 4549 4550 case OP_NOT_VSPACE: 4551 switch(c) 4552 { 4553 default: break; 4554 case 0x0a: /* LF */ 4555 case 0x0b: /* VT */ 4556 case 0x0c: /* FF */ 4557 case 0x0d: /* CR */ 4558 case 0x85: /* NEL */ 4559 case 0x2028: /* LINE SEPARATOR */ 4560 case 0x2029: /* PARAGRAPH SEPARATOR */ 4561 MRRETURN(MATCH_NOMATCH); 4562 } 4563 break; 4564 4565 case OP_VSPACE: 4566 switch(c) 4567 { 4568 default: MRRETURN(MATCH_NOMATCH); 4569 case 0x0a: /* LF */ 4570 case 0x0b: /* VT */ 4571 case 0x0c: /* FF */ 4572 case 0x0d: /* CR */ 4573 case 0x85: /* NEL */ 4574 case 0x2028: /* LINE SEPARATOR */ 4575 case 0x2029: /* 
PARAGRAPH SEPARATOR */ 4576 break; 4577 } 4578 break; 4579 4580 case OP_NOT_DIGIT: 4581 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) 4582 MRRETURN(MATCH_NOMATCH); 4583 break; 4584 4585 case OP_DIGIT: 4586 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) 4587 MRRETURN(MATCH_NOMATCH); 4588 break; 4589 4590 case OP_NOT_WHITESPACE: 4591 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 4592 MRRETURN(MATCH_NOMATCH); 4593 break; 4594 4595 case OP_WHITESPACE: 4596 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) 4597 MRRETURN(MATCH_NOMATCH); 4598 break; 4599 4600 case OP_NOT_WORDCHAR: 4601 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) 4602 MRRETURN(MATCH_NOMATCH); 4603 break; 4604 4605 case OP_WORDCHAR: 4606 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) 4607 MRRETURN(MATCH_NOMATCH); 4608 break; 4609 4610 default: 4611 RRETURN(PCRE_ERROR_INTERNAL); 4612 } 4613 } 4614 } 4615 else 4616 #endif 4617 /* Not UTF-8 mode */ 4618 { 4619 for (fi = min;; fi++) 4620 { 4621 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); 4622 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4623 if (fi >= max) MRRETURN(MATCH_NOMATCH); 4624 if (eptr >= md->end_subject) 4625 { 4626 SCHECK_PARTIAL(); 4627 MRRETURN(MATCH_NOMATCH); 4628 } 4629 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 4630 MRRETURN(MATCH_NOMATCH); 4631 c = *eptr++; 4632 switch(ctype) 4633 { 4634 case OP_ANY: /* This is the non-NL case */ 4635 case OP_ALLANY: 4636 case OP_ANYBYTE: 4637 break; 4638 4639 case OP_ANYNL: 4640 switch(c) 4641 { 4642 default: MRRETURN(MATCH_NOMATCH); 4643 case 0x000d: 4644 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 4645 break; 4646 4647 case 0x000a: 4648 break; 4649 4650 case 0x000b: 4651 case 0x000c: 4652 case 0x0085: 4653 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH); 4654 break; 4655 } 4656 break; 4657 4658 case OP_NOT_HSPACE: 4659 switch(c) 4660 { 4661 default: break; 4662 case 0x09: /* HT */ 4663 case 0x20: /* SPACE */ 4664 case 0xa0: /* NBSP */ 4665 MRRETURN(MATCH_NOMATCH); 4666 
} 4667 break; 4668 4669 case OP_HSPACE: 4670 switch(c) 4671 { 4672 default: MRRETURN(MATCH_NOMATCH); 4673 case 0x09: /* HT */ 4674 case 0x20: /* SPACE */ 4675 case 0xa0: /* NBSP */ 4676 break; 4677 } 4678 break; 4679 4680 case OP_NOT_VSPACE: 4681 switch(c) 4682 { 4683 default: break; 4684 case 0x0a: /* LF */ 4685 case 0x0b: /* VT */ 4686 case 0x0c: /* FF */ 4687 case 0x0d: /* CR */ 4688 case 0x85: /* NEL */ 4689 MRRETURN(MATCH_NOMATCH); 4690 } 4691 break; 4692 4693 case OP_VSPACE: 4694 switch(c) 4695 { 4696 default: MRRETURN(MATCH_NOMATCH); 4697 case 0x0a: /* LF */ 4698 case 0x0b: /* VT */ 4699 case 0x0c: /* FF */ 4700 case 0x0d: /* CR */ 4701 case 0x85: /* NEL */ 4702 break; 4703 } 4704 break; 4705 4706 case OP_NOT_DIGIT: 4707 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH); 4708 break; 4709 4710 case OP_DIGIT: 4711 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH); 4712 break; 4713 4714 case OP_NOT_WHITESPACE: 4715 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH); 4716 break; 4717 4718 case OP_WHITESPACE: 4719 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH); 4720 break; 4721 4722 case OP_NOT_WORDCHAR: 4723 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH); 4724 break; 4725 4726 case OP_WORDCHAR: 4727 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH); 4728 break; 4729 4730 default: 4731 RRETURN(PCRE_ERROR_INTERNAL); 4732 } 4733 } 4734 } 4735 /* Control never gets here */ 4736 } 4737 4738 /* If maximizing, it is worth using inline code for speed, doing the type 4739 test once at the start (i.e. keep it out of the loop). Again, keep the 4740 UTF-8 and UCP stuff separate. 
*/ 4741 4742 else 4743 { 4744 pp = eptr; /* Remember where we started */ 4745 4746 #ifdef SUPPORT_UCP 4747 if (prop_type >= 0) 4748 { 4749 switch(prop_type) 4750 { 4751 case PT_ANY: 4752 for (i = min; i < max; i++) 4753 { 4754 int len = 1; 4755 if (eptr >= md->end_subject) 4756 { 4757 SCHECK_PARTIAL(); 4758 break; 4759 } 4760 GETCHARLENTEST(c, eptr, len); 4761 if (prop_fail_result) break; 4762 eptr+= len; 4763 } 4764 break; 4765 4766 case PT_LAMP: 4767 for (i = min; i < max; i++) 4768 { 4769 int len = 1; 4770 if (eptr >= md->end_subject) 4771 { 4772 SCHECK_PARTIAL(); 4773 break; 4774 } 4775 GETCHARLENTEST(c, eptr, len); 4776 prop_chartype = UCD_CHARTYPE(c); 4777 if ((prop_chartype == ucp_Lu || 4778 prop_chartype == ucp_Ll || 4779 prop_chartype == ucp_Lt) == prop_fail_result) 4780 break; 4781 eptr+= len; 4782 } 4783 break; 4784 4785 case PT_GC: 4786 for (i = min; i < max; i++) 4787 { 4788 int len = 1; 4789 if (eptr >= md->end_subject) 4790 { 4791 SCHECK_PARTIAL(); 4792 break; 4793 } 4794 GETCHARLENTEST(c, eptr, len); 4795 prop_category = UCD_CATEGORY(c); 4796 if ((prop_category == prop_value) == prop_fail_result) 4797 break; 4798 eptr+= len; 4799 } 4800 break; 4801 4802 case PT_PC: 4803 for (i = min; i < max; i++) 4804 { 4805 int len = 1; 4806 if (eptr >= md->end_subject) 4807 { 4808 SCHECK_PARTIAL(); 4809 break; 4810 } 4811 GETCHARLENTEST(c, eptr, len); 4812 prop_chartype = UCD_CHARTYPE(c); 4813 if ((prop_chartype == prop_value) == prop_fail_result) 4814 break; 4815 eptr+= len; 4816 } 4817 break; 4818 4819 case PT_SC: 4820 for (i = min; i < max; i++) 4821 { 4822 int len = 1; 4823 if (eptr >= md->end_subject) 4824 { 4825 SCHECK_PARTIAL(); 4826 break; 4827 } 4828 GETCHARLENTEST(c, eptr, len); 4829 prop_script = UCD_SCRIPT(c); 4830 if ((prop_script == prop_value) == prop_fail_result) 4831 break; 4832 eptr+= len; 4833 } 4834 break; 4835 4836 case PT_ALNUM: 4837 for (i = min; i < max; i++) 4838 { 4839 int len = 1; 4840 if (eptr >= md->end_subject) 4841 { 4842 
SCHECK_PARTIAL(); 4843 break; 4844 } 4845 GETCHARLENTEST(c, eptr, len); 4846 prop_category = UCD_CATEGORY(c); 4847 if ((prop_category == ucp_L || prop_category == ucp_N) 4848 == prop_fail_result) 4849 break; 4850 eptr+= len; 4851 } 4852 break; 4853 4854 case PT_SPACE: /* Perl space */ 4855 for (i = min; i < max; i++) 4856 { 4857 int len = 1; 4858 if (eptr >= md->end_subject) 4859 { 4860 SCHECK_PARTIAL(); 4861 break; 4862 } 4863 GETCHARLENTEST(c, eptr, len); 4864 prop_category = UCD_CATEGORY(c); 4865 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4866 c == CHAR_FF || c == CHAR_CR) 4867 == prop_fail_result) 4868 break; 4869 eptr+= len; 4870 } 4871 break; 4872 4873 case PT_PXSPACE: /* POSIX space */ 4874 for (i = min; i < max; i++) 4875 { 4876 int len = 1; 4877 if (eptr >= md->end_subject) 4878 { 4879 SCHECK_PARTIAL(); 4880 break; 4881 } 4882 GETCHARLENTEST(c, eptr, len); 4883 prop_category = UCD_CATEGORY(c); 4884 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4885 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) 4886 == prop_fail_result) 4887 break; 4888 eptr+= len; 4889 } 4890 break; 4891 4892 case PT_WORD: 4893 for (i = min; i < max; i++) 4894 { 4895 int len = 1; 4896 if (eptr >= md->end_subject) 4897 { 4898 SCHECK_PARTIAL(); 4899 break; 4900 } 4901 GETCHARLENTEST(c, eptr, len); 4902 prop_category = UCD_CATEGORY(c); 4903 if ((prop_category == ucp_L || prop_category == ucp_N || 4904 c == CHAR_UNDERSCORE) == prop_fail_result) 4905 break; 4906 eptr+= len; 4907 } 4908 break; 4909 4910 default: 4911 RRETURN(PCRE_ERROR_INTERNAL); 4912 } 4913 4914 /* eptr is now past the end of the maximum run */ 4915 4916 if (possessive) continue; 4917 for(;;) 4918 { 4919 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44); 4920 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4921 if (eptr-- == pp) break; /* Stop if tried at original pos */ 4922 if (utf8) BACKCHAR(eptr); 4923 } 4924 } 4925 4926 /* Match extended Unicode sequences. 
We will get here only if the 4927 support is in the binary; otherwise a compile-time error occurs. */ 4928 4929 else if (ctype == OP_EXTUNI) 4930 { 4931 for (i = min; i < max; i++) 4932 { 4933 if (eptr >= md->end_subject) 4934 { 4935 SCHECK_PARTIAL(); 4936 break; 4937 } 4938 GETCHARINCTEST(c, eptr); 4939 prop_category = UCD_CATEGORY(c); 4940 if (prop_category == ucp_M) break; 4941 while (eptr < md->end_subject) 4942 { 4943 int len = 1; 4944 if (!utf8) c = *eptr; else 4945 { 4946 GETCHARLEN(c, eptr, len); 4947 } 4948 prop_category = UCD_CATEGORY(c); 4949 if (prop_category != ucp_M) break; 4950 eptr += len; 4951 } 4952 } 4953 4954 /* eptr is now past the end of the maximum run */ 4955 4956 if (possessive) continue; 4957 4958 for(;;) 4959 { 4960 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45); 4961 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4962 if (eptr-- == pp) break; /* Stop if tried at original pos */ 4963 for (;;) /* Move back over one extended */ 4964 { 4965 int len = 1; 4966 if (!utf8) c = *eptr; else 4967 { 4968 BACKCHAR(eptr); 4969 GETCHARLEN(c, eptr, len); 4970 } 4971 prop_category = UCD_CATEGORY(c); 4972 if (prop_category != ucp_M) break; 4973 eptr--; 4974 } 4975 } 4976 } 4977 4978 else 4979 #endif /* SUPPORT_UCP */ 4980 4981 #ifdef SUPPORT_UTF8 4982 /* UTF-8 mode */ 4983 4984 if (utf8) 4985 { 4986 switch(ctype) 4987 { 4988 case OP_ANY: 4989 if (max < INT_MAX) 4990 { 4991 for (i = min; i < max; i++) 4992 { 4993 if (eptr >= md->end_subject) 4994 { 4995 SCHECK_PARTIAL(); 4996 break; 4997 } 4998 if (IS_NEWLINE(eptr)) break; 4999 eptr++; 5000 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 5001 } 5002 } 5003 5004 /* Handle unlimited UTF-8 repeat */ 5005 5006 else 5007 { 5008 for (i = min; i < max; i++) 5009 { 5010 if (eptr >= md->end_subject) 5011 { 5012 SCHECK_PARTIAL(); 5013 break; 5014 } 5015 if (IS_NEWLINE(eptr)) break; 5016 eptr++; 5017 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 5018 } 5019 } 5020 break; 5021 
5022 case OP_ALLANY: 5023 if (max < INT_MAX) 5024 { 5025 for (i = min; i < max; i++) 5026 { 5027 if (eptr >= md->end_subject) 5028 { 5029 SCHECK_PARTIAL(); 5030 break; 5031 } 5032 eptr++; 5033 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 5034 } 5035 } 5036 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */ 5037 break; 5038 5039 /* The byte case is the same as non-UTF8 */ 5040 5041 case OP_ANYBYTE: 5042 c = max - min; 5043 if (c > (unsigned int)(md->end_subject - eptr)) 5044 { 5045 eptr = md->end_subject; 5046 SCHECK_PARTIAL(); 5047 } 5048 else eptr += c; 5049 break; 5050 5051 case OP_ANYNL: 5052 for (i = min; i < max; i++) 5053 { 5054 int len = 1; 5055 if (eptr >= md->end_subject) 5056 { 5057 SCHECK_PARTIAL(); 5058 break; 5059 } 5060 GETCHARLEN(c, eptr, len); 5061 if (c == 0x000d) 5062 { 5063 if (++eptr >= md->end_subject) break; 5064 if (*eptr == 0x000a) eptr++; 5065 } 5066 else 5067 { 5068 if (c != 0x000a && 5069 (md->bsr_anycrlf || 5070 (c != 0x000b && c != 0x000c && 5071 c != 0x0085 && c != 0x2028 && c != 0x2029))) 5072 break; 5073 eptr += len; 5074 } 5075 } 5076 break; 5077 5078 case OP_NOT_HSPACE: 5079 case OP_HSPACE: 5080 for (i = min; i < max; i++) 5081 { 5082 BOOL gotspace; 5083 int len = 1; 5084 if (eptr >= md->end_subject) 5085 { 5086 SCHECK_PARTIAL(); 5087 break; 5088 } 5089 GETCHARLEN(c, eptr, len); 5090 switch(c) 5091 { 5092 default: gotspace = FALSE; break; 5093 case 0x09: /* HT */ 5094 case 0x20: /* SPACE */ 5095 case 0xa0: /* NBSP */ 5096 case 0x1680: /* OGHAM SPACE MARK */ 5097 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 5098 case 0x2000: /* EN QUAD */ 5099 case 0x2001: /* EM QUAD */ 5100 case 0x2002: /* EN SPACE */ 5101 case 0x2003: /* EM SPACE */ 5102 case 0x2004: /* THREE-PER-EM SPACE */ 5103 case 0x2005: /* FOUR-PER-EM SPACE */ 5104 case 0x2006: /* SIX-PER-EM SPACE */ 5105 case 0x2007: /* FIGURE SPACE */ 5106 case 0x2008: /* PUNCTUATION SPACE */ 5107 case 0x2009: /* THIN SPACE */ 5108 case 0x200A: /* HAIR SPACE */ 
5109 case 0x202f: /* NARROW NO-BREAK SPACE */ 5110 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 5111 case 0x3000: /* IDEOGRAPHIC SPACE */ 5112 gotspace = TRUE; 5113 break; 5114 } 5115 if (gotspace == (ctype == OP_NOT_HSPACE)) break; 5116 eptr += len; 5117 } 5118 break; 5119 5120 case OP_NOT_VSPACE: 5121 case OP_VSPACE: 5122 for (i = min; i < max; i++) 5123 { 5124 BOOL gotspace; 5125 int len = 1; 5126 if (eptr >= md->end_subject) 5127 { 5128 SCHECK_PARTIAL(); 5129 break; 5130 } 5131 GETCHARLEN(c, eptr, len); 5132 switch(c) 5133 { 5134 default: gotspace = FALSE; break; 5135 case 0x0a: /* LF */ 5136 case 0x0b: /* VT */ 5137 case 0x0c: /* FF */ 5138 case 0x0d: /* CR */ 5139 case 0x85: /* NEL */ 5140 case 0x2028: /* LINE SEPARATOR */ 5141 case 0x2029: /* PARAGRAPH SEPARATOR */ 5142 gotspace = TRUE; 5143 break; 5144 } 5145 if (gotspace == (ctype == OP_NOT_VSPACE)) break; 5146 eptr += len; 5147 } 5148 break; 5149 5150 case OP_NOT_DIGIT: 5151 for (i = min; i < max; i++) 5152 { 5153 int len = 1; 5154 if (eptr >= md->end_subject) 5155 { 5156 SCHECK_PARTIAL(); 5157 break; 5158 } 5159 GETCHARLEN(c, eptr, len); 5160 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; 5161 eptr+= len; 5162 } 5163 break; 5164 5165 case OP_DIGIT: 5166 for (i = min; i < max; i++) 5167 { 5168 int len = 1; 5169 if (eptr >= md->end_subject) 5170 { 5171 SCHECK_PARTIAL(); 5172 break; 5173 } 5174 GETCHARLEN(c, eptr, len); 5175 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; 5176 eptr+= len; 5177 } 5178 break; 5179 5180 case OP_NOT_WHITESPACE: 5181 for (i = min; i < max; i++) 5182 { 5183 int len = 1; 5184 if (eptr >= md->end_subject) 5185 { 5186 SCHECK_PARTIAL(); 5187 break; 5188 } 5189 GETCHARLEN(c, eptr, len); 5190 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; 5191 eptr+= len; 5192 } 5193 break; 5194 5195 case OP_WHITESPACE: 5196 for (i = min; i < max; i++) 5197 { 5198 int len = 1; 5199 if (eptr >= md->end_subject) 5200 { 5201 SCHECK_PARTIAL(); 5202 break; 5203 } 5204 
GETCHARLEN(c, eptr, len); 5205 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; 5206 eptr+= len; 5207 } 5208 break; 5209 5210 case OP_NOT_WORDCHAR: 5211 for (i = min; i < max; i++) 5212 { 5213 int len = 1; 5214 if (eptr >= md->end_subject) 5215 { 5216 SCHECK_PARTIAL(); 5217 break; 5218 } 5219 GETCHARLEN(c, eptr, len); 5220 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; 5221 eptr+= len; 5222 } 5223 break; 5224 5225 case OP_WORDCHAR: 5226 for (i = min; i < max; i++) 5227 { 5228 int len = 1; 5229 if (eptr >= md->end_subject) 5230 { 5231 SCHECK_PARTIAL(); 5232 break; 5233 } 5234 GETCHARLEN(c, eptr, len); 5235 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; 5236 eptr+= len; 5237 } 5238 break; 5239 5240 default: 5241 RRETURN(PCRE_ERROR_INTERNAL); 5242 } 5243 5244 /* eptr is now past the end of the maximum run */ 5245 5246 if (possessive) continue; 5247 for(;;) 5248 { 5249 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46); 5250 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5251 if (eptr-- == pp) break; /* Stop if tried at original pos */ 5252 BACKCHAR(eptr); 5253 } 5254 } 5255 else 5256 #endif /* SUPPORT_UTF8 */ 5257 5258 /* Not UTF-8 mode */ 5259 { 5260 switch(ctype) 5261 { 5262 case OP_ANY: 5263 for (i = min; i < max; i++) 5264 { 5265 if (eptr >= md->end_subject) 5266 { 5267 SCHECK_PARTIAL(); 5268 break; 5269 } 5270 if (IS_NEWLINE(eptr)) break; 5271 eptr++; 5272 } 5273 break; 5274 5275 case OP_ALLANY: 5276 case OP_ANYBYTE: 5277 c = max - min; 5278 if (c > (unsigned int)(md->end_subject - eptr)) 5279 { 5280 eptr = md->end_subject; 5281 SCHECK_PARTIAL(); 5282 } 5283 else eptr += c; 5284 break; 5285 5286 case OP_ANYNL: 5287 for (i = min; i < max; i++) 5288 { 5289 if (eptr >= md->end_subject) 5290 { 5291 SCHECK_PARTIAL(); 5292 break; 5293 } 5294 c = *eptr; 5295 if (c == 0x000d) 5296 { 5297 if (++eptr >= md->end_subject) break; 5298 if (*eptr == 0x000a) eptr++; 5299 } 5300 else 5301 { 5302 if (c != 0x000a && 5303 (md->bsr_anycrlf || 5304 (c != 
0x000b && c != 0x000c && c != 0x0085))) 5305 break; 5306 eptr++; 5307 } 5308 } 5309 break; 5310 5311 case OP_NOT_HSPACE: 5312 for (i = min; i < max; i++) 5313 { 5314 if (eptr >= md->end_subject) 5315 { 5316 SCHECK_PARTIAL(); 5317 break; 5318 } 5319 c = *eptr; 5320 if (c == 0x09 || c == 0x20 || c == 0xa0) break; 5321 eptr++; 5322 } 5323 break; 5324 5325 case OP_HSPACE: 5326 for (i = min; i < max; i++) 5327 { 5328 if (eptr >= md->end_subject) 5329 { 5330 SCHECK_PARTIAL(); 5331 break; 5332 } 5333 c = *eptr; 5334 if (c != 0x09 && c != 0x20 && c != 0xa0) break; 5335 eptr++; 5336 } 5337 break; 5338 5339 case OP_NOT_VSPACE: 5340 for (i = min; i < max; i++) 5341 { 5342 if (eptr >= md->end_subject) 5343 { 5344 SCHECK_PARTIAL(); 5345 break; 5346 } 5347 c = *eptr; 5348 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85) 5349 break; 5350 eptr++; 5351 } 5352 break; 5353 5354 case OP_VSPACE: 5355 for (i = min; i < max; i++) 5356 { 5357 if (eptr >= md->end_subject) 5358 { 5359 SCHECK_PARTIAL(); 5360 break; 5361 } 5362 c = *eptr; 5363 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) 5364 break; 5365 eptr++; 5366 } 5367 break; 5368 5369 case OP_NOT_DIGIT: 5370 for (i = min; i < max; i++) 5371 { 5372 if (eptr >= md->end_subject) 5373 { 5374 SCHECK_PARTIAL(); 5375 break; 5376 } 5377 if ((md->ctypes[*eptr] & ctype_digit) != 0) break; 5378 eptr++; 5379 } 5380 break; 5381 5382 case OP_DIGIT: 5383 for (i = min; i < max; i++) 5384 { 5385 if (eptr >= md->end_subject) 5386 { 5387 SCHECK_PARTIAL(); 5388 break; 5389 } 5390 if ((md->ctypes[*eptr] & ctype_digit) == 0) break; 5391 eptr++; 5392 } 5393 break; 5394 5395 case OP_NOT_WHITESPACE: 5396 for (i = min; i < max; i++) 5397 { 5398 if (eptr >= md->end_subject) 5399 { 5400 SCHECK_PARTIAL(); 5401 break; 5402 } 5403 if ((md->ctypes[*eptr] & ctype_space) != 0) break; 5404 eptr++; 5405 } 5406 break; 5407 5408 case OP_WHITESPACE: 5409 for (i = min; i < max; i++) 5410 { 5411 if (eptr >= md->end_subject) 5412 { 5413 
SCHECK_PARTIAL(); 5414 break; 5415 } 5416 if ((md->ctypes[*eptr] & ctype_space) == 0) break; 5417 eptr++; 5418 } 5419 break; 5420 5421 case OP_NOT_WORDCHAR: 5422 for (i = min; i < max; i++) 5423 { 5424 if (eptr >= md->end_subject) 5425 { 5426 SCHECK_PARTIAL(); 5427 break; 5428 } 5429 if ((md->ctypes[*eptr] & ctype_word) != 0) break; 5430 eptr++; 5431 } 5432 break; 5433 5434 case OP_WORDCHAR: 5435 for (i = min; i < max; i++) 5436 { 5437 if (eptr >= md->end_subject) 5438 { 5439 SCHECK_PARTIAL(); 5440 break; 5441 } 5442 if ((md->ctypes[*eptr] & ctype_word) == 0) break; 5443 eptr++; 5444 } 5445 break; 5446 5447 default: 5448 RRETURN(PCRE_ERROR_INTERNAL); 5449 } 5450 5451 /* eptr is now past the end of the maximum run */ 5452 5453 if (possessive) continue; 5454 while (eptr >= pp) 5455 { 5456 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47); 5457 eptr--; 5458 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5459 } 5460 } 5461 5462 /* Get here if we can't make it match with any permitted repetitions */ 5463 5464 MRRETURN(MATCH_NOMATCH); 5465 } 5466 /* Control never gets here */ 5467 5468 /* There's been some horrible disaster. Arrival here can only mean there is 5469 something seriously wrong in the code above or the OP_xxx definitions. */ 5470 5471 default: 5472 DPRINTF(("Unknown opcode %d\n", *ecode)); 5473 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); 5474 } 5475 5476 /* Do not stick any code in here without much thought; it is assumed 5477 that "continue" in the code above comes out to here to repeat the main 5478 loop. */ 5479 5480 } /* End of main loop */ 5481 /* Control never reaches here */ 5482 5483 5484 /* When compiling to use the heap rather than the stack for recursive calls to 5485 match(), the RRETURN() macro jumps here. The number that is saved in 5486 frame->Xwhere indicates which label we actually want to return to. 
*/ 5487 5488 #ifdef NO_RECURSE 5489 #define LBL(val) case val: goto L_RM##val; 5490 HEAP_RETURN: 5491 switch (frame->Xwhere) 5492 { 5493 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 5494 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) 5495 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) 5496 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) 5497 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) 5498 #ifdef SUPPORT_UTF8 5499 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) 5500 LBL(32) LBL(34) LBL(42) LBL(46) 5501 #ifdef SUPPORT_UCP 5502 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) 5503 LBL(59) LBL(60) LBL(61) LBL(62) 5504 #endif /* SUPPORT_UCP */ 5505 #endif /* SUPPORT_UTF8 */ 5506 default: 5507 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); 5508 return PCRE_ERROR_INTERNAL; 5509 } 5510 #undef LBL 5511 #endif /* NO_RECURSE */ 5512 } 5513 5514 5515 /*************************************************************************** 5516 **************************************************************************** 5517 RECURSION IN THE match() FUNCTION 5518 5519 Undefine all the macros that were defined above to handle this. 
*/ 5520 5521 /* Only the NO_RECURSE build (heap rather than stack for match() recursion) needs this section: each name listed is #undef'd so that code after this point sees it as a plain identifier again. pcre_exec() below uses ims, length and flags as ordinary variable/parameter names. NOTE(review): the matching #define lines are outside this view; confirm the list is complete. */ #ifdef NO_RECURSE 5522 #undef eptr 5523 #undef ecode 5524 #undef mstart 5525 #undef offset_top 5526 #undef ims 5527 #undef eptrb 5528 #undef flags 5529 5530 #undef callpat 5531 #undef charptr 5532 #undef data 5533 #undef next 5534 #undef pp 5535 #undef prev 5536 #undef saved_eptr 5537 5538 #undef new_recursive 5539 5540 #undef cur_is_word 5541 #undef condition 5542 #undef prev_is_word 5543 5544 #undef original_ims 5545 5546 #undef ctype 5547 #undef length 5548 #undef max 5549 #undef min 5550 #undef number 5551 #undef offset 5552 #undef op 5553 #undef save_capture_last 5554 #undef save_offset1 5555 #undef save_offset2 5556 #undef save_offset3 5557 #undef stacksave 5558 5559 #undef newptrb 5560 5561 #endif /* NO_RECURSE */ 5562 5563 /* fc and fi are undefined unconditionally, outside the NO_RECURSE conditional: they are defined as macros in both cases (presumably with and without NO_RECURSE; their definitions are not visible here). */ 5564 5565 #undef fc 5566 #undef fi 5567 5568 /*************************************************************************** 5569 ***************************************************************************/ 5570 5571 5572 5573 /************************************************* 5574 * Execute a Regular Expression * 5575 *************************************************/ 5576 5577 /* This function applies a compiled re to a subject string and picks out 5578 portions of the string if it matches. Two elements in the vector are set for 5579 each substring: the offsets to the start and end of the substring.
5580 5581 Arguments: 5582 argument_re points to the compiled expression 5583 extra_data points to extra data or is NULL 5584 subject points to the subject string 5585 length length of subject string (may contain binary zeros) 5586 start_offset where to start in the subject string 5587 options option bits 5588 offsets points to a vector of ints to be filled in with offsets 5589 offsetcount the number of elements in the vector 5590 5591 Returns: > 0 => success; value is the number of elements filled in 5592 = 0 => success, but offsets is not big enough 5593 -1 => failed to match 5594 < -1 => some kind of unexpected problem 5595 */ 5596 5597 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 5598 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, 5599 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, 5600 int offsetcount) 5601 { 5602 int rc, resetcount, ocount; 5603 int first_byte = -1; 5604 int req_byte = -1; 5605 int req_byte2 = -1; 5606 int newline; 5607 unsigned long int ims; 5608 BOOL using_temporary_offsets = FALSE; 5609 BOOL anchored; 5610 BOOL startline; 5611 BOOL firstline; 5612 BOOL first_byte_caseless = FALSE; 5613 BOOL req_byte_caseless = FALSE; 5614 BOOL utf8; 5615 match_data match_block; 5616 match_data *md = &match_block; 5617 const uschar *tables; 5618 const uschar *start_bits = NULL; 5619 USPTR start_match = (USPTR)subject + start_offset; 5620 USPTR end_subject; 5621 USPTR start_partial = NULL; 5622 USPTR req_byte_ptr = start_match - 1; 5623 5624 pcre_study_data internal_study; 5625 const pcre_study_data *study; 5626 5627 real_pcre internal_re; 5628 const real_pcre *external_re = (const real_pcre *)argument_re; 5629 const real_pcre *re = external_re; 5630 5631 /* Plausibility checks */ 5632 5633 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 5634 if (re == NULL || subject == NULL || 5635 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 5636 if (offsetcount < 0) return 
PCRE_ERROR_BADCOUNT; 5637 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; 5638 5639 /* This information is for finding all the numbers associated with a given 5640 name, for condition testing. */ 5641 5642 md->name_table = (uschar *)re + re->name_table_offset; 5643 md->name_count = re->name_count; 5644 md->name_entry_size = re->name_entry_size; 5645 5646 /* Fish out the optional data from the extra_data structure, first setting 5647 the default values. */ 5648 5649 study = NULL; 5650 md->match_limit = MATCH_LIMIT; 5651 md->match_limit_recursion = MATCH_LIMIT_RECURSION; 5652 md->callout_data = NULL; 5653 5654 /* The table pointer is always in native byte order. */ 5655 5656 tables = external_re->tables; 5657 5658 if (extra_data != NULL) 5659 { 5660 register unsigned int flags = extra_data->flags; 5661 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 5662 study = (const pcre_study_data *)extra_data->study_data; 5663 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 5664 md->match_limit = extra_data->match_limit; 5665 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 5666 md->match_limit_recursion = extra_data->match_limit_recursion; 5667 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 5668 md->callout_data = extra_data->callout_data; 5669 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; 5670 } 5671 5672 /* If the exec call supplied NULL for tables, use the inbuilt ones. This 5673 is a feature that makes it possible to save compiled regex and re-use them 5674 in other programs later. */ 5675 5676 if (tables == NULL) tables = _pcre_default_tables; 5677 5678 /* Check that the first field in the block is the magic number. If it is not, 5679 test for a regex that was compiled on a host of opposite endianness. If this is 5680 the case, flipped values are put in internal_re and internal_study if there was 5681 study data too. 
*/ 5682 5683 if (re->magic_number != MAGIC_NUMBER) 5684 { 5685 re = _pcre_try_flipped(re, &internal_re, study, &internal_study); 5686 if (re == NULL) return PCRE_ERROR_BADMAGIC; 5687 if (study != NULL) study = &internal_study; 5688 } 5689 5690 /* Set up other data */ 5691 5692 anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 5693 startline = (re->flags & PCRE_STARTLINE) != 0; 5694 firstline = (re->options & PCRE_FIRSTLINE) != 0; 5695 5696 /* The code starts after the real_pcre block and the capture name table. */ 5697 5698 md->start_code = (const uschar *)external_re + re->name_table_offset + 5699 re->name_count * re->name_entry_size; 5700 5701 md->start_subject = (USPTR)subject; 5702 md->start_offset = start_offset; 5703 md->end_subject = md->start_subject + length; 5704 end_subject = md->end_subject; 5705 5706 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 5707 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; 5708 md->use_ucp = (re->options & PCRE_UCP) != 0; 5709 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; 5710 5711 md->notbol = (options & PCRE_NOTBOL) != 0; 5712 md->noteol = (options & PCRE_NOTEOL) != 0; 5713 md->notempty = (options & PCRE_NOTEMPTY) != 0; 5714 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; 5715 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : 5716 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; 5717 md->hitend = FALSE; 5718 md->mark = NULL; /* In case never set */ 5719 5720 md->recursive = NULL; /* No recursion at top level */ 5721 5722 md->lcc = tables + lcc_offset; 5723 md->ctypes = tables + ctypes_offset; 5724 5725 /* Handle different \R options. 
*/ 5726 5727 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 5728 { 5729 case 0: 5730 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 5731 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; 5732 else 5733 #ifdef BSR_ANYCRLF 5734 md->bsr_anycrlf = TRUE; 5735 #else 5736 md->bsr_anycrlf = FALSE; 5737 #endif 5738 break; 5739 5740 case PCRE_BSR_ANYCRLF: 5741 md->bsr_anycrlf = TRUE; 5742 break; 5743 5744 case PCRE_BSR_UNICODE: 5745 md->bsr_anycrlf = FALSE; 5746 break; 5747 5748 default: return PCRE_ERROR_BADNEWLINE; 5749 } 5750 5751 /* Handle different types of newline. The three bits give eight cases. If 5752 nothing is set at run time, whatever was used at compile time applies. */ 5753 5754 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : 5755 (pcre_uint32)options) & PCRE_NEWLINE_BITS) 5756 { 5757 case 0: newline = NEWLINE; break; /* Compile-time default */ 5758 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 5759 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 5760 case PCRE_NEWLINE_CR+ 5761 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 5762 case PCRE_NEWLINE_ANY: newline = -1; break; 5763 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 5764 default: return PCRE_ERROR_BADNEWLINE; 5765 } 5766 5767 if (newline == -2) 5768 { 5769 md->nltype = NLTYPE_ANYCRLF; 5770 } 5771 else if (newline < 0) 5772 { 5773 md->nltype = NLTYPE_ANY; 5774 } 5775 else 5776 { 5777 md->nltype = NLTYPE_FIXED; 5778 if (newline > 255) 5779 { 5780 md->nllen = 2; 5781 md->nl[0] = (newline >> 8) & 255; 5782 md->nl[1] = newline & 255; 5783 } 5784 else 5785 { 5786 md->nllen = 1; 5787 md->nl[0] = newline; 5788 } 5789 } 5790 5791 /* Partial matching was originally supported only for a restricted set of 5792 regexes; from release 8.00 there are no restrictions, but the bits are still 5793 defined (though never set). So there's no harm in leaving this code. 
*/ 5794 5795 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) 5796 return PCRE_ERROR_BADPARTIAL; 5797 5798 /* Check a UTF-8 string if required. Unfortunately there's no way of passing 5799 back the character offset. */ 5800 5801 #ifdef SUPPORT_UTF8 5802 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) 5803 { 5804 int tb; 5805 if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0) 5806 return (tb == length && md->partial > 1)? 5807 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; 5808 if (start_offset > 0 && start_offset < length) 5809 { 5810 tb = ((USPTR)subject)[start_offset] & 0xc0; 5811 if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET; 5812 } 5813 } 5814 #endif 5815 5816 /* The ims options can vary during the matching as a result of the presence 5817 of (?ims) items in the pattern. They are kept in a local variable so that 5818 restoring at the exit of a group is easy. */ 5819 5820 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); 5821 5822 /* If the expression has got more back references than the offsets supplied can 5823 hold, we get a temporary chunk of working store to use during the matching. 5824 Otherwise, we can use the vector supplied, rounding down its size to a multiple 5825 of 3. */ 5826 5827 ocount = offsetcount - (offsetcount % 3); 5828 5829 if (re->top_backref > 0 && re->top_backref >= ocount/3) 5830 { 5831 ocount = re->top_backref * 3 + 3; 5832 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); 5833 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 5834 using_temporary_offsets = TRUE; 5835 DPRINTF(("Got memory to hold back references\n")); 5836 } 5837 else md->offset_vector = offsets; 5838 5839 md->offset_end = ocount; 5840 md->offset_max = (2*ocount)/3; 5841 md->offset_overflow = FALSE; 5842 md->capture_last = -1; 5843 5844 /* Compute the minimum number of offsets that we need to reset each time. Doing 5845 this makes a huge difference to execution time when there aren't many brackets 5846 in the pattern. 
*/ 5847 5848 resetcount = 2 + re->top_bracket * 2; 5849 if (resetcount > offsetcount) resetcount = ocount; 5850 5851 /* Reset the working variable associated with each extraction. These should 5852 never be used unless previously set, but they get saved and restored, and so we 5853 initialize them to avoid reading uninitialized locations. */ 5854 5855 if (md->offset_vector != NULL) 5856 { 5857 register int *iptr = md->offset_vector + ocount; 5858 register int *iend = iptr - resetcount/2 + 1; 5859 while (--iptr >= iend) *iptr = -1; 5860 } 5861 5862 /* Set up the first character to match, if available. The first_byte value is 5863 never set for an anchored regular expression, but the anchoring may be forced 5864 at run time, so we have to test for anchoring. The first char may be unset for 5865 an unanchored pattern, of course. If there's no first char and the pattern was 5866 studied, there may be a bitmap of possible first characters. */ 5867 5868 if (!anchored) 5869 { 5870 if ((re->flags & PCRE_FIRSTSET) != 0) 5871 { 5872 first_byte = re->first_byte & 255; 5873 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) 5874 first_byte = md->lcc[first_byte]; 5875 } 5876 else 5877 if (!startline && study != NULL && 5878 (study->flags & PCRE_STUDY_MAPPED) != 0) 5879 start_bits = study->start_bits; 5880 } 5881 5882 /* For anchored or unanchored matches, there may be a "last known required 5883 character" set. */ 5884 5885 if ((re->flags & PCRE_REQCHSET) != 0) 5886 { 5887 req_byte = re->req_byte & 255; 5888 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 5889 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ 5890 } 5891 5892 5893 /* ==========================================================================*/ 5894 5895 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 5896 the loop runs just once. 
*/ 5897 5898 for(;;) 5899 { 5900 USPTR save_end_subject = end_subject; 5901 USPTR new_start_match; 5902 5903 /* Reset the maximum number of extractions we might see. */ 5904 5905 if (md->offset_vector != NULL) 5906 { 5907 register int *iptr = md->offset_vector; 5908 register int *iend = iptr + resetcount; 5909 while (iptr < iend) *iptr++ = -1; 5910 } 5911 5912 /* If firstline is TRUE, the start of the match is constrained to the first 5913 line of a multiline string. That is, the match must be before or at the first 5914 newline. Implement this by temporarily adjusting end_subject so that we stop 5915 scanning at a newline. If the match fails at the newline, later code breaks 5916 this loop. */ 5917 5918 if (firstline) 5919 { 5920 USPTR t = start_match; 5921 #ifdef SUPPORT_UTF8 5922 if (utf8) 5923 { 5924 while (t < md->end_subject && !IS_NEWLINE(t)) 5925 { 5926 t++; 5927 while (t < end_subject && (*t & 0xc0) == 0x80) t++; 5928 } 5929 } 5930 else 5931 #endif 5932 while (t < md->end_subject && !IS_NEWLINE(t)) t++; 5933 end_subject = t; 5934 } 5935 5936 /* There are some optimizations that avoid running the match if a known 5937 starting point is not found, or if a known later character is not present. 5938 However, there is an option that disables these, for testing and for ensuring 5939 that all callouts do actually occur. The option can be set in the regex by 5940 (*NO_START_OPT) or passed in match-time options. */ 5941 5942 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) 5943 { 5944 /* Advance to a unique first byte if there is one. 
*/ 5945 5946 if (first_byte >= 0) 5947 { 5948 if (first_byte_caseless) 5949 while (start_match < end_subject && md->lcc[*start_match] != first_byte) 5950 start_match++; 5951 else 5952 while (start_match < end_subject && *start_match != first_byte) 5953 start_match++; 5954 } 5955 5956 /* Or to just after a linebreak for a multiline match */ 5957 5958 else if (startline) 5959 { 5960 if (start_match > md->start_subject + start_offset) 5961 { 5962 #ifdef SUPPORT_UTF8 5963 if (utf8) 5964 { 5965 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 5966 { 5967 start_match++; 5968 while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 5969 start_match++; 5970 } 5971 } 5972 else 5973 #endif 5974 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 5975 start_match++; 5976 5977 /* If we have just passed a CR and the newline option is ANY or ANYCRLF, 5978 and we are now at a LF, advance the match position by one more character. 5979 */ 5980 5981 if (start_match[-1] == CHAR_CR && 5982 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 5983 start_match < end_subject && 5984 *start_match == CHAR_NL) 5985 start_match++; 5986 } 5987 } 5988 5989 /* Or to a non-unique first byte after study */ 5990 5991 else if (start_bits != NULL) 5992 { 5993 while (start_match < end_subject) 5994 { 5995 register unsigned int c = *start_match; 5996 if ((start_bits[c/8] & (1 << (c&7))) == 0) 5997 { 5998 start_match++; 5999 #ifdef SUPPORT_UTF8 6000 if (utf8) 6001 while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 6002 start_match++; 6003 #endif 6004 } 6005 else break; 6006 } 6007 } 6008 } /* Starting optimizations */ 6009 6010 /* Restore fudged end_subject */ 6011 6012 end_subject = save_end_subject; 6013 6014 /* The following two optimizations are disabled for partial matching or if 6015 disabling is explicitly requested. 
*/ 6016 6017 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) 6018 { 6019 /* If the pattern was studied, a minimum subject length may be set. This is 6020 a lower bound; no actual string of that length may actually match the 6021 pattern. Although the value is, strictly, in characters, we treat it as 6022 bytes to avoid spending too much time in this optimization. */ 6023 6024 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 6025 (pcre_uint32)(end_subject - start_match) < study->minlength) 6026 { 6027 rc = MATCH_NOMATCH; 6028 break; 6029 } 6030 6031 /* If req_byte is set, we know that that character must appear in the 6032 subject for the match to succeed. If the first character is set, req_byte 6033 must be later in the subject; otherwise the test starts at the match point. 6034 This optimization can save a huge amount of backtracking in patterns with 6035 nested unlimited repeats that aren't going to match. Writing separate code 6036 for cased/caseless versions makes it go faster, as does using an 6037 autoincrement and backing off on a match. 6038 6039 HOWEVER: when the subject string is very, very long, searching to its end 6040 can take a long time, and give bad performance on quite ordinary patterns. 6041 This showed up when somebody was matching something like /^\d+C/ on a 6042 32-megabyte string... so we don't do this when the string is sufficiently 6043 long. */ 6044 6045 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) 6046 { 6047 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); 6048 6049 /* We don't need to repeat the search if we haven't yet reached the 6050 place we found it at last time. 
*/ 6051 6052 if (p > req_byte_ptr) 6053 { 6054 if (req_byte_caseless) 6055 { 6056 while (p < end_subject) 6057 { 6058 register int pp = *p++; 6059 if (pp == req_byte || pp == req_byte2) { p--; break; } 6060 } 6061 } 6062 else 6063 { 6064 while (p < end_subject) 6065 { 6066 if (*p++ == req_byte) { p--; break; } 6067 } 6068 } 6069 6070 /* If we can't find the required character, break the matching loop, 6071 forcing a match failure. */ 6072 6073 if (p >= end_subject) 6074 { 6075 rc = MATCH_NOMATCH; 6076 break; 6077 } 6078 6079 /* If we have found the required character, save the point where we 6080 found it, so that we don't search again next time round the loop if 6081 the start hasn't passed this character yet. */ 6082 6083 req_byte_ptr = p; 6084 } 6085 } 6086 } 6087 6088 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */ 6089 printf(">>>> Match against: "); 6090 pchars(start_match, end_subject - start_match, TRUE, md); 6091 printf("\n"); 6092 #endif 6093 6094 /* OK, we can now run the match. If "hitend" is set afterwards, remember the 6095 first starting point for which a partial match was found. */ 6096 6097 md->start_match_ptr = start_match; 6098 md->start_used_ptr = start_match; 6099 md->match_call_count = 0; 6100 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL, 6101 0, 0); 6102 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr; 6103 6104 switch(rc) 6105 { 6106 /* SKIP passes back the next starting point explicitly, but if it is the 6107 same as the match we have just done, treat it as NOMATCH. */ 6108 6109 case MATCH_SKIP: 6110 if (md->start_match_ptr != start_match) 6111 { 6112 new_start_match = md->start_match_ptr; 6113 break; 6114 } 6115 /* Fall through */ 6116 6117 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched 6118 the SKIP's arg was not found. We also treat this as NOMATCH. 
*/ 6119 6120 case MATCH_SKIP_ARG: 6121 /* Fall through */ 6122 6123 /* NOMATCH and PRUNE advance by one character. THEN at this level acts 6124 exactly like PRUNE. */ 6125 6126 case MATCH_NOMATCH: 6127 case MATCH_PRUNE: 6128 case MATCH_THEN: 6129 new_start_match = start_match + 1; 6130 #ifdef SUPPORT_UTF8 6131 if (utf8) 6132 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) 6133 new_start_match++; 6134 #endif 6135 break; 6136 6137 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ 6138 6139 case MATCH_COMMIT: 6140 rc = MATCH_NOMATCH; 6141 goto ENDLOOP; 6142 6143 /* Any other return is either a match, or some kind of error. */ 6144 6145 default: 6146 goto ENDLOOP; 6147 } 6148 6149 /* Control reaches here for the various types of "no match at this point" 6150 result. Reset the code to MATCH_NOMATCH for subsequent checking. */ 6151 6152 rc = MATCH_NOMATCH; 6153 6154 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first 6155 newline in the subject (though it may continue over the newline). Therefore, 6156 if we have just failed to match, starting at a newline, do not continue. */ 6157 6158 if (firstline && IS_NEWLINE(start_match)) break; 6159 6160 /* Advance to new matching position */ 6161 6162 start_match = new_start_match; 6163 6164 /* Break the loop if the pattern is anchored or if we have passed the end of 6165 the subject. */ 6166 6167 if (anchored || start_match > end_subject) break; 6168 6169 /* If we have just passed a CR and we are now at a LF, and the pattern does 6170 not contain any explicit matches for \r or \n, and the newline option is CRLF 6171 or ANY or ANYCRLF, advance the match position by one more character. 
*/ 6172 6173 if (start_match[-1] == CHAR_CR && 6174 start_match < end_subject && 6175 *start_match == CHAR_NL && 6176 (re->flags & PCRE_HASCRORLF) == 0 && 6177 (md->nltype == NLTYPE_ANY || 6178 md->nltype == NLTYPE_ANYCRLF || 6179 md->nllen == 2)) 6180 start_match++; 6181 6182 md->mark = NULL; /* Reset for start of next match attempt */ 6183 } /* End of for(;;) "bumpalong" loop */ 6184 6185 /* ==========================================================================*/ 6186 6187 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping 6188 conditions is true: 6189 6190 (1) The pattern is anchored or the match was failed by (*COMMIT); 6191 6192 (2) We are past the end of the subject; 6193 6194 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because 6195 this option requests that a match occur at or before the first newline in 6196 the subject. 6197 6198 When we have a match and the offset vector is big enough to deal with any 6199 backreferences, captured substring offsets will already be set up. In the case 6200 where we had to get some local store to hold offsets for backreference 6201 processing, copy those that we can. In this case there need not be overflow if 6202 certain parts of the pattern were not used, even though there are more 6203 capturing parentheses than vector slots. */ 6204 6205 ENDLOOP: 6206 6207 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) 6208 { 6209 if (using_temporary_offsets) 6210 { 6211 if (offsetcount >= 4) 6212 { 6213 memcpy(offsets + 2, md->offset_vector + 2, 6214 (offsetcount - 2) * sizeof(int)); 6215 DPRINTF(("Copied offsets from temporary memory\n")); 6216 } 6217 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; 6218 DPRINTF(("Freeing temporary memory\n")); 6219 (pcre_free)(md->offset_vector); 6220 } 6221 6222 /* Set the return code to the number of captured strings, or 0 if there are 6223 too many to fit into the vector. */ 6224 6225 rc = md->offset_overflow? 
0 : md->end_offset_top/2; 6226 6227 /* If there is space, set up the whole thing as substring 0. The value of 6228 md->start_match_ptr might be modified if \K was encountered on the success 6229 matching path. */ 6230 6231 if (offsetcount < 2) rc = 0; else 6232 { 6233 offsets[0] = (int)(md->start_match_ptr - md->start_subject); 6234 offsets[1] = (int)(md->end_match_ptr - md->start_subject); 6235 } 6236 6237 DPRINTF((">>>> returning %d\n", rc)); 6238 goto RETURN_MARK; 6239 } 6240 6241 /* Control gets here if there has been an error, or if the overall match 6242 attempt has failed at all permitted starting positions. */ 6243 6244 if (using_temporary_offsets) 6245 { 6246 DPRINTF(("Freeing temporary memory\n")); 6247 (pcre_free)(md->offset_vector); 6248 } 6249 6250 /* For anything other than nomatch or partial match, just return the code. */ 6251 6252 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) 6253 { 6254 DPRINTF((">>>> error: returning %d\n", rc)); 6255 return rc; 6256 } 6257 6258 /* Handle partial matches - disable any mark data */ 6259 6260 if (start_partial != NULL) 6261 { 6262 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); 6263 md->mark = NULL; 6264 if (offsetcount > 1) 6265 { 6266 offsets[0] = (int)(start_partial - (USPTR)subject); 6267 offsets[1] = (int)(end_subject - (USPTR)subject); 6268 } 6269 rc = PCRE_ERROR_PARTIAL; 6270 } 6271 6272 /* This is the classic nomatch case */ 6273 6274 else 6275 { 6276 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 6277 rc = PCRE_ERROR_NOMATCH; 6278 } 6279 6280 /* Return the MARK data if it has been requested. */ 6281 6282 RETURN_MARK: 6283 6284 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 6285 *(extra_data->mark) = (unsigned char *)(md->mark); 6286 return rc; 6287 } 6288 6289 /* End of pcre_exec.c */ 6290