1 /* This is JavaScriptCore's variant of the PCRE library. While this library 2 started out as a copy of PCRE, many of the features of PCRE have been 3 removed. This library now supports only the regular expression features 4 required by the JavaScript language specification, and has only the functions 5 needed by JavaScriptCore and the rest of WebKit. 6 7 Originally written by Philip Hazel 8 Copyright (c) 1997-2006 University of Cambridge 9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 10 Copyright (C) 2007 Eric Seidel <eric (at) webkit.org> 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 /* This module contains jsRegExpExecute(), the externally visible function 42 that does pattern matching using an NFA algorithm, following the rules from 43 the JavaScript specification. There are also some supporting functions. */ 44 45 #include "config.h" 46 #include "pcre_internal.h" 47 48 #include <limits.h> 49 #include <wtf/ASCIICType.h> 50 #include <wtf/Vector.h> 51 52 #if REGEXP_HISTOGRAM 53 #include <wtf/DateMath.h> 54 #include <runtime/UString.h> 55 #endif 56 57 using namespace WTF; 58 59 #if COMPILER(GCC) 60 #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 61 //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 62 #endif 63 64 /* Avoid warnings on Windows. */ 65 #undef min 66 #undef max 67 68 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 69 typedef int ReturnLocation; 70 #else 71 typedef void* ReturnLocation; 72 #endif 73 74 #if !REGEXP_HISTOGRAM 75 76 class HistogramTimeLogger { 77 public: 78 HistogramTimeLogger(const JSRegExp*) { } 79 }; 80 81 #else 82 83 using namespace JSC; 84 85 class Histogram { 86 public: 87 ~Histogram(); 88 void add(const JSRegExp*, double); 89 90 private: 91 typedef HashMap<RefPtr<UString::Rep>, double> Map; 92 Map times; 93 }; 94 95 class HistogramTimeLogger { 96 public: 97 HistogramTimeLogger(const JSRegExp*); 98 ~HistogramTimeLogger(); 99 100 private: 101 const JSRegExp* m_re; 102 double m_startTime; 103 }; 104 105 #endif 106 107 /* Structure for building a chain of data for holding the values of 108 the subject pointer at the start of each bracket, used to detect when 109 an empty string has been matched by a bracket to break infinite loops. */ 110 struct BracketChainNode { 111 BracketChainNode* previousBracket; 112 const UChar* bracketStart; 113 }; 114 115 struct MatchFrame : FastAllocBase { 116 ReturnLocation returnLocation; 117 struct MatchFrame* previousFrame; 118 119 /* Function arguments that may change */ 120 struct { 121 const UChar* subjectPtr; 122 const unsigned char* instructionPtr; 123 int offsetTop; 124 BracketChainNode* bracketChain; 125 } args; 126 127 128 /* PCRE uses "fake" recursion built off of gotos, thus 129 stack-based local variables are not safe to use. Instead we have to 130 store local variables on the current MatchFrame. */ 131 struct { 132 const unsigned char* data; 133 const unsigned char* startOfRepeatingBracket; 134 const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare 135 const unsigned char* instructionPtrAtStartOfOnce; 136 137 int repeatOthercase; 138 139 int ctype; 140 int fc; 141 int fi; 142 int length; 143 int max; 144 int number; 145 int offset; 146 int saveOffset1; 147 int saveOffset2; 148 int saveOffset3; 149 150 BracketChainNode bracketChainNode; 151 } locals; 152 }; 153 154 /* Structure for passing "static" information around between the functions 155 doing traditional NFA matching, so that they are thread-safe. */ 156 157 struct MatchData { 158 int* offsetVector; /* Offset vector */ 159 int offsetEnd; /* One past the end */ 160 int offsetMax; /* The maximum usable for return data */ 161 bool offsetOverflow; /* Set if too many extractions */ 162 const UChar* startSubject; /* Start of the subject string */ 163 const UChar* endSubject; /* End of the subject string */ 164 const UChar* endMatchPtr; /* Subject position at end match */ 165 int endOffsetTop; /* Highwater mark at end of match */ 166 bool multiline; 167 bool ignoreCase; 168 }; 169 170 /* The maximum remaining length of subject we are prepared to search for a 171 reqByte match. */ 172 173 #define REQ_BYTE_MAX 1000 174 175 /* The below limit restricts the number of "recursive" match calls in order to 176 avoid spending exponential time on complex regular expressions. */ 177 178 static const unsigned matchLimit = 1000000; 179 180 #ifdef DEBUG 181 /************************************************* 182 * Debugging function to print chars * 183 *************************************************/ 184 185 /* Print a sequence of chars in printable format, stopping at the end of the 186 subject if the requested. 187 188 Arguments: 189 p points to characters 190 length number to print 191 isSubject true if printing from within md.startSubject 192 md pointer to matching data block, if isSubject is true 193 */ 194 195 static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md) 196 { 197 if (isSubject && length > md.endSubject - p) 198 length = md.endSubject - p; 199 while (length-- > 0) { 200 int c; 201 if (isprint(c = *(p++))) 202 printf("%c", c); 203 else if (c < 256) 204 printf("\\x%02x", c); 205 else 206 printf("\\x{%x}", c); 207 } 208 } 209 #endif 210 211 /************************************************* 212 * Match a back-reference * 213 *************************************************/ 214 215 /* If a back reference hasn't been set, the length that is passed is greater 216 than the number of characters left in the string, so the match fails. 217 218 Arguments: 219 offset index into the offset vector 220 subjectPtr points into the subject 221 length length to be matched 222 md points to match data block 223 224 Returns: true if matched 225 */ 226 227 static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md) 228 { 229 const UChar* p = md.startSubject + md.offsetVector[offset]; 230 231 #ifdef DEBUG 232 if (subjectPtr >= md.endSubject) 233 printf("matching subject <null>"); 234 else { 235 printf("matching subject "); 236 pchars(subjectPtr, length, true, md); 237 } 238 printf(" against backref "); 239 pchars(p, length, false, md); 240 printf("\n"); 241 #endif 242 243 /* Always fail if not enough characters left */ 244 245 if (length > md.endSubject - subjectPtr) 246 return false; 247 248 /* Separate the caselesss case for speed */ 249 250 if (md.ignoreCase) { 251 while (length-- > 0) { 252 UChar c = *p++; 253 int othercase = jsc_pcre_ucp_othercase(c); 254 UChar d = *subjectPtr++; 255 if (c != d && othercase != d) 256 return false; 257 } 258 } 259 else { 260 while (length-- > 0) 261 if (*p++ != *subjectPtr++) 262 return false; 263 } 264 265 return true; 266 } 267 268 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 269 270 /* Use numbered labels and switch statement at the bottom of the match function. */ 271 272 #define RMATCH_WHERE(num) num 273 #define RRETURN_LABEL RRETURN_SWITCH 274 275 #else 276 277 /* Use GCC's computed goto extension. */ 278 279 /* For one test case this is more than 40% faster than the switch statement. 280 We could avoid the use of the num argument entirely by using local labels, 281 but using it for the GCC case as well as the non-GCC case allows us to share 282 a bit more code and notice if we use conflicting numbers.*/ 283 284 #define RMATCH_WHERE(num) &&RRETURN_##num 285 #define RRETURN_LABEL *stack.currentFrame->returnLocation 286 287 #endif 288 289 #define RECURSIVE_MATCH_COMMON(num) \ 290 goto RECURSE;\ 291 RRETURN_##num: \ 292 stack.popCurrentFrame(); 293 294 #define RECURSIVE_MATCH(num, ra, rb) \ 295 do { \ 296 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ 297 RECURSIVE_MATCH_COMMON(num) \ 298 } while (0) 299 300 #define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \ 301 do { \ 302 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ 303 startNewGroup(stack.currentFrame); \ 304 RECURSIVE_MATCH_COMMON(num) \ 305 } while (0) 306 307 #define RRETURN goto RRETURN_LABEL 308 309 #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0) 310 311 /************************************************* 312 * Match from current position * 313 *************************************************/ 314 315 /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character 316 in the subject string, while substringStart holds the value of subjectPtr at the start of the 317 last bracketed group - used for breaking infinite loops matching zero-length 318 strings. This function is called recursively in many circumstances. Whenever it 319 returns a negative (error) response, the outer match() call must also return the 320 same response. 321 322 Arguments: 323 subjectPtr pointer in subject 324 instructionPtr position in code 325 offsetTop current top pointer 326 md pointer to "static" info for the match 327 328 Returns: 1 if matched ) these values are >= 0 329 0 if failed to match ) 330 a negative error value if aborted by an error condition 331 (e.g. stopped by repeated call or recursion limit) 332 */ 333 334 static const unsigned numFramesOnStack = 16; 335 336 struct MatchStack { 337 MatchStack() 338 : framesEnd(frames + numFramesOnStack) 339 , currentFrame(frames) 340 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame 341 { 342 ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack); 343 } 344 345 MatchFrame frames[numFramesOnStack]; 346 MatchFrame* framesEnd; 347 MatchFrame* currentFrame; 348 unsigned size; 349 350 inline bool canUseStackBufferForNextFrame() 351 { 352 return size < numFramesOnStack; 353 } 354 355 inline MatchFrame* allocateNextFrame() 356 { 357 if (canUseStackBufferForNextFrame()) 358 return currentFrame + 1; 359 return new MatchFrame; 360 } 361 362 inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation) 363 { 364 MatchFrame* newframe = allocateNextFrame(); 365 newframe->previousFrame = currentFrame; 366 367 newframe->args.subjectPtr = currentFrame->args.subjectPtr; 368 newframe->args.offsetTop = currentFrame->args.offsetTop; 369 newframe->args.instructionPtr = instructionPtr; 370 newframe->args.bracketChain = bracketChain; 371 newframe->returnLocation = returnLocation; 372 size++; 373 374 currentFrame = newframe; 375 } 376 377 inline void popCurrentFrame() 378 { 379 MatchFrame* oldFrame = currentFrame; 380 currentFrame = currentFrame->previousFrame; 381 if (size > numFramesOnStack) 382 delete oldFrame; 383 size--; 384 } 385 386 void popAllFrames() 387 { 388 while (size) 389 popCurrentFrame(); 390 } 391 }; 392 393 static int matchError(int errorCode, MatchStack& stack) 394 { 395 stack.popAllFrames(); 396 return errorCode; 397 } 398 399 /* Get the next UTF-8 character, not advancing the pointer, incrementing length 400 if there are extra bytes. This is called when we know we are in UTF-8 mode. */ 401 402 static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len) 403 { 404 c = *subjectPtr; 405 if ((c & 0xc0) == 0xc0) { 406 int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ 407 int gcss = 6 * gcaa; 408 c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss; 409 for (int gcii = 1; gcii <= gcaa; gcii++) { 410 gcss -= 6; 411 c |= (subjectPtr[gcii] & 0x3f) << gcss; 412 } 413 len += gcaa; 414 } 415 } 416 417 static inline void startNewGroup(MatchFrame* currentFrame) 418 { 419 /* At the start of a bracketed group, add the current subject pointer to the 420 stack of such pointers, to be re-instated at the end of the group when we hit 421 the closing ket. When match() is called in other circumstances, we don't add to 422 this stack. */ 423 424 currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain; 425 currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr; 426 currentFrame->args.bracketChain = ¤tFrame->locals.bracketChainNode; 427 } 428 429 // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing 430 static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats) 431 { 432 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR 433 static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 }; 434 static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 }; 435 436 ASSERT(instructionOffset >= 0); 437 ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR)); 438 439 minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2 440 minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset]; 441 maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset]; 442 } 443 444 static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md) 445 { 446 bool isMatch = false; 447 int min; 448 bool minimize = false; /* Initialization not really needed, but some compilers think so. */ 449 unsigned remainingMatchCount = matchLimit; 450 int othercase; /* Declare here to avoid errors during jumps */ 451 452 MatchStack stack; 453 454 /* The opcode jump table. */ 455 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 456 #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode, 457 static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) }; 458 #undef EMIT_JUMP_TABLE_ENTRY 459 #endif 460 461 /* One-time setup of the opcode jump table. */ 462 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 463 for (int i = 255; !opcodeJumpTable[i]; i--) 464 opcodeJumpTable[i] = &&CAPTURING_BRACKET; 465 #endif 466 467 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 468 // Shark shows this as a hot line 469 // Using a static const here makes this line disappear, but makes later access hotter (not sure why) 470 stack.currentFrame->returnLocation = &&RETURN; 471 #else 472 stack.currentFrame->returnLocation = 0; 473 #endif 474 stack.currentFrame->args.subjectPtr = subjectPtr; 475 stack.currentFrame->args.instructionPtr = instructionPtr; 476 stack.currentFrame->args.offsetTop = offsetTop; 477 stack.currentFrame->args.bracketChain = 0; 478 startNewGroup(stack.currentFrame); 479 480 /* This is where control jumps back to to effect "recursion" */ 481 482 RECURSE: 483 if (!--remainingMatchCount) 484 return matchError(JSRegExpErrorHitLimit, stack); 485 486 /* Now start processing the operations. */ 487 488 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 489 while (true) 490 #endif 491 { 492 493 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 494 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode 495 #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr] 496 #else 497 #define BEGIN_OPCODE(opcode) case OP_##opcode 498 #define NEXT_OPCODE continue 499 #endif 500 501 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 502 NEXT_OPCODE; 503 #else 504 switch (*stack.currentFrame->args.instructionPtr) 505 #endif 506 { 507 /* Non-capturing bracket: optimized */ 508 509 BEGIN_OPCODE(BRA): 510 NON_CAPTURING_BRACKET: 511 DPRINTF(("start bracket 0\n")); 512 do { 513 RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); 514 if (isMatch) 515 RRETURN; 516 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); 517 } while (*stack.currentFrame->args.instructionPtr == OP_ALT); 518 DPRINTF(("bracket 0 failed\n")); 519 RRETURN; 520 521 /* Skip over large extraction number data if encountered. */ 522 523 BEGIN_OPCODE(BRANUMBER): 524 stack.currentFrame->args.instructionPtr += 3; 525 NEXT_OPCODE; 526 527 /* End of the pattern. */ 528 529 BEGIN_OPCODE(END): 530 md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */ 531 md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */ 532 isMatch = true; 533 RRETURN; 534 535 /* Assertion brackets. Check the alternative branches in turn - the 536 matching won't pass the KET for an assertion. If any one branch matches, 537 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 538 start of each branch to move the current point backwards, so the code at 539 this level is identical to the lookahead case. */ 540 541 BEGIN_OPCODE(ASSERT): 542 do { 543 RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); 544 if (isMatch) 545 break; 546 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); 547 } while (*stack.currentFrame->args.instructionPtr == OP_ALT); 548 if (*stack.currentFrame->args.instructionPtr == OP_KET) 549 RRETURN_NO_MATCH; 550 551 /* Continue from after the assertion, updating the offsets high water 552 mark, since extracts may have been taken during the assertion. */ 553 554 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); 555 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; 556 stack.currentFrame->args.offsetTop = md.endOffsetTop; 557 NEXT_OPCODE; 558 559 /* Negative assertion: all branches must fail to match */ 560 561 BEGIN_OPCODE(ASSERT_NOT): 562 do { 563 RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); 564 if (isMatch) 565 RRETURN_NO_MATCH; 566 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); 567 } while (*stack.currentFrame->args.instructionPtr == OP_ALT); 568 569 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; 570 NEXT_OPCODE; 571 572 /* An alternation is the end of a branch; scan along to find the end of the 573 bracketed group and go to there. */ 574 575 BEGIN_OPCODE(ALT): 576 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); 577 NEXT_OPCODE; 578 579 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating 580 that it may occur zero times. It may repeat infinitely, or not at all - 581 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper 582 repeat limits are compiled as a number of copies, with the optional ones 583 preceded by BRAZERO or BRAMINZERO. */ 584 585 BEGIN_OPCODE(BRAZERO): { 586 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; 587 RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain); 588 if (isMatch) 589 RRETURN; 590 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); 591 stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE; 592 NEXT_OPCODE; 593 } 594 595 BEGIN_OPCODE(BRAMINZERO): { 596 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; 597 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket); 598 RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); 599 if (isMatch) 600 RRETURN; 601 stack.currentFrame->args.instructionPtr++; 602 NEXT_OPCODE; 603 } 604 605 /* End of a group, repeated or non-repeating. If we are at the end of 606 an assertion "group", stop matching and return 1, but record the 607 current high water mark for use by positive assertions. Do this also 608 for the "once" (not-backup up) groups. */ 609 610 BEGIN_OPCODE(KET): 611 BEGIN_OPCODE(KETRMIN): 612 BEGIN_OPCODE(KETRMAX): 613 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1); 614 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart; 615 616 /* Back up the stack of bracket start pointers. */ 617 618 stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket; 619 620 if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) { 621 md.endOffsetTop = stack.currentFrame->args.offsetTop; 622 isMatch = true; 623 RRETURN; 624 } 625 626 /* In all other cases except a conditional group we have to check the 627 group number back at the start and if necessary complete handling an 628 extraction by setting the offsets and bumping the high water mark. */ 629 630 stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA; 631 632 /* For extended extraction brackets (large number), we have to fish out 633 the number from a dummy opcode at the start. */ 634 635 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) 636 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE); 637 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; 638 639 #ifdef DEBUG 640 printf("end bracket %d", stack.currentFrame->locals.number); 641 printf("\n"); 642 #endif 643 644 /* Test for a numbered group. This includes groups called as a result 645 of recursion. Note that whole-pattern recursion is coded as a recurse 646 into group 0, so it won't be picked up here. Instead, we catch it when 647 the OP_END is reached. */ 648 649 if (stack.currentFrame->locals.number > 0) { 650 if (stack.currentFrame->locals.offset >= md.offsetMax) 651 md.offsetOverflow = true; 652 else { 653 md.offsetVector[stack.currentFrame->locals.offset] = 654 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; 655 md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject; 656 if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset) 657 stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2; 658 } 659 } 660 661 /* For a non-repeating ket, just continue at this level. This also 662 happens for a repeating ket if no characters were matched in the group. 663 This is the forcible breaking of infinite loops as implemented in Perl 664 5.005. If there is an options reset, it will get obeyed in the normal 665 course of events. */ 666 667 if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { 668 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; 669 NEXT_OPCODE; 670 } 671 672 /* The repeating kets try the rest of the pattern or restart from the 673 preceding bracket, in the appropriate order. */ 674 675 if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) { 676 RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); 677 if (isMatch) 678 RRETURN; 679 RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); 680 if (isMatch) 681 RRETURN; 682 } else { /* OP_KETRMAX */ 683 RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); 684 if (isMatch) 685 RRETURN; 686 RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); 687 if (isMatch) 688 RRETURN; 689 } 690 RRETURN; 691 692 /* Start of subject. */ 693 694 BEGIN_OPCODE(CIRC): 695 if (stack.currentFrame->args.subjectPtr != md.startSubject) 696 RRETURN_NO_MATCH; 697 stack.currentFrame->args.instructionPtr++; 698 NEXT_OPCODE; 699 700 /* After internal newline if multiline. */ 701 702 BEGIN_OPCODE(BOL): 703 if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1])) 704 RRETURN_NO_MATCH; 705 stack.currentFrame->args.instructionPtr++; 706 NEXT_OPCODE; 707 708 /* End of subject. */ 709 710 BEGIN_OPCODE(DOLL): 711 if (stack.currentFrame->args.subjectPtr < md.endSubject) 712 RRETURN_NO_MATCH; 713 stack.currentFrame->args.instructionPtr++; 714 NEXT_OPCODE; 715 716 /* Before internal newline if multiline. */ 717 718 BEGIN_OPCODE(EOL): 719 if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr)) 720 RRETURN_NO_MATCH; 721 stack.currentFrame->args.instructionPtr++; 722 NEXT_OPCODE; 723 724 /* Word boundary assertions */ 725 726 BEGIN_OPCODE(NOT_WORD_BOUNDARY): 727 BEGIN_OPCODE(WORD_BOUNDARY): { 728 bool currentCharIsWordChar = false; 729 bool previousCharIsWordChar = false; 730 731 if (stack.currentFrame->args.subjectPtr > md.startSubject) 732 previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]); 733 if (stack.currentFrame->args.subjectPtr < md.endSubject) 734 currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr); 735 736 /* Now see if the situation is what we want */ 737 bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY); 738 if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar) 739 RRETURN_NO_MATCH; 740 NEXT_OPCODE; 741 } 742 743 /* Match a single character type; inline for speed */ 744 745 BEGIN_OPCODE(NOT_NEWLINE): 746 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 747 RRETURN_NO_MATCH; 748 if (isNewline(*stack.currentFrame->args.subjectPtr++)) 749 RRETURN_NO_MATCH; 750 stack.currentFrame->args.instructionPtr++; 751 NEXT_OPCODE; 752 753 BEGIN_OPCODE(NOT_DIGIT): 754 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 755 RRETURN_NO_MATCH; 756 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) 757 RRETURN_NO_MATCH; 758 stack.currentFrame->args.instructionPtr++; 759 NEXT_OPCODE; 760 761 BEGIN_OPCODE(DIGIT): 762 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 763 RRETURN_NO_MATCH; 764 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) 765 RRETURN_NO_MATCH; 766 stack.currentFrame->args.instructionPtr++; 767 NEXT_OPCODE; 768 769 BEGIN_OPCODE(NOT_WHITESPACE): 770 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 771 RRETURN_NO_MATCH; 772 if (isSpaceChar(*stack.currentFrame->args.subjectPtr++)) 773 RRETURN_NO_MATCH; 774 stack.currentFrame->args.instructionPtr++; 775 NEXT_OPCODE; 776 777 BEGIN_OPCODE(WHITESPACE): 778 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 779 RRETURN_NO_MATCH; 780 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++)) 781 RRETURN_NO_MATCH; 782 stack.currentFrame->args.instructionPtr++; 783 NEXT_OPCODE; 784 785 BEGIN_OPCODE(NOT_WORDCHAR): 786 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 787 RRETURN_NO_MATCH; 788 if (isWordChar(*stack.currentFrame->args.subjectPtr++)) 789 RRETURN_NO_MATCH; 790 stack.currentFrame->args.instructionPtr++; 791 NEXT_OPCODE; 792 793 BEGIN_OPCODE(WORDCHAR): 794 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 795 RRETURN_NO_MATCH; 796 if (!isWordChar(*stack.currentFrame->args.subjectPtr++)) 797 RRETURN_NO_MATCH; 798 stack.currentFrame->args.instructionPtr++; 799 NEXT_OPCODE; 800 801 /* Match a back reference, possibly repeatedly. Look past the end of the 802 item to see if there is repeat information following. The code is similar 803 to that for character classes, but repeated for efficiency. Then obey 804 similar code to character type repeats - written out again for speed. 805 However, if the referenced string is the empty string, always treat 806 it as matched, any number of times (otherwise there could be infinite 807 loops). */ 808 809 BEGIN_OPCODE(REF): 810 stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */ 811 stack.currentFrame->args.instructionPtr += 3; /* Advance past item */ 812 813 /* If the reference is unset, set the length to be longer than the amount 814 of subject left; this ensures that every attempt at a match fails. We 815 can't just fail here, because of the possibility of quantifiers with zero 816 minima. */ 817 818 if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0) 819 stack.currentFrame->locals.length = 0; 820 else 821 stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset]; 822 823 /* Set up for repetition, or handle the non-repeated case */ 824 825 switch (*stack.currentFrame->args.instructionPtr) { 826 case OP_CRSTAR: 827 case OP_CRMINSTAR: 828 case OP_CRPLUS: 829 case OP_CRMINPLUS: 830 case OP_CRQUERY: 831 case OP_CRMINQUERY: 832 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); 833 break; 834 835 case OP_CRRANGE: 836 case OP_CRMINRANGE: 837 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); 838 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 839 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); 840 if (stack.currentFrame->locals.max == 0) 841 stack.currentFrame->locals.max = INT_MAX; 842 stack.currentFrame->args.instructionPtr += 5; 843 break; 844 845 default: /* No repeat follows */ 846 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) 847 RRETURN_NO_MATCH; 848 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; 849 NEXT_OPCODE; 850 } 851 852 /* If the length of the reference is zero, just continue with the 853 main loop. */ 854 855 if (stack.currentFrame->locals.length == 0) 856 NEXT_OPCODE; 857 858 /* First, ensure the minimum number of matches are present. */ 859 860 for (int i = 1; i <= min; i++) { 861 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) 862 RRETURN_NO_MATCH; 863 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; 864 } 865 866 /* If min = max, continue at the same level without recursion. 867 They are not both allowed to be zero. */ 868 869 if (min == stack.currentFrame->locals.max) 870 NEXT_OPCODE; 871 872 /* If minimizing, keep trying and advancing the pointer */ 873 874 if (minimize) { 875 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 876 RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 877 if (isMatch) 878 RRETURN; 879 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) 880 RRETURN; 881 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; 882 } 883 /* Control never reaches here */ 884 } 885 886 /* If maximizing, find the longest string and work backwards */ 887 888 else { 889 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; 890 for (int i = min; i < stack.currentFrame->locals.max; i++) { 891 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) 892 break; 893 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; 894 } 895 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { 896 RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 897 if (isMatch) 898 RRETURN; 899 stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length; 900 } 901 RRETURN_NO_MATCH; 902 } 903 /* Control never reaches here */ 904 905 /* Match a bit-mapped character class, possibly repeatedly. This op code is 906 used when all the characters in the class have values in the range 0-255, 907 and either the matching is caseful, or the characters are in the range 908 0-127 when UTF-8 processing is enabled. The only difference between 909 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 910 encountered. 911 912 First, look past the end of the item to see if there is repeat information 913 following. Then obey similar code to character type repeats - written out 914 again for speed. */ 915 916 BEGIN_OPCODE(NCLASS): 917 BEGIN_OPCODE(CLASS): 918 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */ 919 stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */ 920 921 switch (*stack.currentFrame->args.instructionPtr) { 922 case OP_CRSTAR: 923 case OP_CRMINSTAR: 924 case OP_CRPLUS: 925 case OP_CRMINPLUS: 926 case OP_CRQUERY: 927 case OP_CRMINQUERY: 928 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); 929 break; 930 931 case OP_CRRANGE: 932 case OP_CRMINRANGE: 933 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); 934 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 935 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); 936 if (stack.currentFrame->locals.max == 0) 937 stack.currentFrame->locals.max = INT_MAX; 938 stack.currentFrame->args.instructionPtr += 5; 939 break; 940 941 default: /* No repeat follows */ 942 min = stack.currentFrame->locals.max = 1; 943 break; 944 } 945 946 /* First, ensure the minimum number of matches are present. */ 947 948 for (int i = 1; i <= min; i++) { 949 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 950 RRETURN_NO_MATCH; 951 int c = *stack.currentFrame->args.subjectPtr++; 952 if (c > 255) { 953 if (stack.currentFrame->locals.data[-1] == OP_CLASS) 954 RRETURN_NO_MATCH; 955 } else { 956 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) 957 RRETURN_NO_MATCH; 958 } 959 } 960 961 /* If max == min we can continue with the main loop without the 962 need to recurse. */ 963 964 if (min == stack.currentFrame->locals.max) 965 NEXT_OPCODE; 966 967 /* If minimizing, keep testing the rest of the expression and advancing 968 the pointer while it matches the class. */ 969 if (minimize) { 970 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 971 RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 972 if (isMatch) 973 RRETURN; 974 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) 975 RRETURN; 976 int c = *stack.currentFrame->args.subjectPtr++; 977 if (c > 255) { 978 if (stack.currentFrame->locals.data[-1] == OP_CLASS) 979 RRETURN; 980 } else { 981 if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0) 982 RRETURN; 983 } 984 } 985 /* Control never reaches here */ 986 } 987 /* If maximizing, find the longest possible run, then work backwards. */ 988 else { 989 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; 990 991 for (int i = min; i < stack.currentFrame->locals.max; i++) { 992 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 993 break; 994 int c = *stack.currentFrame->args.subjectPtr; 995 if (c > 255) { 996 if (stack.currentFrame->locals.data[-1] == OP_CLASS) 997 break; 998 } else { 999 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) 1000 break; 1001 } 1002 ++stack.currentFrame->args.subjectPtr; 1003 } 1004 for (;;) { 1005 RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1006 if (isMatch) 1007 RRETURN; 1008 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) 1009 break; /* Stop if tried at original pos */ 1010 } 1011 1012 RRETURN; 1013 } 1014 /* Control never reaches here */ 1015 1016 /* Match an extended character class. */ 1017 1018 BEGIN_OPCODE(XCLASS): 1019 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */ 1020 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */ 1021 1022 switch (*stack.currentFrame->args.instructionPtr) { 1023 case OP_CRSTAR: 1024 case OP_CRMINSTAR: 1025 case OP_CRPLUS: 1026 case OP_CRMINPLUS: 1027 case OP_CRQUERY: 1028 case OP_CRMINQUERY: 1029 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max); 1030 break; 1031 1032 case OP_CRRANGE: 1033 case OP_CRMINRANGE: 1034 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); 1035 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 1036 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3); 1037 if (stack.currentFrame->locals.max == 0) 1038 stack.currentFrame->locals.max = INT_MAX; 1039 stack.currentFrame->args.instructionPtr += 5; 1040 break; 1041 1042 default: /* No repeat follows */ 1043 min = stack.currentFrame->locals.max = 1; 1044 } 1045 1046 /* First, ensure the minimum number of matches are present. */ 1047 1048 for (int i = 1; i <= min; i++) { 1049 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1050 RRETURN_NO_MATCH; 1051 int c = *stack.currentFrame->args.subjectPtr++; 1052 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) 1053 RRETURN_NO_MATCH; 1054 } 1055 1056 /* If max == min we can continue with the main loop without the 1057 need to recurse. */ 1058 1059 if (min == stack.currentFrame->locals.max) 1060 NEXT_OPCODE; 1061 1062 /* If minimizing, keep testing the rest of the expression and advancing 1063 the pointer while it matches the class. */ 1064 1065 if (minimize) { 1066 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 1067 RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1068 if (isMatch) 1069 RRETURN; 1070 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) 1071 RRETURN; 1072 int c = *stack.currentFrame->args.subjectPtr++; 1073 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) 1074 RRETURN; 1075 } 1076 /* Control never reaches here */ 1077 } 1078 1079 /* If maximizing, find the longest possible run, then work backwards. */ 1080 1081 else { 1082 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; 1083 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1084 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1085 break; 1086 int c = *stack.currentFrame->args.subjectPtr; 1087 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) 1088 break; 1089 ++stack.currentFrame->args.subjectPtr; 1090 } 1091 for(;;) { 1092 RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1093 if (isMatch) 1094 RRETURN; 1095 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) 1096 break; /* Stop if tried at original pos */ 1097 } 1098 RRETURN; 1099 } 1100 1101 /* Control never reaches here */ 1102 1103 /* Match a single character, casefully */ 1104 1105 BEGIN_OPCODE(CHAR): 1106 stack.currentFrame->locals.length = 1; 1107 stack.currentFrame->args.instructionPtr++; 1108 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); 1109 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; 1110 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1111 RRETURN_NO_MATCH; 1112 if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++) 1113 RRETURN_NO_MATCH; 1114 NEXT_OPCODE; 1115 1116 /* Match a single character, caselessly */ 1117 1118 BEGIN_OPCODE(CHAR_IGNORING_CASE): { 1119 stack.currentFrame->locals.length = 1; 1120 stack.currentFrame->args.instructionPtr++; 1121 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); 1122 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; 1123 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1124 RRETURN_NO_MATCH; 1125 int dc = *stack.currentFrame->args.subjectPtr++; 1126 if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc) 1127 RRETURN_NO_MATCH; 1128 NEXT_OPCODE; 1129 } 1130 1131 /* Match a single ASCII character. */ 1132 1133 BEGIN_OPCODE(ASCII_CHAR): 1134 if (md.endSubject == stack.currentFrame->args.subjectPtr) 1135 RRETURN_NO_MATCH; 1136 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1]) 1137 RRETURN_NO_MATCH; 1138 ++stack.currentFrame->args.subjectPtr; 1139 stack.currentFrame->args.instructionPtr += 2; 1140 NEXT_OPCODE; 1141 1142 /* Match one of two cases of an ASCII letter. */ 1143 1144 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE): 1145 if (md.endSubject == stack.currentFrame->args.subjectPtr) 1146 RRETURN_NO_MATCH; 1147 if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1]) 1148 RRETURN_NO_MATCH; 1149 ++stack.currentFrame->args.subjectPtr; 1150 stack.currentFrame->args.instructionPtr += 2; 1151 NEXT_OPCODE; 1152 1153 /* Match a single character repeatedly; different opcodes share code. */ 1154 1155 BEGIN_OPCODE(EXACT): 1156 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 1157 minimize = false; 1158 stack.currentFrame->args.instructionPtr += 3; 1159 goto REPEATCHAR; 1160 1161 BEGIN_OPCODE(UPTO): 1162 BEGIN_OPCODE(MINUPTO): 1163 min = 0; 1164 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 1165 minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO; 1166 stack.currentFrame->args.instructionPtr += 3; 1167 goto REPEATCHAR; 1168 1169 BEGIN_OPCODE(STAR): 1170 BEGIN_OPCODE(MINSTAR): 1171 BEGIN_OPCODE(PLUS): 1172 BEGIN_OPCODE(MINPLUS): 1173 BEGIN_OPCODE(QUERY): 1174 BEGIN_OPCODE(MINQUERY): 1175 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max); 1176 1177 /* Common code for all repeated single-character matches. We can give 1178 up quickly if there are fewer than the minimum number of characters left in 1179 the subject. */ 1180 1181 REPEATCHAR: 1182 1183 stack.currentFrame->locals.length = 1; 1184 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); 1185 if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr) 1186 RRETURN_NO_MATCH; 1187 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; 1188 1189 if (stack.currentFrame->locals.fc <= 0xFFFF) { 1190 othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1; 1191 1192 for (int i = 1; i <= min; i++) { 1193 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) 1194 RRETURN_NO_MATCH; 1195 ++stack.currentFrame->args.subjectPtr; 1196 } 1197 1198 if (min == stack.currentFrame->locals.max) 1199 NEXT_OPCODE; 1200 1201 if (minimize) { 1202 stack.currentFrame->locals.repeatOthercase = othercase; 1203 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 1204 RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1205 if (isMatch) 1206 RRETURN; 1207 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) 1208 RRETURN; 1209 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase) 1210 RRETURN; 1211 ++stack.currentFrame->args.subjectPtr; 1212 } 1213 /* Control never reaches here */ 1214 } else { 1215 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; 1216 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1217 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1218 break; 1219 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) 1220 break; 1221 ++stack.currentFrame->args.subjectPtr; 1222 } 1223 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { 1224 RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1225 if (isMatch) 1226 RRETURN; 1227 --stack.currentFrame->args.subjectPtr; 1228 } 1229 RRETURN_NO_MATCH; 1230 } 1231 /* Control never reaches here */ 1232 } else { 1233 /* No case on surrogate pairs, so no need to bother with "othercase". */ 1234 1235 for (int i = 1; i <= min; i++) { 1236 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) 1237 RRETURN_NO_MATCH; 1238 stack.currentFrame->args.subjectPtr += 2; 1239 } 1240 1241 if (min == stack.currentFrame->locals.max) 1242 NEXT_OPCODE; 1243 1244 if (minimize) { 1245 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 1246 RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1247 if (isMatch) 1248 RRETURN; 1249 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) 1250 RRETURN; 1251 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) 1252 RRETURN; 1253 stack.currentFrame->args.subjectPtr += 2; 1254 } 1255 /* Control never reaches here */ 1256 } else { 1257 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; 1258 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1259 if (stack.currentFrame->args.subjectPtr > md.endSubject - 2) 1260 break; 1261 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) 1262 break; 1263 stack.currentFrame->args.subjectPtr += 2; 1264 } 1265 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { 1266 RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1267 if (isMatch) 1268 RRETURN; 1269 stack.currentFrame->args.subjectPtr -= 2; 1270 } 1271 RRETURN_NO_MATCH; 1272 } 1273 /* Control never reaches here */ 1274 } 1275 /* Control never reaches here */ 1276 1277 /* Match a negated single one-byte character. */ 1278 1279 BEGIN_OPCODE(NOT): { 1280 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1281 RRETURN_NO_MATCH; 1282 int b = stack.currentFrame->args.instructionPtr[1]; 1283 int c = *stack.currentFrame->args.subjectPtr++; 1284 stack.currentFrame->args.instructionPtr += 2; 1285 if (md.ignoreCase) { 1286 if (c < 128) 1287 c = toLowerCase(c); 1288 if (toLowerCase(b) == c) 1289 RRETURN_NO_MATCH; 1290 } else { 1291 if (b == c) 1292 RRETURN_NO_MATCH; 1293 } 1294 NEXT_OPCODE; 1295 } 1296 1297 /* Match a negated single one-byte character repeatedly. This is almost a 1298 repeat of the code for a repeated single character, but I haven't found a 1299 nice way of commoning these up that doesn't require a test of the 1300 positive/negative option for each character match. Maybe that wouldn't add 1301 very much to the time taken, but character matching *is* what this is all 1302 about... */ 1303 1304 BEGIN_OPCODE(NOTEXACT): 1305 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 1306 minimize = false; 1307 stack.currentFrame->args.instructionPtr += 3; 1308 goto REPEATNOTCHAR; 1309 1310 BEGIN_OPCODE(NOTUPTO): 1311 BEGIN_OPCODE(NOTMINUPTO): 1312 min = 0; 1313 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 1314 minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO; 1315 stack.currentFrame->args.instructionPtr += 3; 1316 goto REPEATNOTCHAR; 1317 1318 BEGIN_OPCODE(NOTSTAR): 1319 BEGIN_OPCODE(NOTMINSTAR): 1320 BEGIN_OPCODE(NOTPLUS): 1321 BEGIN_OPCODE(NOTMINPLUS): 1322 BEGIN_OPCODE(NOTQUERY): 1323 BEGIN_OPCODE(NOTMINQUERY): 1324 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max); 1325 1326 /* Common code for all repeated single-byte matches. We can give up quickly 1327 if there are fewer than the minimum number of bytes left in the 1328 subject. */ 1329 1330 REPEATNOTCHAR: 1331 if (min > md.endSubject - stack.currentFrame->args.subjectPtr) 1332 RRETURN_NO_MATCH; 1333 stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++; 1334 1335 /* The code is duplicated for the caseless and caseful cases, for speed, 1336 since matching characters is likely to be quite common. First, ensure the 1337 minimum number of matches are present. If min = max, continue at the same 1338 level without recursing. Otherwise, if minimizing, keep trying the rest of 1339 the expression and advancing one matching character if failing, up to the 1340 maximum. Alternatively, if maximizing, find the maximum number of 1341 characters and work backwards. */ 1342 1343 DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max)); 1344 1345 if (md.ignoreCase) { 1346 if (stack.currentFrame->locals.fc < 128) 1347 stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc); 1348 1349 for (int i = 1; i <= min; i++) { 1350 int d = *stack.currentFrame->args.subjectPtr++; 1351 if (d < 128) 1352 d = toLowerCase(d); 1353 if (stack.currentFrame->locals.fc == d) 1354 RRETURN_NO_MATCH; 1355 } 1356 1357 if (min == stack.currentFrame->locals.max) 1358 NEXT_OPCODE; 1359 1360 if (minimize) { 1361 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 1362 RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1363 if (isMatch) 1364 RRETURN; 1365 int d = *stack.currentFrame->args.subjectPtr++; 1366 if (d < 128) 1367 d = toLowerCase(d); 1368 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) 1369 RRETURN; 1370 } 1371 /* Control never reaches here */ 1372 } 1373 1374 /* Maximize case */ 1375 1376 else { 1377 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; 1378 1379 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1380 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1381 break; 1382 int d = *stack.currentFrame->args.subjectPtr; 1383 if (d < 128) 1384 d = toLowerCase(d); 1385 if (stack.currentFrame->locals.fc == d) 1386 break; 1387 ++stack.currentFrame->args.subjectPtr; 1388 } 1389 for (;;) { 1390 RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1391 if (isMatch) 1392 RRETURN; 1393 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) 1394 break; /* Stop if tried at original pos */ 1395 } 1396 1397 RRETURN; 1398 } 1399 /* Control never reaches here */ 1400 } 1401 1402 /* Caseful comparisons */ 1403 1404 else { 1405 for (int i = 1; i <= min; i++) { 1406 int d = *stack.currentFrame->args.subjectPtr++; 1407 if (stack.currentFrame->locals.fc == d) 1408 RRETURN_NO_MATCH; 1409 } 1410 1411 if (min == stack.currentFrame->locals.max) 1412 NEXT_OPCODE; 1413 1414 if (minimize) { 1415 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 1416 RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1417 if (isMatch) 1418 RRETURN; 1419 int d = *stack.currentFrame->args.subjectPtr++; 1420 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) 1421 RRETURN; 1422 } 1423 /* Control never reaches here */ 1424 } 1425 1426 /* Maximize case */ 1427 1428 else { 1429 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; 1430 1431 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1432 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1433 break; 1434 int d = *stack.currentFrame->args.subjectPtr; 1435 if (stack.currentFrame->locals.fc == d) 1436 break; 1437 ++stack.currentFrame->args.subjectPtr; 1438 } 1439 for (;;) { 1440 RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1441 if (isMatch) 1442 RRETURN; 1443 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) 1444 break; /* Stop if tried at original pos */ 1445 } 1446 1447 RRETURN; 1448 } 1449 } 1450 /* Control never reaches here */ 1451 1452 /* Match a single character type repeatedly; several different opcodes 1453 share code. This is very similar to the code for single characters, but we 1454 repeat it in the interests of efficiency. */ 1455 1456 BEGIN_OPCODE(TYPEEXACT): 1457 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 1458 minimize = true; 1459 stack.currentFrame->args.instructionPtr += 3; 1460 goto REPEATTYPE; 1461 1462 BEGIN_OPCODE(TYPEUPTO): 1463 BEGIN_OPCODE(TYPEMINUPTO): 1464 min = 0; 1465 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1); 1466 minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO; 1467 stack.currentFrame->args.instructionPtr += 3; 1468 goto REPEATTYPE; 1469 1470 BEGIN_OPCODE(TYPESTAR): 1471 BEGIN_OPCODE(TYPEMINSTAR): 1472 BEGIN_OPCODE(TYPEPLUS): 1473 BEGIN_OPCODE(TYPEMINPLUS): 1474 BEGIN_OPCODE(TYPEQUERY): 1475 BEGIN_OPCODE(TYPEMINQUERY): 1476 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max); 1477 1478 /* Common code for all repeated single character type matches. Note that 1479 in UTF-8 mode, '.' matches a character of any length, but for the other 1480 character types, the valid characters are all one-byte long. */ 1481 1482 REPEATTYPE: 1483 stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */ 1484 1485 /* First, ensure the minimum number of matches are present. Use inline 1486 code for maximizing the speed, and do the type test once at the start 1487 (i.e. keep it out of the loop). Also we can test that there are at least 1488 the minimum number of characters before we start. */ 1489 1490 if (min > md.endSubject - stack.currentFrame->args.subjectPtr) 1491 RRETURN_NO_MATCH; 1492 if (min > 0) { 1493 switch (stack.currentFrame->locals.ctype) { 1494 case OP_NOT_NEWLINE: 1495 for (int i = 1; i <= min; i++) { 1496 if (isNewline(*stack.currentFrame->args.subjectPtr)) 1497 RRETURN_NO_MATCH; 1498 ++stack.currentFrame->args.subjectPtr; 1499 } 1500 break; 1501 1502 case OP_NOT_DIGIT: 1503 for (int i = 1; i <= min; i++) { 1504 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr)) 1505 RRETURN_NO_MATCH; 1506 ++stack.currentFrame->args.subjectPtr; 1507 } 1508 break; 1509 1510 case OP_DIGIT: 1511 for (int i = 1; i <= min; i++) { 1512 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr)) 1513 RRETURN_NO_MATCH; 1514 ++stack.currentFrame->args.subjectPtr; 1515 } 1516 break; 1517 1518 case OP_NOT_WHITESPACE: 1519 for (int i = 1; i <= min; i++) { 1520 if (isSpaceChar(*stack.currentFrame->args.subjectPtr)) 1521 RRETURN_NO_MATCH; 1522 ++stack.currentFrame->args.subjectPtr; 1523 } 1524 break; 1525 1526 case OP_WHITESPACE: 1527 for (int i = 1; i <= min; i++) { 1528 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr)) 1529 RRETURN_NO_MATCH; 1530 ++stack.currentFrame->args.subjectPtr; 1531 } 1532 break; 1533 1534 case OP_NOT_WORDCHAR: 1535 for (int i = 1; i <= min; i++) { 1536 if (isWordChar(*stack.currentFrame->args.subjectPtr)) 1537 RRETURN_NO_MATCH; 1538 ++stack.currentFrame->args.subjectPtr; 1539 } 1540 break; 1541 1542 case OP_WORDCHAR: 1543 for (int i = 1; i <= min; i++) { 1544 if (!isWordChar(*stack.currentFrame->args.subjectPtr)) 1545 RRETURN_NO_MATCH; 1546 ++stack.currentFrame->args.subjectPtr; 1547 } 1548 break; 1549 1550 default: 1551 ASSERT_NOT_REACHED(); 1552 return matchError(JSRegExpErrorInternal, stack); 1553 } /* End switch(stack.currentFrame->locals.ctype) */ 1554 } 1555 1556 /* If min = max, continue at the same level without recursing */ 1557 1558 if (min == stack.currentFrame->locals.max) 1559 NEXT_OPCODE; 1560 1561 /* If minimizing, we have to test the rest of the pattern before each 1562 subsequent match. */ 1563 1564 if (minimize) { 1565 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { 1566 RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1567 if (isMatch) 1568 RRETURN; 1569 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) 1570 RRETURN; 1571 1572 int c = *stack.currentFrame->args.subjectPtr++; 1573 switch (stack.currentFrame->locals.ctype) { 1574 case OP_NOT_NEWLINE: 1575 if (isNewline(c)) 1576 RRETURN; 1577 break; 1578 1579 case OP_NOT_DIGIT: 1580 if (isASCIIDigit(c)) 1581 RRETURN; 1582 break; 1583 1584 case OP_DIGIT: 1585 if (!isASCIIDigit(c)) 1586 RRETURN; 1587 break; 1588 1589 case OP_NOT_WHITESPACE: 1590 if (isSpaceChar(c)) 1591 RRETURN; 1592 break; 1593 1594 case OP_WHITESPACE: 1595 if (!isSpaceChar(c)) 1596 RRETURN; 1597 break; 1598 1599 case OP_NOT_WORDCHAR: 1600 if (isWordChar(c)) 1601 RRETURN; 1602 break; 1603 1604 case OP_WORDCHAR: 1605 if (!isWordChar(c)) 1606 RRETURN; 1607 break; 1608 1609 default: 1610 ASSERT_NOT_REACHED(); 1611 return matchError(JSRegExpErrorInternal, stack); 1612 } 1613 } 1614 /* Control never reaches here */ 1615 } 1616 1617 /* If maximizing it is worth using inline code for speed, doing the type 1618 test once at the start (i.e. keep it out of the loop). */ 1619 1620 else { 1621 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */ 1622 1623 switch (stack.currentFrame->locals.ctype) { 1624 case OP_NOT_NEWLINE: 1625 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1626 if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr)) 1627 break; 1628 stack.currentFrame->args.subjectPtr++; 1629 } 1630 break; 1631 1632 case OP_NOT_DIGIT: 1633 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1634 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1635 break; 1636 int c = *stack.currentFrame->args.subjectPtr; 1637 if (isASCIIDigit(c)) 1638 break; 1639 ++stack.currentFrame->args.subjectPtr; 1640 } 1641 break; 1642 1643 case OP_DIGIT: 1644 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1645 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1646 break; 1647 int c = *stack.currentFrame->args.subjectPtr; 1648 if (!isASCIIDigit(c)) 1649 break; 1650 ++stack.currentFrame->args.subjectPtr; 1651 } 1652 break; 1653 1654 case OP_NOT_WHITESPACE: 1655 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1656 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1657 break; 1658 int c = *stack.currentFrame->args.subjectPtr; 1659 if (isSpaceChar(c)) 1660 break; 1661 ++stack.currentFrame->args.subjectPtr; 1662 } 1663 break; 1664 1665 case OP_WHITESPACE: 1666 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1667 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1668 break; 1669 int c = *stack.currentFrame->args.subjectPtr; 1670 if (!isSpaceChar(c)) 1671 break; 1672 ++stack.currentFrame->args.subjectPtr; 1673 } 1674 break; 1675 1676 case OP_NOT_WORDCHAR: 1677 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1678 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1679 break; 1680 int c = *stack.currentFrame->args.subjectPtr; 1681 if (isWordChar(c)) 1682 break; 1683 ++stack.currentFrame->args.subjectPtr; 1684 } 1685 break; 1686 1687 case OP_WORDCHAR: 1688 for (int i = min; i < stack.currentFrame->locals.max; i++) { 1689 if (stack.currentFrame->args.subjectPtr >= md.endSubject) 1690 break; 1691 int c = *stack.currentFrame->args.subjectPtr; 1692 if (!isWordChar(c)) 1693 break; 1694 ++stack.currentFrame->args.subjectPtr; 1695 } 1696 break; 1697 1698 default: 1699 ASSERT_NOT_REACHED(); 1700 return matchError(JSRegExpErrorInternal, stack); 1701 } 1702 1703 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */ 1704 1705 for (;;) { 1706 RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); 1707 if (isMatch) 1708 RRETURN; 1709 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) 1710 break; /* Stop if tried at original pos */ 1711 } 1712 1713 /* Get here if we can't make it match with any permitted repetitions */ 1714 1715 RRETURN; 1716 } 1717 /* Control never reaches here */ 1718 1719 BEGIN_OPCODE(CRMINPLUS): 1720 BEGIN_OPCODE(CRMINQUERY): 1721 BEGIN_OPCODE(CRMINRANGE): 1722 BEGIN_OPCODE(CRMINSTAR): 1723 BEGIN_OPCODE(CRPLUS): 1724 BEGIN_OPCODE(CRQUERY): 1725 BEGIN_OPCODE(CRRANGE): 1726 BEGIN_OPCODE(CRSTAR): 1727 ASSERT_NOT_REACHED(); 1728 return matchError(JSRegExpErrorInternal, stack); 1729 1730 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP 1731 CAPTURING_BRACKET: 1732 #else 1733 default: 1734 #endif 1735 /* Opening capturing bracket. If there is space in the offset vector, save 1736 the current subject position in the working slot at the top of the vector. We 1737 mustn't change the current values of the data slot, because they may be set 1738 from a previous iteration of this group, and be referred to by a reference 1739 inside the group. 1740 1741 If the bracket fails to match, we need to restore this value and also the 1742 values of the final offsets, in case they were set by a previous iteration of 1743 the same bracket. 1744 1745 If there isn't enough space in the offset vector, treat this as if it were a 1746 non-capturing bracket. Don't worry about setting the flag for the error case 1747 here; that is handled in the code for KET. */ 1748 1749 ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA); 1750 1751 stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA; 1752 1753 /* For extended extraction brackets (large number), we have to fish out the 1754 number from a dummy opcode at the start. */ 1755 1756 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) 1757 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE); 1758 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; 1759 1760 #ifdef DEBUG 1761 printf("start bracket %d subject=", stack.currentFrame->locals.number); 1762 pchars(stack.currentFrame->args.subjectPtr, 16, true, md); 1763 printf("\n"); 1764 #endif 1765 1766 if (stack.currentFrame->locals.offset < md.offsetMax) { 1767 stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset]; 1768 stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1]; 1769 stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; 1770 1771 DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3)); 1772 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject; 1773 1774 do { 1775 RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); 1776 if (isMatch) 1777 RRETURN; 1778 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); 1779 } while (*stack.currentFrame->args.instructionPtr == OP_ALT); 1780 1781 DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number)); 1782 1783 md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1; 1784 md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2; 1785 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3; 1786 1787 RRETURN; 1788 } 1789 1790 /* Insufficient room for saving captured contents */ 1791 1792 goto NON_CAPTURING_BRACKET; 1793 } 1794 1795 /* Do not stick any code in here without much thought; it is assumed 1796 that "continue" in the code above comes out to here to repeat the main 1797 loop. */ 1798 1799 } /* End of main loop */ 1800 1801 ASSERT_NOT_REACHED(); 1802 1803 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION 1804 1805 RRETURN_SWITCH: 1806 switch (stack.currentFrame->returnLocation) { 1807 case 0: goto RETURN; 1808 case 1: goto RRETURN_1; 1809 case 2: goto RRETURN_2; 1810 case 6: goto RRETURN_6; 1811 case 7: goto RRETURN_7; 1812 case 14: goto RRETURN_14; 1813 case 15: goto RRETURN_15; 1814 case 16: goto RRETURN_16; 1815 case 17: goto RRETURN_17; 1816 case 18: goto RRETURN_18; 1817 case 19: goto RRETURN_19; 1818 case 20: goto RRETURN_20; 1819 case 21: goto RRETURN_21; 1820 case 22: goto RRETURN_22; 1821 case 24: goto RRETURN_24; 1822 case 26: goto RRETURN_26; 1823 case 27: goto RRETURN_27; 1824 case 28: goto RRETURN_28; 1825 case 29: goto RRETURN_29; 1826 case 30: goto RRETURN_30; 1827 case 31: goto RRETURN_31; 1828 case 38: goto RRETURN_38; 1829 case 40: goto RRETURN_40; 1830 case 42: goto RRETURN_42; 1831 case 44: goto RRETURN_44; 1832 case 48: goto RRETURN_48; 1833 case 52: goto RRETURN_52; 1834 } 1835 1836 ASSERT_NOT_REACHED(); 1837 return matchError(JSRegExpErrorInternal, stack); 1838 1839 #endif 1840 1841 RETURN: 1842 return isMatch; 1843 } 1844 1845 1846 /************************************************* 1847 * Execute a Regular Expression * 1848 *************************************************/ 1849 1850 /* This function applies a compiled re to a subject string and picks out 1851 portions of the string if it matches. Two elements in the vector are set for 1852 each substring: the offsets to the start and end of the substring. 1853 1854 Arguments: 1855 re points to the compiled expression 1856 extra_data points to extra data or is NULL 1857 subject points to the subject string 1858 length length of subject string (may contain binary zeros) 1859 start_offset where to start in the subject string 1860 options option bits 1861 offsets points to a vector of ints to be filled in with offsets 1862 offsetCount the number of elements in the vector 1863 1864 Returns: > 0 => success; value is the number of elements filled in 1865 = 0 => success, but offsets is not big enough 1866 -1 => failed to match 1867 < -1 => some kind of unexpected problem 1868 */ 1869 1870 static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart) 1871 { 1872 // If firstByte is set, try scanning to the first instance of that byte 1873 // no need to try and match against any earlier part of the subject string. 1874 if (firstByte >= 0) { 1875 UChar firstChar = firstByte; 1876 if (firstByteIsCaseless) 1877 while (subjectPtr < endSubject) { 1878 int c = *subjectPtr; 1879 if (c > 127) 1880 break; 1881 if (toLowerCase(c) == firstChar) 1882 break; 1883 subjectPtr++; 1884 } 1885 else { 1886 while (subjectPtr < endSubject && *subjectPtr != firstChar) 1887 subjectPtr++; 1888 } 1889 } else if (useMultiLineFirstCharOptimization) { 1890 /* Or to just after \n for a multiline match if possible */ 1891 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07 1892 if (subjectPtr > originalSubjectStart) { 1893 while (subjectPtr < endSubject && !isNewline(subjectPtr[-1])) 1894 subjectPtr++; 1895 } 1896 } 1897 } 1898 1899 static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr) 1900 { 1901 /* If reqByte is set, we know that that character must appear in the subject 1902 for the match to succeed. If the first character is set, reqByte must be 1903 later in the subject; otherwise the test starts at the match point. This 1904 optimization can save a huge amount of backtracking in patterns with nested 1905 unlimited repeats that aren't going to match. Writing separate code for 1906 cased/caseless versions makes it go faster, as does using an autoincrement 1907 and backing off on a match. 1908 1909 HOWEVER: when the subject string is very, very long, searching to its end can 1910 take a long time, and give bad performance on quite ordinary patterns. This 1911 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 1912 don't do this when the string is sufficiently long. 1913 */ 1914 1915 if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) { 1916 const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0); 1917 1918 /* We don't need to repeat the search if we haven't yet reached the 1919 place we found it at last time. */ 1920 1921 if (p > reqBytePtr) { 1922 if (reqByteIsCaseless) { 1923 while (p < endSubject) { 1924 int pp = *p++; 1925 if (pp == reqByte || pp == reqByte2) { 1926 p--; 1927 break; 1928 } 1929 } 1930 } else { 1931 while (p < endSubject) { 1932 if (*p++ == reqByte) { 1933 p--; 1934 break; 1935 } 1936 } 1937 } 1938 1939 /* If we can't find the required character, break the matching loop */ 1940 1941 if (p >= endSubject) 1942 return true; 1943 1944 /* If we have found the required character, save the point where we 1945 found it, so that we don't search again next time round the loop if 1946 the start hasn't passed this character yet. */ 1947 1948 reqBytePtr = p; 1949 } 1950 } 1951 return false; 1952 } 1953 1954 int jsRegExpExecute(const JSRegExp* re, 1955 const UChar* subject, int length, int start_offset, int* offsets, 1956 int offsetCount) 1957 { 1958 ASSERT(re); 1959 ASSERT(subject || !length); 1960 ASSERT(offsetCount >= 0); 1961 ASSERT(offsets || offsetCount == 0); 1962 1963 HistogramTimeLogger logger(re); 1964 1965 MatchData matchBlock; 1966 matchBlock.startSubject = subject; 1967 matchBlock.endSubject = matchBlock.startSubject + length; 1968 const UChar* endSubject = matchBlock.endSubject; 1969 1970 matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption); 1971 matchBlock.ignoreCase = (re->options & IgnoreCaseOption); 1972 1973 /* If the expression has got more back references than the offsets supplied can 1974 hold, we get a temporary chunk of working store to use during the matching. 1975 Otherwise, we can use the vector supplied, rounding down its size to a multiple 1976 of 3. */ 1977 1978 int ocount = offsetCount - (offsetCount % 3); 1979 1980 // FIXME: This is lame that we have to second-guess our caller here. 1981 // The API should change to either fail-hard when we don't have enough offset space 1982 // or that we shouldn't ask our callers to pre-allocate in the first place. 1983 bool usingTemporaryOffsets = false; 1984 if (re->topBackref > 0 && re->topBackref >= ocount/3) { 1985 ocount = re->topBackref * 3 + 3; 1986 matchBlock.offsetVector = new int[ocount]; 1987 if (!matchBlock.offsetVector) 1988 return JSRegExpErrorNoMemory; 1989 usingTemporaryOffsets = true; 1990 } else 1991 matchBlock.offsetVector = offsets; 1992 1993 matchBlock.offsetEnd = ocount; 1994 matchBlock.offsetMax = (2*ocount)/3; 1995 matchBlock.offsetOverflow = false; 1996 1997 /* Compute the minimum number of offsets that we need to reset each time. Doing 1998 this makes a huge difference to execution time when there aren't many brackets 1999 in the pattern. */ 2000 2001 int resetCount = 2 + re->topBracket * 2; 2002 if (resetCount > offsetCount) 2003 resetCount = ocount; 2004 2005 /* Reset the working variable associated with each extraction. These should 2006 never be used unless previously set, but they get saved and restored, and so we 2007 initialize them to avoid reading uninitialized locations. */ 2008 2009 if (matchBlock.offsetVector) { 2010 int* iptr = matchBlock.offsetVector + ocount; 2011 int* iend = iptr - resetCount/2 + 1; 2012 while (--iptr >= iend) 2013 *iptr = -1; 2014 } 2015 2016 /* Set up the first character to match, if available. The firstByte value is 2017 never set for an anchored regular expression, but the anchoring may be forced 2018 at run time, so we have to test for anchoring. The first char may be unset for 2019 an unanchored pattern, of course. If there's no first char and the pattern was 2020 studied, there may be a bitmap of possible first characters. */ 2021 2022 bool firstByteIsCaseless = false; 2023 int firstByte = -1; 2024 if (re->options & UseFirstByteOptimizationOption) { 2025 firstByte = re->firstByte & 255; 2026 if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE))) 2027 firstByte = toLowerCase(firstByte); 2028 } 2029 2030 /* For anchored or unanchored matches, there may be a "last known required 2031 character" set. */ 2032 2033 bool reqByteIsCaseless = false; 2034 int reqByte = -1; 2035 int reqByte2 = -1; 2036 if (re->options & UseRequiredByteOptimizationOption) { 2037 reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well... 2038 reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE); 2039 reqByte2 = flipCase(reqByte); 2040 } 2041 2042 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 2043 the loop runs just once. */ 2044 2045 const UChar* startMatch = subject + start_offset; 2046 const UChar* reqBytePtr = startMatch - 1; 2047 bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption; 2048 2049 do { 2050 /* Reset the maximum number of extractions we might see. */ 2051 if (matchBlock.offsetVector) { 2052 int* iptr = matchBlock.offsetVector; 2053 int* iend = iptr + resetCount; 2054 while (iptr < iend) 2055 *iptr++ = -1; 2056 } 2057 2058 tryFirstByteOptimization(startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset); 2059 if (tryRequiredByteOptimization(startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, firstByte >= 0, reqBytePtr)) 2060 break; 2061 2062 /* When a match occurs, substrings will be set for all internal extractions; 2063 we just need to set up the whole thing as substring 0 before returning. If 2064 there were too many extractions, set the return code to zero. In the case 2065 where we had to get some local store to hold offsets for backreferences, copy 2066 those back references that we can. In this case there need not be overflow 2067 if certain parts of the pattern were not used. */ 2068 2069 /* The code starts after the JSRegExp block and the capture name table. */ 2070 const unsigned char* start_code = (const unsigned char*)(re + 1); 2071 2072 int returnCode = match(startMatch, start_code, 2, matchBlock); 2073 2074 /* When the result is no match, advance the pointer to the next character 2075 and continue. */ 2076 if (returnCode == 0) { 2077 startMatch++; 2078 continue; 2079 } 2080 2081 if (returnCode != 1) { 2082 ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory); 2083 DPRINTF((">>>> error: returning %d\n", returnCode)); 2084 return returnCode; 2085 } 2086 2087 /* We have a match! Copy the offset information from temporary store if 2088 necessary */ 2089 2090 if (usingTemporaryOffsets) { 2091 if (offsetCount >= 4) { 2092 memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetCount - 2) * sizeof(int)); 2093 DPRINTF(("Copied offsets from temporary memory\n")); 2094 } 2095 if (matchBlock.endOffsetTop > offsetCount) 2096 matchBlock.offsetOverflow = true; 2097 2098 DPRINTF(("Freeing temporary memory\n")); 2099 delete [] matchBlock.offsetVector; 2100 } 2101 2102 returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2; 2103 2104 if (offsetCount < 2) 2105 returnCode = 0; 2106 else { 2107 offsets[0] = startMatch - matchBlock.startSubject; 2108 offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject; 2109 } 2110 2111 DPRINTF((">>>> returning %d\n", returnCode)); 2112 return returnCode; 2113 } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject); 2114 2115 if (usingTemporaryOffsets) { 2116 DPRINTF(("Freeing temporary memory\n")); 2117 delete [] matchBlock.offsetVector; 2118 } 2119 2120 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 2121 return JSRegExpErrorNoMatch; 2122 } 2123 2124 #if REGEXP_HISTOGRAM 2125 2126 class CompareHistogramEntries { 2127 public: 2128 bool operator()(const pair<UString, double>& a, const pair<UString, double>& b) 2129 { 2130 if (a.second == b.second) 2131 return a.first < b.first; 2132 return a.second < b.second; 2133 } 2134 }; 2135 2136 Histogram::~Histogram() 2137 { 2138 Vector<pair<UString, double> > values; 2139 Map::iterator end = times.end(); 2140 for (Map::iterator it = times.begin(); it != end; ++it) 2141 values.append(*it); 2142 sort(values.begin(), values.end(), CompareHistogramEntries()); 2143 size_t size = values.size(); 2144 printf("Regular Expressions, sorted by time spent evaluating them:\n"); 2145 for (size_t i = 0; i < size; ++i) 2146 printf(" %f - %s\n", values[size - i - 1].second, values[size - i - 1].first.UTF8String().c_str()); 2147 } 2148 2149 void Histogram::add(const JSRegExp* re, double elapsedTime) 2150 { 2151 UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength); 2152 if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption) 2153 string += " (multi-line, ignore case)"; 2154 else { 2155 if (re->options & IgnoreCaseOption) 2156 string += " (ignore case)"; 2157 if (re->options & MatchAcrossMultipleLinesOption) 2158 string += " (multi-line)"; 2159 } 2160 pair<Map::iterator, bool> result = times.add(string.rep(), elapsedTime); 2161 if (!result.second) 2162 result.first->second += elapsedTime; 2163 } 2164 2165 HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re) 2166 : m_re(re) 2167 , m_startTime(currentTimeMS()) 2168 { 2169 } 2170 2171 HistogramTimeLogger::~HistogramTimeLogger() 2172 { 2173 static Histogram histogram; 2174 histogram.add(m_re, currentTimeMS() - m_startTime); 2175 } 2176 2177 #endif 2178