1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga (pvarga (at) inf.u-szeged.hu), University of Szeged 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "YarrInterpreter.h" 29 30 #include "Yarr.h" 31 #include <wtf/BumpPointerAllocator.h> 32 33 #ifndef NDEBUG 34 #include <stdio.h> 35 #endif 36 37 using namespace WTF; 38 39 namespace JSC { namespace Yarr { 40 41 class Interpreter { 42 public: 43 struct ParenthesesDisjunctionContext; 44 45 struct BackTrackInfoPatternCharacter { 46 uintptr_t matchAmount; 47 }; 48 struct BackTrackInfoCharacterClass { 49 uintptr_t matchAmount; 50 }; 51 struct BackTrackInfoBackReference { 52 uintptr_t begin; // Not really needed for greedy quantifiers. 53 uintptr_t matchAmount; // Not really needed for fixed quantifiers. 54 }; 55 struct BackTrackInfoAlternative { 56 uintptr_t offset; 57 }; 58 struct BackTrackInfoParentheticalAssertion { 59 uintptr_t begin; 60 }; 61 struct BackTrackInfoParenthesesOnce { 62 uintptr_t begin; 63 }; 64 struct BackTrackInfoParenthesesTerminal { 65 uintptr_t begin; 66 }; 67 struct BackTrackInfoParentheses { 68 uintptr_t matchAmount; 69 ParenthesesDisjunctionContext* lastContext; 70 }; 71 72 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) 73 { 74 context->next = backTrack->lastContext; 75 backTrack->lastContext = context; 76 ++backTrack->matchAmount; 77 } 78 79 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) 80 { 81 ASSERT(backTrack->matchAmount); 82 ASSERT(backTrack->lastContext); 83 backTrack->lastContext = backTrack->lastContext->next; 84 --backTrack->matchAmount; 85 } 86 87 struct DisjunctionContext 88 { 89 DisjunctionContext() 90 : term(0) 91 { 92 } 93 94 void* operator new(size_t, void* where) 95 { 96 return where; 97 } 98 99 int term; 100 unsigned matchBegin; 101 unsigned matchEnd; 102 uintptr_t frame[1]; 103 }; 104 105 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) 106 { 107 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); 108 allocatorPool = allocatorPool->ensureCapacity(size); 109 if (!allocatorPool) 110 CRASH(); 111 return new(allocatorPool->alloc(size)) DisjunctionContext(); 112 } 113 114 void freeDisjunctionContext(DisjunctionContext* context) 115 { 116 allocatorPool = allocatorPool->dealloc(context); 117 } 118 119 struct ParenthesesDisjunctionContext 120 { 121 ParenthesesDisjunctionContext(int* output, ByteTerm& term) 122 : next(0) 123 { 124 unsigned firstSubpatternId = term.atom.subpatternId; 125 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; 126 127 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { 128 subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; 129 output[(firstSubpatternId << 1) + i] = -1; 130 } 131 132 new(getDisjunctionContext(term)) DisjunctionContext(); 133 } 134 135 void* operator new(size_t, void* where) 136 { 137 return where; 138 } 139 140 void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) 141 { 142 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) 143 output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; 144 } 145 146 DisjunctionContext* getDisjunctionContext(ByteTerm& term) 147 { 148 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); 149 } 150 151 ParenthesesDisjunctionContext* next; 152 int subpatternBackup[1]; 153 }; 154 155 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) 156 { 157 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); 158 allocatorPool = allocatorPool->ensureCapacity(size); 159 if (!allocatorPool) 160 CRASH(); 161 return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); 162 } 163 164 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) 165 { 166 allocatorPool = allocatorPool->dealloc(context); 167 } 168 169 class InputStream { 170 public: 171 InputStream(const UChar* input, unsigned start, unsigned length) 172 : input(input) 173 , pos(start) 174 , length(length) 175 { 176 } 177 178 void next() 179 { 180 ++pos; 181 } 182 183 void rewind(unsigned amount) 184 { 185 ASSERT(pos >= amount); 186 pos -= amount; 187 } 188 189 int read() 190 { 191 ASSERT(pos < length); 192 if (pos < length) 193 return input[pos]; 194 return -1; 195 } 196 197 int readPair() 198 { 199 ASSERT(pos + 1 < length); 200 return input[pos] | input[pos + 1] << 16; 201 } 202 203 int readChecked(int position) 204 { 205 ASSERT(position < 0); 206 ASSERT(static_cast<unsigned>(-position) <= pos); 207 unsigned p = pos + position; 208 ASSERT(p < length); 209 return input[p]; 210 } 211 212 int reread(unsigned from) 213 { 214 ASSERT(from < length); 215 return input[from]; 216 } 217 218 int prev() 219 { 220 ASSERT(!(pos > length)); 221 if (pos && length) 222 return input[pos - 1]; 223 return -1; 224 } 225 226 unsigned getPos() 227 { 228 return pos; 229 } 230 231 void setPos(unsigned p) 232 { 233 pos = p; 234 } 235 236 bool atStart() 237 { 238 return pos == 0; 239 } 240 241 bool atEnd() 242 { 243 return pos == length; 244 } 245 246 bool checkInput(int count) 247 { 248 if ((pos + count) <= length) { 249 pos += count; 250 return true; 251 } 252 return false; 253 } 254 255 void uncheckInput(int count) 256 { 257 pos -= count; 258 } 259 260 bool atStart(int position) 261 { 262 return (pos + position) == 0; 263 } 264 265 bool atEnd(int position) 266 { 267 return (pos + position) == length; 268 } 269 270 bool isNotAvailableInput(int position) 271 { 272 return (pos + position) > length; 273 } 274 275 private: 276 const UChar* input; 277 unsigned pos; 278 unsigned length; 279 }; 280 281 bool testCharacterClass(CharacterClass* characterClass, int ch) 282 { 283 if (ch & 0xFF80) { 284 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) 285 if (ch == characterClass->m_matchesUnicode[i]) 286 return true; 287 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) 288 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) 289 return true; 290 } else { 291 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) 292 if (ch == characterClass->m_matches[i]) 293 return true; 294 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) 295 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) 296 return true; 297 } 298 299 return false; 300 } 301 302 bool checkCharacter(int testChar, int inputPosition) 303 { 304 return testChar == input.readChecked(inputPosition); 305 } 306 307 bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) 308 { 309 int ch = input.readChecked(inputPosition); 310 return (loChar == ch) || (hiChar == ch); 311 } 312 313 bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) 314 { 315 bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); 316 return invert ? !match : match; 317 } 318 319 bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) 320 { 321 int matchSize = matchEnd - matchBegin; 322 323 if (!input.checkInput(matchSize)) 324 return false; 325 326 if (pattern->m_ignoreCase) { 327 for (int i = 0; i < matchSize; ++i) { 328 int ch = input.reread(matchBegin + i); 329 330 int lo = Unicode::toLower(ch); 331 int hi = Unicode::toUpper(ch); 332 333 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) { 334 input.uncheckInput(matchSize); 335 return false; 336 } 337 } 338 } else { 339 for (int i = 0; i < matchSize; ++i) { 340 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { 341 input.uncheckInput(matchSize); 342 return false; 343 } 344 } 345 } 346 347 return true; 348 } 349 350 bool matchAssertionBOL(ByteTerm& term) 351 { 352 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); 353 } 354 355 bool matchAssertionEOL(ByteTerm& term) 356 { 357 if (term.inputPosition) 358 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); 359 360 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); 361 } 362 363 bool matchAssertionWordBoundary(ByteTerm& term) 364 { 365 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); 366 bool readIsWordchar; 367 if (term.inputPosition) 368 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); 369 else 370 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); 371 372 bool wordBoundary = prevIsWordchar != readIsWordchar; 373 return term.invert() ? !wordBoundary : wordBoundary; 374 } 375 376 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) 377 { 378 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 379 380 switch (term.atom.quantityType) { 381 case QuantifierFixedCount: 382 break; 383 384 case QuantifierGreedy: 385 if (backTrack->matchAmount) { 386 --backTrack->matchAmount; 387 input.uncheckInput(1); 388 return true; 389 } 390 break; 391 392 case QuantifierNonGreedy: 393 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 394 ++backTrack->matchAmount; 395 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) 396 return true; 397 } 398 input.uncheckInput(backTrack->matchAmount); 399 break; 400 } 401 402 return false; 403 } 404 405 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) 406 { 407 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 408 409 switch (term.atom.quantityType) { 410 case QuantifierFixedCount: 411 break; 412 413 case QuantifierGreedy: 414 if (backTrack->matchAmount) { 415 --backTrack->matchAmount; 416 input.uncheckInput(1); 417 return true; 418 } 419 break; 420 421 case QuantifierNonGreedy: 422 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 423 ++backTrack->matchAmount; 424 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) 425 return true; 426 } 427 input.uncheckInput(backTrack->matchAmount); 428 break; 429 } 430 431 return false; 432 } 433 434 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) 435 { 436 ASSERT(term.type == ByteTerm::TypeCharacterClass); 437 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 438 439 switch (term.atom.quantityType) { 440 case QuantifierFixedCount: { 441 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 442 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) 443 return false; 444 } 445 return true; 446 } 447 448 case QuantifierGreedy: { 449 unsigned matchAmount = 0; 450 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 451 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { 452 input.uncheckInput(1); 453 break; 454 } 455 ++matchAmount; 456 } 457 backTrack->matchAmount = matchAmount; 458 459 return true; 460 } 461 462 case QuantifierNonGreedy: 463 backTrack->matchAmount = 0; 464 return true; 465 } 466 467 ASSERT_NOT_REACHED(); 468 return false; 469 } 470 471 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) 472 { 473 ASSERT(term.type == ByteTerm::TypeCharacterClass); 474 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 475 476 switch (term.atom.quantityType) { 477 case QuantifierFixedCount: 478 break; 479 480 case QuantifierGreedy: 481 if (backTrack->matchAmount) { 482 --backTrack->matchAmount; 483 input.uncheckInput(1); 484 return true; 485 } 486 break; 487 488 case QuantifierNonGreedy: 489 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 490 ++backTrack->matchAmount; 491 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) 492 return true; 493 } 494 input.uncheckInput(backTrack->matchAmount); 495 break; 496 } 497 498 return false; 499 } 500 501 bool matchBackReference(ByteTerm& term, DisjunctionContext* context) 502 { 503 ASSERT(term.type == ByteTerm::TypeBackReference); 504 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); 505 506 int matchBegin = output[(term.atom.subpatternId << 1)]; 507 int matchEnd = output[(term.atom.subpatternId << 1) + 1]; 508 509 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. 510 // In this case the result of match is empty string like when it references to a parentheses with zero-width match. 511 // Eg.: /(a\1)/ 512 if (matchEnd == -1) 513 return true; 514 515 ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); 516 517 if (matchBegin == matchEnd) 518 return true; 519 520 switch (term.atom.quantityType) { 521 case QuantifierFixedCount: { 522 backTrack->begin = input.getPos(); 523 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 524 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { 525 input.setPos(backTrack->begin); 526 return false; 527 } 528 } 529 return true; 530 } 531 532 case QuantifierGreedy: { 533 unsigned matchAmount = 0; 534 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) 535 ++matchAmount; 536 backTrack->matchAmount = matchAmount; 537 return true; 538 } 539 540 case QuantifierNonGreedy: 541 backTrack->begin = input.getPos(); 542 backTrack->matchAmount = 0; 543 return true; 544 } 545 546 ASSERT_NOT_REACHED(); 547 return false; 548 } 549 550 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) 551 { 552 ASSERT(term.type == ByteTerm::TypeBackReference); 553 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); 554 555 int matchBegin = output[(term.atom.subpatternId << 1)]; 556 int matchEnd = output[(term.atom.subpatternId << 1) + 1]; 557 ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); 558 559 if (matchBegin == matchEnd) 560 return false; 561 562 switch (term.atom.quantityType) { 563 case QuantifierFixedCount: 564 // for quantityCount == 1, could rewind. 565 input.setPos(backTrack->begin); 566 break; 567 568 case QuantifierGreedy: 569 if (backTrack->matchAmount) { 570 --backTrack->matchAmount; 571 input.rewind(matchEnd - matchBegin); 572 return true; 573 } 574 break; 575 576 case QuantifierNonGreedy: 577 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { 578 ++backTrack->matchAmount; 579 return true; 580 } 581 input.setPos(backTrack->begin); 582 break; 583 } 584 585 return false; 586 } 587 588 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) 589 { 590 if (term.capture()) { 591 unsigned subpatternId = term.atom.subpatternId; 592 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; 593 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; 594 } 595 } 596 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) 597 { 598 unsigned firstSubpatternId = term.atom.subpatternId; 599 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; 600 context->restoreOutput(output, firstSubpatternId, count); 601 } 602 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) 603 { 604 while (backTrack->matchAmount) { 605 ParenthesesDisjunctionContext* context = backTrack->lastContext; 606 607 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); 608 if (result == JSRegExpMatch) 609 return JSRegExpMatch; 610 611 resetMatches(term, context); 612 popParenthesesDisjunctionContext(backTrack); 613 freeParenthesesDisjunctionContext(context); 614 615 if (result != JSRegExpNoMatch) 616 return result; 617 } 618 619 return JSRegExpNoMatch; 620 } 621 622 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) 623 { 624 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 625 ASSERT(term.atom.quantityCount == 1); 626 627 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 628 629 switch (term.atom.quantityType) { 630 case QuantifierGreedy: { 631 // set this speculatively; if we get to the parens end this will be true. 632 backTrack->begin = input.getPos(); 633 break; 634 } 635 case QuantifierNonGreedy: { 636 backTrack->begin = notFound; 637 context->term += term.atom.parenthesesWidth; 638 return true; 639 } 640 case QuantifierFixedCount: 641 break; 642 } 643 644 if (term.capture()) { 645 unsigned subpatternId = term.atom.subpatternId; 646 output[(subpatternId << 1)] = input.getPos() + term.inputPosition; 647 } 648 649 return true; 650 } 651 652 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) 653 { 654 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); 655 ASSERT(term.atom.quantityCount == 1); 656 657 if (term.capture()) { 658 unsigned subpatternId = term.atom.subpatternId; 659 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; 660 } 661 662 if (term.atom.quantityType == QuantifierFixedCount) 663 return true; 664 665 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 666 return backTrack->begin != input.getPos(); 667 } 668 669 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) 670 { 671 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 672 ASSERT(term.atom.quantityCount == 1); 673 674 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 675 676 if (term.capture()) { 677 unsigned subpatternId = term.atom.subpatternId; 678 output[(subpatternId << 1)] = -1; 679 output[(subpatternId << 1) + 1] = -1; 680 } 681 682 switch (term.atom.quantityType) { 683 case QuantifierGreedy: 684 // if we backtrack to this point, there is another chance - try matching nothing. 685 ASSERT(backTrack->begin != notFound); 686 backTrack->begin = notFound; 687 context->term += term.atom.parenthesesWidth; 688 return true; 689 case QuantifierNonGreedy: 690 ASSERT(backTrack->begin != notFound); 691 case QuantifierFixedCount: 692 break; 693 } 694 695 return false; 696 } 697 698 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) 699 { 700 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); 701 ASSERT(term.atom.quantityCount == 1); 702 703 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 704 705 switch (term.atom.quantityType) { 706 case QuantifierGreedy: 707 if (backTrack->begin == notFound) { 708 context->term -= term.atom.parenthesesWidth; 709 return false; 710 } 711 case QuantifierNonGreedy: 712 if (backTrack->begin == notFound) { 713 backTrack->begin = input.getPos(); 714 if (term.capture()) { 715 // Technically this access to inputPosition should be accessing the begin term's 716 // inputPosition, but for repeats other than fixed these values should be 717 // the same anyway! (We don't pre-check for greedy or non-greedy matches.) 718 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 719 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); 720 unsigned subpatternId = term.atom.subpatternId; 721 output[subpatternId << 1] = input.getPos() + term.inputPosition; 722 } 723 context->term -= term.atom.parenthesesWidth; 724 return true; 725 } 726 case QuantifierFixedCount: 727 break; 728 } 729 730 return false; 731 } 732 733 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) 734 { 735 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); 736 ASSERT(term.atom.quantityType == QuantifierGreedy); 737 ASSERT(term.atom.quantityCount == quantifyInfinite); 738 ASSERT(!term.capture()); 739 740 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); 741 backTrack->begin = input.getPos(); 742 return true; 743 } 744 745 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) 746 { 747 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); 748 749 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); 750 // Empty match is a failed match. 751 if (backTrack->begin == input.getPos()) 752 return false; 753 754 // Successful match! Okay, what's next? - loop around and try to match moar! 755 context->term -= (term.atom.parenthesesWidth + 1); 756 return true; 757 } 758 759 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) 760 { 761 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); 762 ASSERT(term.atom.quantityType == QuantifierGreedy); 763 ASSERT(term.atom.quantityCount == quantifyInfinite); 764 ASSERT(!term.capture()); 765 766 // If we backtrack to this point, we have failed to match this iteration of the parens. 767 // Since this is greedy / zero minimum a failed is also accepted as a match! 768 context->term += term.atom.parenthesesWidth; 769 return true; 770 } 771 772 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) 773 { 774 // 'Terminal' parentheses are at the end of the regex, and as such a match past end 775 // should always be returned as a successful match - we should never backtrack to here. 776 ASSERT_NOT_REACHED(); 777 return false; 778 } 779 780 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) 781 { 782 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); 783 ASSERT(term.atom.quantityCount == 1); 784 785 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 786 787 backTrack->begin = input.getPos(); 788 return true; 789 } 790 791 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) 792 { 793 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); 794 ASSERT(term.atom.quantityCount == 1); 795 796 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 797 798 input.setPos(backTrack->begin); 799 800 // We've reached the end of the parens; if they are inverted, this is failure. 801 if (term.invert()) { 802 context->term -= term.atom.parenthesesWidth; 803 return false; 804 } 805 806 return true; 807 } 808 809 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) 810 { 811 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); 812 ASSERT(term.atom.quantityCount == 1); 813 814 // We've failed to match parens; if they are inverted, this is win! 815 if (term.invert()) { 816 context->term += term.atom.parenthesesWidth; 817 return true; 818 } 819 820 return false; 821 } 822 823 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) 824 { 825 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); 826 ASSERT(term.atom.quantityCount == 1); 827 828 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 829 830 input.setPos(backTrack->begin); 831 832 context->term -= term.atom.parenthesesWidth; 833 return false; 834 } 835 836 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) 837 { 838 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); 839 840 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); 841 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; 842 843 backTrack->matchAmount = 0; 844 backTrack->lastContext = 0; 845 846 switch (term.atom.quantityType) { 847 case QuantifierFixedCount: { 848 // While we haven't yet reached our fixed limit, 849 while (backTrack->matchAmount < term.atom.quantityCount) { 850 // Try to do a match, and it it succeeds, add it to the list. 851 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 852 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 853 if (result == JSRegExpMatch) 854 appendParenthesesDisjunctionContext(backTrack, context); 855 else { 856 // The match failed; try to find an alternate point to carry on from. 857 resetMatches(term, context); 858 freeParenthesesDisjunctionContext(context); 859 860 if (result == JSRegExpNoMatch) { 861 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); 862 if (backtrackResult != JSRegExpMatch) 863 return backtrackResult; 864 } else 865 return result; 866 } 867 } 868 869 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 870 ParenthesesDisjunctionContext* context = backTrack->lastContext; 871 recordParenthesesMatch(term, context); 872 return JSRegExpMatch; 873 } 874 875 case QuantifierGreedy: { 876 while (backTrack->matchAmount < term.atom.quantityCount) { 877 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 878 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 879 if (result == JSRegExpMatch) 880 appendParenthesesDisjunctionContext(backTrack, context); 881 else { 882 resetMatches(term, context); 883 freeParenthesesDisjunctionContext(context); 884 885 if (result != JSRegExpNoMatch) 886 return result; 887 888 break; 889 } 890 } 891 892 if (backTrack->matchAmount) { 893 ParenthesesDisjunctionContext* context = backTrack->lastContext; 894 recordParenthesesMatch(term, context); 895 } 896 return JSRegExpMatch; 897 } 898 899 case QuantifierNonGreedy: 900 return JSRegExpMatch; 901 } 902 903 ASSERT_NOT_REACHED(); 904 return JSRegExpErrorNoMatch; 905 } 906 907 // Rules for backtracking differ depending on whether this is greedy or non-greedy. 908 // 909 // Greedy matches never should try just adding more - you should already have done 910 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where 911 // you backtrack an item off the list needs checking, since we'll never have matched 912 // the one less case. Tracking forwards, still add as much as possible. 913 // 914 // Non-greedy, we've already done the one less case, so don't match on popping. 915 // We haven't done the one more case, so always try to add that. 916 // 917 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) 918 { 919 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); 920 921 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); 922 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; 923 924 switch (term.atom.quantityType) { 925 case QuantifierFixedCount: { 926 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 927 928 ParenthesesDisjunctionContext* context = 0; 929 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); 930 931 if (result != JSRegExpMatch) 932 return result; 933 934 // While we haven't yet reached our fixed limit, 935 while (backTrack->matchAmount < term.atom.quantityCount) { 936 // Try to do a match, and it it succeeds, add it to the list. 937 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 938 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 939 940 if (result == JSRegExpMatch) 941 appendParenthesesDisjunctionContext(backTrack, context); 942 else { 943 // The match failed; try to find an alternate point to carry on from. 944 resetMatches(term, context); 945 freeParenthesesDisjunctionContext(context); 946 947 if (result == JSRegExpNoMatch) { 948 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); 949 if (backtrackResult != JSRegExpMatch) 950 return backtrackResult; 951 } else 952 return result; 953 } 954 } 955 956 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 957 context = backTrack->lastContext; 958 recordParenthesesMatch(term, context); 959 return JSRegExpMatch; 960 } 961 962 case QuantifierGreedy: { 963 if (!backTrack->matchAmount) 964 return JSRegExpNoMatch; 965 966 ParenthesesDisjunctionContext* context = backTrack->lastContext; 967 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); 968 if (result == JSRegExpMatch) { 969 while (backTrack->matchAmount < term.atom.quantityCount) { 970 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 971 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 972 if (parenthesesResult == JSRegExpMatch) 973 appendParenthesesDisjunctionContext(backTrack, context); 974 else { 975 resetMatches(term, context); 976 freeParenthesesDisjunctionContext(context); 977 978 if (parenthesesResult != JSRegExpNoMatch) 979 return parenthesesResult; 980 981 break; 982 } 983 } 984 } else { 985 resetMatches(term, context); 986 popParenthesesDisjunctionContext(backTrack); 987 freeParenthesesDisjunctionContext(context); 988 989 if (result != JSRegExpNoMatch) 990 return result; 991 } 992 993 if (backTrack->matchAmount) { 994 ParenthesesDisjunctionContext* context = backTrack->lastContext; 995 recordParenthesesMatch(term, context); 996 } 997 return JSRegExpMatch; 998 } 999 1000 case QuantifierNonGreedy: { 1001 // If we've not reached the limit, try to add one more match. 1002 if (backTrack->matchAmount < term.atom.quantityCount) { 1003 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 1004 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); 1005 if (result == JSRegExpMatch) { 1006 appendParenthesesDisjunctionContext(backTrack, context); 1007 recordParenthesesMatch(term, context); 1008 return JSRegExpMatch; 1009 } 1010 1011 resetMatches(term, context); 1012 freeParenthesesDisjunctionContext(context); 1013 1014 if (result != JSRegExpNoMatch) 1015 return result; 1016 } 1017 1018 // Nope - okay backtrack looking for an alternative. 1019 while (backTrack->matchAmount) { 1020 ParenthesesDisjunctionContext* context = backTrack->lastContext; 1021 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); 1022 if (result == JSRegExpMatch) { 1023 // successful backtrack! we're back in the game! 1024 if (backTrack->matchAmount) { 1025 context = backTrack->lastContext; 1026 recordParenthesesMatch(term, context); 1027 } 1028 return JSRegExpMatch; 1029 } 1030 1031 // pop a match off the stack 1032 resetMatches(term, context); 1033 popParenthesesDisjunctionContext(backTrack); 1034 freeParenthesesDisjunctionContext(context); 1035 1036 return result; 1037 } 1038 1039 return JSRegExpNoMatch; 1040 } 1041 } 1042 1043 ASSERT_NOT_REACHED(); 1044 return JSRegExpErrorNoMatch; 1045 } 1046 1047 void lookupForBeginChars() 1048 { 1049 int character; 1050 bool firstSingleCharFound; 1051 1052 while (true) { 1053 if (input.isNotAvailableInput(2)) 1054 return; 1055 1056 firstSingleCharFound = false; 1057 1058 character = input.readPair(); 1059 1060 for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) { 1061 BeginChar bc = pattern->m_beginChars[i]; 1062 1063 if (!firstSingleCharFound && bc.value <= 0xFFFF) { 1064 firstSingleCharFound = true; 1065 character &= 0xFFFF; 1066 } 1067 1068 if ((character | bc.mask) == bc.value) 1069 return; 1070 } 1071 1072 input.next(); 1073 } 1074 } 1075 1076 #define MATCH_NEXT() { ++context->term; goto matchAgain; } 1077 #define BACKTRACK() { --context->term; goto backtrack; } 1078 #define currentTerm() (disjunction->terms[context->term]) 1079 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false) 1080 { 1081 if (!--remainingMatchCount) 1082 return JSRegExpErrorHitLimit; 1083 1084 if (btrack) 1085 BACKTRACK(); 1086 1087 if (pattern->m_containsBeginChars && isBody) 1088 lookupForBeginChars(); 1089 1090 context->matchBegin = input.getPos(); 1091 context->term = 0; 1092 1093 matchAgain: 1094 ASSERT(context->term < static_cast<int>(disjunction->terms.size())); 1095 1096 switch (currentTerm().type) { 1097 case ByteTerm::TypeSubpatternBegin: 1098 MATCH_NEXT(); 1099 case ByteTerm::TypeSubpatternEnd: 1100 context->matchEnd = input.getPos(); 1101 return JSRegExpMatch; 1102 1103 case ByteTerm::TypeBodyAlternativeBegin: 1104 MATCH_NEXT(); 1105 case ByteTerm::TypeBodyAlternativeDisjunction: 1106 case ByteTerm::TypeBodyAlternativeEnd: 1107 context->matchEnd = input.getPos(); 1108 return JSRegExpMatch; 1109 1110 case ByteTerm::TypeAlternativeBegin: 1111 MATCH_NEXT(); 1112 case ByteTerm::TypeAlternativeDisjunction: 1113 case ByteTerm::TypeAlternativeEnd: { 1114 int offset = currentTerm().alternative.end; 1115 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); 1116 backTrack->offset = offset; 1117 context->term += offset; 1118 MATCH_NEXT(); 1119 } 1120 1121 case ByteTerm::TypeAssertionBOL: 1122 if (matchAssertionBOL(currentTerm())) 1123 MATCH_NEXT(); 1124 BACKTRACK(); 1125 case ByteTerm::TypeAssertionEOL: 1126 if (matchAssertionEOL(currentTerm())) 1127 MATCH_NEXT(); 1128 BACKTRACK(); 1129 case ByteTerm::TypeAssertionWordBoundary: 1130 if (matchAssertionWordBoundary(currentTerm())) 1131 MATCH_NEXT(); 1132 BACKTRACK(); 1133 1134 case ByteTerm::TypePatternCharacterOnce: 1135 case ByteTerm::TypePatternCharacterFixed: { 1136 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1137 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) 1138 BACKTRACK(); 1139 } 1140 MATCH_NEXT(); 1141 } 1142 case ByteTerm::TypePatternCharacterGreedy: { 1143 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1144 unsigned matchAmount = 0; 1145 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { 1146 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { 1147 input.uncheckInput(1); 1148 break; 1149 } 1150 ++matchAmount; 1151 } 1152 backTrack->matchAmount = matchAmount; 1153 1154 MATCH_NEXT(); 1155 } 1156 case ByteTerm::TypePatternCharacterNonGreedy: { 1157 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1158 backTrack->matchAmount = 0; 1159 MATCH_NEXT(); 1160 } 1161 1162 case ByteTerm::TypePatternCasedCharacterOnce: 1163 case ByteTerm::TypePatternCasedCharacterFixed: { 1164 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1165 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) 1166 BACKTRACK(); 1167 } 1168 MATCH_NEXT(); 1169 } 1170 case ByteTerm::TypePatternCasedCharacterGreedy: { 1171 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1172 unsigned matchAmount = 0; 1173 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { 1174 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { 1175 input.uncheckInput(1); 1176 break; 1177 } 1178 ++matchAmount; 1179 } 1180 backTrack->matchAmount = matchAmount; 1181 1182 MATCH_NEXT(); 1183 } 1184 case ByteTerm::TypePatternCasedCharacterNonGreedy: { 1185 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1186 backTrack->matchAmount = 0; 1187 MATCH_NEXT(); 1188 } 1189 1190 case ByteTerm::TypeCharacterClass: 1191 if (matchCharacterClass(currentTerm(), context)) 1192 MATCH_NEXT(); 1193 BACKTRACK(); 1194 case ByteTerm::TypeBackReference: 1195 if (matchBackReference(currentTerm(), context)) 1196 MATCH_NEXT(); 1197 BACKTRACK(); 1198 case ByteTerm::TypeParenthesesSubpattern: { 1199 JSRegExpResult result = matchParentheses(currentTerm(), context); 1200 1201 if (result == JSRegExpMatch) { 1202 MATCH_NEXT(); 1203 } else if (result != JSRegExpNoMatch) 1204 return result; 1205 1206 BACKTRACK(); 1207 } 1208 case ByteTerm::TypeParenthesesSubpatternOnceBegin: 1209 if (matchParenthesesOnceBegin(currentTerm(), context)) 1210 MATCH_NEXT(); 1211 BACKTRACK(); 1212 case ByteTerm::TypeParenthesesSubpatternOnceEnd: 1213 if (matchParenthesesOnceEnd(currentTerm(), context)) 1214 MATCH_NEXT(); 1215 BACKTRACK(); 1216 case ByteTerm::TypeParenthesesSubpatternTerminalBegin: 1217 if (matchParenthesesTerminalBegin(currentTerm(), context)) 1218 MATCH_NEXT(); 1219 BACKTRACK(); 1220 case ByteTerm::TypeParenthesesSubpatternTerminalEnd: 1221 if (matchParenthesesTerminalEnd(currentTerm(), context)) 1222 MATCH_NEXT(); 1223 BACKTRACK(); 1224 case ByteTerm::TypeParentheticalAssertionBegin: 1225 if (matchParentheticalAssertionBegin(currentTerm(), context)) 1226 MATCH_NEXT(); 1227 BACKTRACK(); 1228 case ByteTerm::TypeParentheticalAssertionEnd: 1229 if (matchParentheticalAssertionEnd(currentTerm(), context)) 1230 MATCH_NEXT(); 1231 BACKTRACK(); 1232 1233 case ByteTerm::TypeCheckInput: 1234 if (input.checkInput(currentTerm().checkInputCount)) 1235 MATCH_NEXT(); 1236 BACKTRACK(); 1237 1238 case ByteTerm::TypeUncheckInput: 1239 input.uncheckInput(currentTerm().checkInputCount); 1240 MATCH_NEXT(); 1241 } 1242 1243 // We should never fall-through to here. 1244 ASSERT_NOT_REACHED(); 1245 1246 backtrack: 1247 ASSERT(context->term < static_cast<int>(disjunction->terms.size())); 1248 1249 switch (currentTerm().type) { 1250 case ByteTerm::TypeSubpatternBegin: 1251 return JSRegExpNoMatch; 1252 case ByteTerm::TypeSubpatternEnd: 1253 ASSERT_NOT_REACHED(); 1254 1255 case ByteTerm::TypeBodyAlternativeBegin: 1256 case ByteTerm::TypeBodyAlternativeDisjunction: { 1257 int offset = currentTerm().alternative.next; 1258 context->term += offset; 1259 if (offset > 0) 1260 MATCH_NEXT(); 1261 1262 if (input.atEnd()) 1263 return JSRegExpNoMatch; 1264 1265 input.next(); 1266 1267 if (pattern->m_containsBeginChars && isBody) 1268 lookupForBeginChars(); 1269 1270 context->matchBegin = input.getPos(); 1271 1272 if (currentTerm().alternative.onceThrough) 1273 context->term += currentTerm().alternative.next; 1274 1275 MATCH_NEXT(); 1276 } 1277 case ByteTerm::TypeBodyAlternativeEnd: 1278 ASSERT_NOT_REACHED(); 1279 1280 case ByteTerm::TypeAlternativeBegin: 1281 case ByteTerm::TypeAlternativeDisjunction: { 1282 int offset = currentTerm().alternative.next; 1283 context->term += offset; 1284 if (offset > 0) 1285 MATCH_NEXT(); 1286 BACKTRACK(); 1287 } 1288 case ByteTerm::TypeAlternativeEnd: { 1289 // We should never backtrack back into an alternative of the main body of the regex. 1290 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); 1291 unsigned offset = backTrack->offset; 1292 context->term -= offset; 1293 BACKTRACK(); 1294 } 1295 1296 case ByteTerm::TypeAssertionBOL: 1297 case ByteTerm::TypeAssertionEOL: 1298 case ByteTerm::TypeAssertionWordBoundary: 1299 BACKTRACK(); 1300 1301 case ByteTerm::TypePatternCharacterOnce: 1302 case ByteTerm::TypePatternCharacterFixed: 1303 case ByteTerm::TypePatternCharacterGreedy: 1304 case ByteTerm::TypePatternCharacterNonGreedy: 1305 if (backtrackPatternCharacter(currentTerm(), context)) 1306 MATCH_NEXT(); 1307 BACKTRACK(); 1308 case ByteTerm::TypePatternCasedCharacterOnce: 1309 case ByteTerm::TypePatternCasedCharacterFixed: 1310 case ByteTerm::TypePatternCasedCharacterGreedy: 1311 case ByteTerm::TypePatternCasedCharacterNonGreedy: 1312 if (backtrackPatternCasedCharacter(currentTerm(), context)) 1313 MATCH_NEXT(); 1314 BACKTRACK(); 1315 case ByteTerm::TypeCharacterClass: 1316 if (backtrackCharacterClass(currentTerm(), context)) 1317 MATCH_NEXT(); 1318 BACKTRACK(); 1319 case ByteTerm::TypeBackReference: 1320 if (backtrackBackReference(currentTerm(), context)) 1321 MATCH_NEXT(); 1322 BACKTRACK(); 1323 case ByteTerm::TypeParenthesesSubpattern: { 1324 JSRegExpResult result = backtrackParentheses(currentTerm(), context); 1325 1326 if (result == JSRegExpMatch) { 1327 MATCH_NEXT(); 1328 } else if (result != JSRegExpNoMatch) 1329 return result; 1330 1331 BACKTRACK(); 1332 } 1333 case ByteTerm::TypeParenthesesSubpatternOnceBegin: 1334 if (backtrackParenthesesOnceBegin(currentTerm(), context)) 1335 MATCH_NEXT(); 1336 BACKTRACK(); 1337 case ByteTerm::TypeParenthesesSubpatternOnceEnd: 1338 if (backtrackParenthesesOnceEnd(currentTerm(), context)) 1339 MATCH_NEXT(); 1340 BACKTRACK(); 1341 case ByteTerm::TypeParenthesesSubpatternTerminalBegin: 1342 if (backtrackParenthesesTerminalBegin(currentTerm(), context)) 1343 MATCH_NEXT(); 1344 BACKTRACK(); 1345 case ByteTerm::TypeParenthesesSubpatternTerminalEnd: 1346 if (backtrackParenthesesTerminalEnd(currentTerm(), context)) 1347 MATCH_NEXT(); 1348 BACKTRACK(); 1349 case ByteTerm::TypeParentheticalAssertionBegin: 1350 if (backtrackParentheticalAssertionBegin(currentTerm(), context)) 1351 MATCH_NEXT(); 1352 BACKTRACK(); 1353 case ByteTerm::TypeParentheticalAssertionEnd: 1354 if (backtrackParentheticalAssertionEnd(currentTerm(), context)) 1355 MATCH_NEXT(); 1356 BACKTRACK(); 1357 1358 case ByteTerm::TypeCheckInput: 1359 input.uncheckInput(currentTerm().checkInputCount); 1360 BACKTRACK(); 1361 1362 case ByteTerm::TypeUncheckInput: 1363 input.checkInput(currentTerm().checkInputCount); 1364 BACKTRACK(); 1365 } 1366 1367 ASSERT_NOT_REACHED(); 1368 return JSRegExpErrorNoMatch; 1369 } 1370 1371 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) 1372 { 1373 JSRegExpResult result = matchDisjunction(disjunction, context, btrack); 1374 1375 if (result == JSRegExpMatch) { 1376 while (context->matchBegin == context->matchEnd) { 1377 result = matchDisjunction(disjunction, context, true); 1378 if (result != JSRegExpMatch) 1379 return result; 1380 } 1381 return JSRegExpMatch; 1382 } 1383 1384 return result; 1385 } 1386 1387 int interpret() 1388 { 1389 allocatorPool = pattern->m_allocator->startAllocator(); 1390 if (!allocatorPool) 1391 CRASH(); 1392 1393 for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) 1394 output[i] = -1; 1395 1396 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); 1397 1398 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true); 1399 if (result == JSRegExpMatch) { 1400 output[0] = context->matchBegin; 1401 output[1] = context->matchEnd; 1402 } 1403 1404 freeDisjunctionContext(context); 1405 1406 pattern->m_allocator->stopAllocator(); 1407 1408 // RegExp.cpp currently expects all error to be converted to -1. 1409 ASSERT((result == JSRegExpMatch) == (output[0] != -1)); 1410 return output[0]; 1411 } 1412 1413 Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) 1414 : pattern(pattern) 1415 , output(output) 1416 , input(inputChar, start, length) 1417 , allocatorPool(0) 1418 , remainingMatchCount(matchLimit) 1419 { 1420 } 1421 1422 private: 1423 BytecodePattern* pattern; 1424 int* output; 1425 InputStream input; 1426 BumpPointerPool* allocatorPool; 1427 unsigned remainingMatchCount; 1428 }; 1429 1430 1431 1432 class ByteCompiler { 1433 struct ParenthesesStackEntry { 1434 unsigned beginTerm; 1435 unsigned savedAlternativeIndex; 1436 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) 1437 : beginTerm(beginTerm) 1438 , savedAlternativeIndex(savedAlternativeIndex) 1439 { 1440 } 1441 }; 1442 1443 public: 1444 ByteCompiler(YarrPattern& pattern) 1445 : m_pattern(pattern) 1446 { 1447 m_currentAlternativeIndex = 0; 1448 } 1449 1450 PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) 1451 { 1452 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); 1453 emitDisjunction(m_pattern.m_body); 1454 regexEnd(); 1455 1456 return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); 1457 } 1458 1459 void checkInput(unsigned count) 1460 { 1461 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); 1462 } 1463 1464 void uncheckInput(unsigned count) 1465 { 1466 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); 1467 } 1468 1469 void assertionBOL(int inputPosition) 1470 { 1471 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); 1472 } 1473 1474 void assertionEOL(int inputPosition) 1475 { 1476 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); 1477 } 1478 1479 void assertionWordBoundary(bool invert, int inputPosition) 1480 { 1481 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); 1482 } 1483 1484 void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1485 { 1486 if (m_pattern.m_ignoreCase) { 1487 UChar lo = Unicode::toLower(ch); 1488 UChar hi = Unicode::toUpper(ch); 1489 1490 if (lo != hi) { 1491 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); 1492 return; 1493 } 1494 } 1495 1496 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); 1497 } 1498 1499 void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1500 { 1501 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); 1502 1503 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; 1504 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; 1505 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1506 } 1507 1508 void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1509 { 1510 ASSERT(subpatternId); 1511 1512 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); 1513 1514 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; 1515 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; 1516 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1517 } 1518 1519 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) 1520 { 1521 int beginTerm = m_bodyDisjunction->terms.size(); 1522 1523 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); 1524 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1525 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1526 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1527 1528 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1529 m_currentAlternativeIndex = beginTerm + 1; 1530 } 1531 1532 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) 1533 { 1534 int beginTerm = m_bodyDisjunction->terms.size(); 1535 1536 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); 1537 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1538 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1539 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1540 1541 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1542 m_currentAlternativeIndex = beginTerm + 1; 1543 } 1544 1545 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) 1546 { 1547 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, 1548 // then fix this up at the end! - simplifying this should make it much clearer. 1549 // https://bugs.webkit.org/show_bug.cgi?id=50136 1550 1551 int beginTerm = m_bodyDisjunction->terms.size(); 1552 1553 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); 1554 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1555 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1556 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1557 1558 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1559 m_currentAlternativeIndex = beginTerm + 1; 1560 } 1561 1562 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) 1563 { 1564 int beginTerm = m_bodyDisjunction->terms.size(); 1565 1566 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); 1567 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1568 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1569 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1570 1571 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1572 m_currentAlternativeIndex = beginTerm + 1; 1573 } 1574 1575 void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1576 { 1577 unsigned beginTerm = popParenthesesStack(); 1578 closeAlternative(beginTerm + 1); 1579 unsigned endTerm = m_bodyDisjunction->terms.size(); 1580 1581 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); 1582 1583 bool invert = m_bodyDisjunction->terms[beginTerm].invert(); 1584 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; 1585 1586 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); 1587 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; 1588 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 1589 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; 1590 1591 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; 1592 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1593 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; 1594 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; 1595 } 1596 1597 unsigned popParenthesesStack() 1598 { 1599 ASSERT(m_parenthesesStack.size()); 1600 int stackEnd = m_parenthesesStack.size() - 1; 1601 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; 1602 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; 1603 m_parenthesesStack.shrink(stackEnd); 1604 1605 ASSERT(beginTerm < m_bodyDisjunction->terms.size()); 1606 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); 1607 1608 return beginTerm; 1609 } 1610 1611 #ifndef NDEBUG 1612 void dumpDisjunction(ByteDisjunction* disjunction) 1613 { 1614 printf("ByteDisjunction(%p):\n\t", disjunction); 1615 for (unsigned i = 0; i < disjunction->terms.size(); ++i) 1616 printf("{ %d } ", disjunction->terms[i].type); 1617 printf("\n"); 1618 } 1619 #endif 1620 1621 void closeAlternative(int beginTerm) 1622 { 1623 int origBeginTerm = beginTerm; 1624 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); 1625 int endIndex = m_bodyDisjunction->terms.size(); 1626 1627 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; 1628 1629 if (!m_bodyDisjunction->terms[beginTerm].alternative.next) 1630 m_bodyDisjunction->terms.remove(beginTerm); 1631 else { 1632 while (m_bodyDisjunction->terms[beginTerm].alternative.next) { 1633 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; 1634 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); 1635 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; 1636 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1637 } 1638 1639 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; 1640 1641 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); 1642 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; 1643 } 1644 } 1645 1646 void closeBodyAlternative() 1647 { 1648 int beginTerm = 0; 1649 int origBeginTerm = 0; 1650 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); 1651 int endIndex = m_bodyDisjunction->terms.size(); 1652 1653 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; 1654 1655 while (m_bodyDisjunction->terms[beginTerm].alternative.next) { 1656 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; 1657 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); 1658 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; 1659 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1660 } 1661 1662 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; 1663 1664 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); 1665 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; 1666 } 1667 1668 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) 1669 { 1670 unsigned beginTerm = popParenthesesStack(); 1671 closeAlternative(beginTerm + 1); 1672 unsigned endTerm = m_bodyDisjunction->terms.size(); 1673 1674 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 1675 1676 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; 1677 1678 bool capture = parenthesesBegin.capture(); 1679 unsigned subpatternId = parenthesesBegin.atom.subpatternId; 1680 1681 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; 1682 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); 1683 1684 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); 1685 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) 1686 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); 1687 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); 1688 1689 m_bodyDisjunction->terms.shrink(beginTerm); 1690 1691 m_allParenthesesInfo.append(parenthesesDisjunction); 1692 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition)); 1693 1694 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; 1695 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1696 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1697 } 1698 1699 void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1700 { 1701 unsigned beginTerm = popParenthesesStack(); 1702 closeAlternative(beginTerm + 1); 1703 unsigned endTerm = m_bodyDisjunction->terms.size(); 1704 1705 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 1706 1707 bool capture = m_bodyDisjunction->terms[beginTerm].capture(); 1708 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; 1709 1710 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); 1711 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; 1712 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 1713 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; 1714 1715 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; 1716 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1717 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; 1718 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; 1719 } 1720 1721 void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1722 { 1723 unsigned beginTerm = popParenthesesStack(); 1724 closeAlternative(beginTerm + 1); 1725 unsigned endTerm = m_bodyDisjunction->terms.size(); 1726 1727 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); 1728 1729 bool capture = m_bodyDisjunction->terms[beginTerm].capture(); 1730 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; 1731 1732 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); 1733 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; 1734 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 1735 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; 1736 1737 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; 1738 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1739 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; 1740 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; 1741 } 1742 1743 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) 1744 { 1745 m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); 1746 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); 1747 m_bodyDisjunction->terms[0].frameLocation = 0; 1748 m_currentAlternativeIndex = 0; 1749 } 1750 1751 void regexEnd() 1752 { 1753 closeBodyAlternative(); 1754 } 1755 1756 void alternativeBodyDisjunction(bool onceThrough) 1757 { 1758 int newAlternativeIndex = m_bodyDisjunction->terms.size(); 1759 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; 1760 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); 1761 1762 m_currentAlternativeIndex = newAlternativeIndex; 1763 } 1764 1765 void alternativeDisjunction() 1766 { 1767 int newAlternativeIndex = m_bodyDisjunction->terms.size(); 1768 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; 1769 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); 1770 1771 m_currentAlternativeIndex = newAlternativeIndex; 1772 } 1773 1774 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false) 1775 { 1776 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 1777 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; 1778 1779 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 1780 1781 if (alt) { 1782 if (disjunction == m_pattern.m_body) 1783 alternativeBodyDisjunction(alternative->onceThrough()); 1784 else 1785 alternativeDisjunction(); 1786 } 1787 1788 unsigned minimumSize = alternative->m_minimumSize; 1789 int countToCheck; 1790 1791 if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize) 1792 countToCheck = 0; 1793 else 1794 countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; 1795 1796 ASSERT(countToCheck >= 0); 1797 if (countToCheck) { 1798 checkInput(countToCheck); 1799 currentCountAlreadyChecked += countToCheck; 1800 } 1801 1802 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { 1803 PatternTerm& term = alternative->m_terms[i]; 1804 1805 switch (term.type) { 1806 case PatternTerm::TypeAssertionBOL: 1807 assertionBOL(term.inputPosition - currentCountAlreadyChecked); 1808 break; 1809 1810 case PatternTerm::TypeAssertionEOL: 1811 assertionEOL(term.inputPosition - currentCountAlreadyChecked); 1812 break; 1813 1814 case PatternTerm::TypeAssertionWordBoundary: 1815 assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked); 1816 break; 1817 1818 case PatternTerm::TypePatternCharacter: 1819 atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); 1820 break; 1821 1822 case PatternTerm::TypeCharacterClass: 1823 atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); 1824 break; 1825 1826 case PatternTerm::TypeBackReference: 1827 atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); 1828 break; 1829 1830 case PatternTerm::TypeForwardReference: 1831 break; 1832 1833 case PatternTerm::TypeParenthesesSubpattern: { 1834 unsigned disjunctionAlreadyCheckedCount = 0; 1835 if (term.quantityCount == 1 && !term.parentheses.isCopy) { 1836 unsigned alternativeFrameLocation = term.frameLocation; 1837 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. 1838 if (term.quantityType == QuantifierFixedCount) 1839 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; 1840 else 1841 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; 1842 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1843 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); 1844 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); 1845 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); 1846 } else if (term.parentheses.isTerminal) { 1847 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1848 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); 1849 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); 1850 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); 1851 } else { 1852 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1853 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); 1854 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); 1855 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); 1856 } 1857 break; 1858 } 1859 1860 case PatternTerm::TypeParentheticalAssertion: { 1861 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; 1862 1863 ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition)); 1864 int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; 1865 int uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; 1866 1867 if (uncheckAmount > 0) { 1868 uncheckInput(uncheckAmount); 1869 currentCountAlreadyChecked -= uncheckAmount; 1870 } else 1871 uncheckAmount = 0; 1872 1873 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); 1874 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true); 1875 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); 1876 if (uncheckAmount) { 1877 checkInput(uncheckAmount); 1878 currentCountAlreadyChecked += uncheckAmount; 1879 } 1880 break; 1881 } 1882 } 1883 } 1884 } 1885 } 1886 1887 private: 1888 YarrPattern& m_pattern; 1889 OwnPtr<ByteDisjunction> m_bodyDisjunction; 1890 unsigned m_currentAlternativeIndex; 1891 Vector<ParenthesesStackEntry> m_parenthesesStack; 1892 Vector<ByteDisjunction*> m_allParenthesesInfo; 1893 }; 1894 1895 PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) 1896 { 1897 return ByteCompiler(pattern).compile(allocator); 1898 } 1899 1900 int interpret(BytecodePattern* bytecode, const UChar* input, unsigned start, unsigned length, int* output) 1901 { 1902 return Interpreter(bytecode, output, input, start, length).interpret(); 1903 } 1904 1905 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); 1906 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); 1907 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); 1908 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); 1909 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); 1910 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); 1911 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); 1912 1913 1914 } } 1915