1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "RegexInterpreter.h" 28 29 #include "RegexCompiler.h" 30 #include "RegexPattern.h" 31 32 #ifndef NDEBUG 33 #include <stdio.h> 34 #endif 35 36 #if ENABLE(YARR) 37 38 using namespace WTF; 39 40 namespace JSC { namespace Yarr { 41 42 class Interpreter { 43 public: 44 struct ParenthesesDisjunctionContext; 45 46 struct BackTrackInfoPatternCharacter { 47 uintptr_t matchAmount; 48 }; 49 struct BackTrackInfoCharacterClass { 50 uintptr_t matchAmount; 51 }; 52 struct BackTrackInfoBackReference { 53 uintptr_t begin; // Not really needed for greedy quantifiers. 54 uintptr_t matchAmount; // Not really needed for fixed quantifiers. 55 }; 56 struct BackTrackInfoAlternative { 57 uintptr_t offset; 58 }; 59 struct BackTrackInfoParentheticalAssertion { 60 uintptr_t begin; 61 }; 62 struct BackTrackInfoParenthesesOnce { 63 uintptr_t inParentheses; 64 }; 65 struct BackTrackInfoParentheses { 66 uintptr_t matchAmount; 67 ParenthesesDisjunctionContext* lastContext; 68 uintptr_t prevBegin; 69 uintptr_t prevEnd; 70 }; 71 72 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) 73 { 74 context->next = backTrack->lastContext; 75 backTrack->lastContext = context; 76 ++backTrack->matchAmount; 77 } 78 79 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) 80 { 81 ASSERT(backTrack->matchAmount); 82 ASSERT(backTrack->lastContext); 83 backTrack->lastContext = backTrack->lastContext->next; 84 --backTrack->matchAmount; 85 } 86 87 struct DisjunctionContext 88 { 89 DisjunctionContext() 90 : term(0) 91 { 92 } 93 94 void* operator new(size_t, void* where) 95 { 96 return where; 97 } 98 99 int term; 100 unsigned matchBegin; 101 unsigned matchEnd; 102 uintptr_t frame[1]; 103 }; 104 105 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) 106 { 107 return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); 108 } 109 110 void freeDisjunctionContext(DisjunctionContext* context) 111 { 112 free(context); 113 } 114 115 struct ParenthesesDisjunctionContext 116 { 117 ParenthesesDisjunctionContext(int* output, ByteTerm& term) 118 : next(0) 119 { 120 unsigned firstSubpatternId = term.atom.subpatternId; 121 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; 122 123 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { 124 subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; 125 output[(firstSubpatternId << 1) + i] = -1; 126 } 127 128 new(getDisjunctionContext(term)) DisjunctionContext(); 129 } 130 131 void* operator new(size_t, void* where) 132 { 133 return where; 134 } 135 136 void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) 137 { 138 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) 139 output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; 140 } 141 142 DisjunctionContext* getDisjunctionContext(ByteTerm& term) 143 { 144 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); 145 } 146 147 ParenthesesDisjunctionContext* next; 148 int subpatternBackup[1]; 149 }; 150 151 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) 152 { 153 return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); 154 } 155 156 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) 157 { 158 free(context); 159 } 160 161 class InputStream { 162 public: 163 InputStream(const UChar* input, unsigned start, unsigned length) 164 : input(input) 165 , pos(start) 166 , length(length) 167 { 168 } 169 170 void next() 171 { 172 ++pos; 173 } 174 175 void rewind(unsigned amount) 176 { 177 ASSERT(pos >= amount); 178 pos -= amount; 179 } 180 181 int read() 182 { 183 ASSERT(pos < length); 184 if (pos < length) 185 return input[pos]; 186 return -1; 187 } 188 189 int readChecked(int position) 190 { 191 ASSERT(position < 0); 192 ASSERT((unsigned)-position <= pos); 193 unsigned p = pos + position; 194 ASSERT(p < length); 195 return input[p]; 196 } 197 198 int reread(unsigned from) 199 { 200 ASSERT(from < length); 201 return input[from]; 202 } 203 204 int prev() 205 { 206 ASSERT(!(pos > length)); 207 if (pos && length) 208 return input[pos - 1]; 209 return -1; 210 } 211 212 unsigned getPos() 213 { 214 return pos; 215 } 216 217 void setPos(unsigned p) 218 { 219 pos = p; 220 } 221 222 bool atStart() 223 { 224 return pos == 0; 225 } 226 227 bool atEnd() 228 { 229 return pos == length; 230 } 231 232 bool checkInput(int count) 233 { 234 if ((pos + count) <= length) { 235 pos += count; 236 return true; 237 } else 238 return false; 239 } 240 241 void uncheckInput(int count) 242 { 243 pos -= count; 244 } 245 246 bool atStart(int position) 247 { 248 return (pos + position) == 0; 249 } 250 251 bool atEnd(int position) 252 { 253 return (pos + position) == length; 254 } 255 256 private: 257 const UChar* input; 258 unsigned pos; 259 unsigned length; 260 }; 261 262 bool testCharacterClass(CharacterClass* characterClass, int ch) 263 { 264 if (ch & 0xFF80) { 265 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) 266 if (ch == characterClass->m_matchesUnicode[i]) 267 return true; 268 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) 269 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) 270 return true; 271 } else { 272 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) 273 if (ch == characterClass->m_matches[i]) 274 return true; 275 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) 276 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) 277 return true; 278 } 279 280 return false; 281 } 282 283 bool tryConsumeCharacter(int testChar) 284 { 285 if (input.atEnd()) 286 return false; 287 288 int ch = input.read(); 289 290 if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) { 291 input.next(); 292 return true; 293 } 294 return false; 295 } 296 297 bool checkCharacter(int testChar, int inputPosition) 298 { 299 return testChar == input.readChecked(inputPosition); 300 } 301 302 bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) 303 { 304 int ch = input.readChecked(inputPosition); 305 return (loChar == ch) || (hiChar == ch); 306 } 307 308 bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert) 309 { 310 if (input.atEnd()) 311 return false; 312 313 bool match = testCharacterClass(characterClass, input.read()); 314 315 if (invert) 316 match = !match; 317 318 if (match) { 319 input.next(); 320 return true; 321 } 322 return false; 323 } 324 325 bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) 326 { 327 bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); 328 return invert ? !match : match; 329 } 330 331 bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) 332 { 333 int matchSize = matchEnd - matchBegin; 334 335 if (!input.checkInput(matchSize)) 336 return false; 337 338 for (int i = 0; i < matchSize; ++i) { 339 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { 340 input.uncheckInput(matchSize); 341 return false; 342 } 343 } 344 345 return true; 346 } 347 348 bool matchAssertionBOL(ByteTerm& term) 349 { 350 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); 351 } 352 353 bool matchAssertionEOL(ByteTerm& term) 354 { 355 if (term.inputPosition) 356 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); 357 else 358 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); 359 } 360 361 bool matchAssertionWordBoundary(ByteTerm& term) 362 { 363 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); 364 bool readIsWordchar; 365 if (term.inputPosition) 366 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); 367 else 368 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); 369 370 bool wordBoundary = prevIsWordchar != readIsWordchar; 371 return term.invert() ? !wordBoundary : wordBoundary; 372 } 373 374 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) 375 { 376 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 377 378 switch (term.atom.quantityType) { 379 case QuantifierFixedCount: 380 break; 381 382 case QuantifierGreedy: 383 if (backTrack->matchAmount) { 384 --backTrack->matchAmount; 385 input.uncheckInput(1); 386 return true; 387 } 388 break; 389 390 case QuantifierNonGreedy: 391 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 392 ++backTrack->matchAmount; 393 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) 394 return true; 395 } 396 input.uncheckInput(backTrack->matchAmount); 397 break; 398 } 399 400 return false; 401 } 402 403 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) 404 { 405 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 406 407 switch (term.atom.quantityType) { 408 case QuantifierFixedCount: 409 break; 410 411 case QuantifierGreedy: 412 if (backTrack->matchAmount) { 413 --backTrack->matchAmount; 414 input.uncheckInput(1); 415 return true; 416 } 417 break; 418 419 case QuantifierNonGreedy: 420 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 421 ++backTrack->matchAmount; 422 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) 423 return true; 424 } 425 input.uncheckInput(backTrack->matchAmount); 426 break; 427 } 428 429 return false; 430 } 431 432 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) 433 { 434 ASSERT(term.type == ByteTerm::TypeCharacterClass); 435 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 436 437 switch (term.atom.quantityType) { 438 case QuantifierFixedCount: { 439 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 440 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) 441 return false; 442 } 443 return true; 444 } 445 446 case QuantifierGreedy: { 447 unsigned matchAmount = 0; 448 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 449 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { 450 input.uncheckInput(1); 451 break; 452 } 453 ++matchAmount; 454 } 455 backTrack->matchAmount = matchAmount; 456 457 return true; 458 } 459 460 case QuantifierNonGreedy: 461 backTrack->matchAmount = 0; 462 return true; 463 } 464 465 ASSERT_NOT_REACHED(); 466 return false; 467 } 468 469 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) 470 { 471 ASSERT(term.type == ByteTerm::TypeCharacterClass); 472 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); 473 474 switch (term.atom.quantityType) { 475 case QuantifierFixedCount: 476 break; 477 478 case QuantifierGreedy: 479 if (backTrack->matchAmount) { 480 --backTrack->matchAmount; 481 input.uncheckInput(1); 482 return true; 483 } 484 break; 485 486 case QuantifierNonGreedy: 487 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { 488 ++backTrack->matchAmount; 489 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) 490 return true; 491 } 492 input.uncheckInput(backTrack->matchAmount); 493 break; 494 } 495 496 return false; 497 } 498 499 bool matchBackReference(ByteTerm& term, DisjunctionContext* context) 500 { 501 ASSERT(term.type == ByteTerm::TypeBackReference); 502 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); 503 504 int matchBegin = output[(term.atom.subpatternId << 1)]; 505 int matchEnd = output[(term.atom.subpatternId << 1) + 1]; 506 ASSERT((matchBegin == -1) == (matchEnd == -1)); 507 ASSERT(matchBegin <= matchEnd); 508 509 if (matchBegin == matchEnd) 510 return true; 511 512 switch (term.atom.quantityType) { 513 case QuantifierFixedCount: { 514 backTrack->begin = input.getPos(); 515 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { 516 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { 517 input.setPos(backTrack->begin); 518 return false; 519 } 520 } 521 return true; 522 } 523 524 case QuantifierGreedy: { 525 unsigned matchAmount = 0; 526 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) 527 ++matchAmount; 528 backTrack->matchAmount = matchAmount; 529 return true; 530 } 531 532 case QuantifierNonGreedy: 533 backTrack->begin = input.getPos(); 534 backTrack->matchAmount = 0; 535 return true; 536 } 537 538 ASSERT_NOT_REACHED(); 539 return false; 540 } 541 542 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) 543 { 544 ASSERT(term.type == ByteTerm::TypeBackReference); 545 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); 546 547 int matchBegin = output[(term.atom.subpatternId << 1)]; 548 int matchEnd = output[(term.atom.subpatternId << 1) + 1]; 549 ASSERT((matchBegin == -1) == (matchEnd == -1)); 550 ASSERT(matchBegin <= matchEnd); 551 552 if (matchBegin == matchEnd) 553 return false; 554 555 switch (term.atom.quantityType) { 556 case QuantifierFixedCount: 557 // for quantityCount == 1, could rewind. 558 input.setPos(backTrack->begin); 559 break; 560 561 case QuantifierGreedy: 562 if (backTrack->matchAmount) { 563 --backTrack->matchAmount; 564 input.rewind(matchEnd - matchBegin); 565 return true; 566 } 567 break; 568 569 case QuantifierNonGreedy: 570 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { 571 ++backTrack->matchAmount; 572 return true; 573 } else 574 input.setPos(backTrack->begin); 575 break; 576 } 577 578 return false; 579 } 580 581 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) 582 { 583 if (term.capture()) { 584 unsigned subpatternId = term.atom.subpatternId; 585 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; 586 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; 587 } 588 } 589 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) 590 { 591 unsigned firstSubpatternId = term.atom.subpatternId; 592 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; 593 context->restoreOutput(output, firstSubpatternId, count); 594 } 595 void resetAssertionMatches(ByteTerm& term) 596 { 597 unsigned firstSubpatternId = term.atom.subpatternId; 598 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; 599 for (unsigned i = 0; i < (count << 1); ++i) 600 output[(firstSubpatternId << 1) + i] = -1; 601 } 602 bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) 603 { 604 while (backTrack->matchAmount) { 605 ParenthesesDisjunctionContext* context = backTrack->lastContext; 606 607 if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) 608 return true; 609 610 resetMatches(term, context); 611 popParenthesesDisjunctionContext(backTrack); 612 freeParenthesesDisjunctionContext(context); 613 } 614 615 return false; 616 } 617 618 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) 619 { 620 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 621 ASSERT(term.atom.quantityCount == 1); 622 623 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 624 625 switch (term.atom.quantityType) { 626 case QuantifierGreedy: { 627 // set this speculatively; if we get to the parens end this will be true. 628 backTrack->inParentheses = 1; 629 break; 630 } 631 case QuantifierNonGreedy: { 632 backTrack->inParentheses = 0; 633 context->term += term.atom.parenthesesWidth; 634 return true; 635 } 636 case QuantifierFixedCount: 637 break; 638 } 639 640 if (term.capture()) { 641 unsigned subpatternId = term.atom.subpatternId; 642 output[(subpatternId << 1)] = input.getPos() + term.inputPosition; 643 } 644 645 return true; 646 } 647 648 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) 649 { 650 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); 651 ASSERT(term.atom.quantityCount == 1); 652 653 if (term.capture()) { 654 unsigned subpatternId = term.atom.subpatternId; 655 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; 656 } 657 return true; 658 } 659 660 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) 661 { 662 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 663 ASSERT(term.atom.quantityCount == 1); 664 665 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 666 667 if (term.capture()) { 668 unsigned subpatternId = term.atom.subpatternId; 669 output[(subpatternId << 1)] = -1; 670 output[(subpatternId << 1) + 1] = -1; 671 } 672 673 switch (term.atom.quantityType) { 674 case QuantifierGreedy: 675 // if we backtrack to this point, there is another chance - try matching nothing. 676 ASSERT(backTrack->inParentheses); 677 backTrack->inParentheses = 0; 678 context->term += term.atom.parenthesesWidth; 679 return true; 680 case QuantifierNonGreedy: 681 ASSERT(backTrack->inParentheses); 682 case QuantifierFixedCount: 683 break; 684 } 685 686 return false; 687 } 688 689 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) 690 { 691 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); 692 ASSERT(term.atom.quantityCount == 1); 693 694 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); 695 696 switch (term.atom.quantityType) { 697 case QuantifierGreedy: 698 if (!backTrack->inParentheses) { 699 context->term -= term.atom.parenthesesWidth; 700 return false; 701 } 702 case QuantifierNonGreedy: 703 if (!backTrack->inParentheses) { 704 // now try to match the parens; set this speculatively. 705 backTrack->inParentheses = 1; 706 if (term.capture()) { 707 unsigned subpatternId = term.atom.subpatternId; 708 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; 709 } 710 context->term -= term.atom.parenthesesWidth; 711 return true; 712 } 713 case QuantifierFixedCount: 714 break; 715 } 716 717 return false; 718 } 719 720 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) 721 { 722 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); 723 ASSERT(term.atom.quantityCount == 1); 724 725 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 726 727 backTrack->begin = input.getPos(); 728 return true; 729 } 730 731 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) 732 { 733 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); 734 ASSERT(term.atom.quantityCount == 1); 735 736 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 737 738 input.setPos(backTrack->begin); 739 740 // We've reached the end of the parens; if they are inverted, this is failure. 741 if (term.invert()) { 742 context->term -= term.atom.parenthesesWidth; 743 return false; 744 } 745 746 return true; 747 } 748 749 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) 750 { 751 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); 752 ASSERT(term.atom.quantityCount == 1); 753 754 // We've failed to match parens; if they are inverted, this is win! 755 if (term.invert()) { 756 context->term += term.atom.parenthesesWidth; 757 return true; 758 } 759 760 return false; 761 } 762 763 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) 764 { 765 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); 766 ASSERT(term.atom.quantityCount == 1); 767 768 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); 769 770 input.setPos(backTrack->begin); 771 772 context->term -= term.atom.parenthesesWidth; 773 return false; 774 } 775 776 bool matchParentheses(ByteTerm& term, DisjunctionContext* context) 777 { 778 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); 779 780 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); 781 782 unsigned subpatternId = term.atom.subpatternId; 783 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; 784 785 backTrack->prevBegin = output[(subpatternId << 1)]; 786 backTrack->prevEnd = output[(subpatternId << 1) + 1]; 787 788 backTrack->matchAmount = 0; 789 backTrack->lastContext = 0; 790 791 switch (term.atom.quantityType) { 792 case QuantifierFixedCount: { 793 // While we haven't yet reached our fixed limit, 794 while (backTrack->matchAmount < term.atom.quantityCount) { 795 // Try to do a match, and it it succeeds, add it to the list. 796 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 797 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) 798 appendParenthesesDisjunctionContext(backTrack, context); 799 else { 800 // The match failed; try to find an alternate point to carry on from. 801 resetMatches(term, context); 802 freeParenthesesDisjunctionContext(context); 803 if (!parenthesesDoBacktrack(term, backTrack)) 804 return false; 805 } 806 } 807 808 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 809 ParenthesesDisjunctionContext* context = backTrack->lastContext; 810 recordParenthesesMatch(term, context); 811 return true; 812 } 813 814 case QuantifierGreedy: { 815 while (backTrack->matchAmount < term.atom.quantityCount) { 816 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 817 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) 818 appendParenthesesDisjunctionContext(backTrack, context); 819 else { 820 resetMatches(term, context); 821 freeParenthesesDisjunctionContext(context); 822 break; 823 } 824 } 825 826 if (backTrack->matchAmount) { 827 ParenthesesDisjunctionContext* context = backTrack->lastContext; 828 recordParenthesesMatch(term, context); 829 } 830 return true; 831 } 832 833 case QuantifierNonGreedy: 834 return true; 835 } 836 837 ASSERT_NOT_REACHED(); 838 return false; 839 } 840 841 // Rules for backtracking differ depending on whether this is greedy or non-greedy. 842 // 843 // Greedy matches never should try just adding more - you should already have done 844 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where 845 // you backtrack an item off the list needs checking, since we'll never have matched 846 // the one less case. Tracking forwards, still add as much as possible. 847 // 848 // Non-greedy, we've already done the one less case, so don't match on popping. 849 // We haven't done the one more case, so always try to add that. 850 // 851 bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) 852 { 853 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); 854 855 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); 856 857 if (term.capture()) { 858 unsigned subpatternId = term.atom.subpatternId; 859 output[(subpatternId << 1)] = backTrack->prevBegin; 860 output[(subpatternId << 1) + 1] = backTrack->prevEnd; 861 } 862 863 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; 864 865 switch (term.atom.quantityType) { 866 case QuantifierFixedCount: { 867 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 868 869 ParenthesesDisjunctionContext* context = 0; 870 871 if (!parenthesesDoBacktrack(term, backTrack)) 872 return false; 873 874 // While we haven't yet reached our fixed limit, 875 while (backTrack->matchAmount < term.atom.quantityCount) { 876 // Try to do a match, and it it succeeds, add it to the list. 877 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 878 if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) 879 appendParenthesesDisjunctionContext(backTrack, context); 880 else { 881 // The match failed; try to find an alternate point to carry on from. 882 resetMatches(term, context); 883 freeParenthesesDisjunctionContext(context); 884 if (!parenthesesDoBacktrack(term, backTrack)) 885 return false; 886 } 887 } 888 889 ASSERT(backTrack->matchAmount == term.atom.quantityCount); 890 context = backTrack->lastContext; 891 recordParenthesesMatch(term, context); 892 return true; 893 } 894 895 case QuantifierGreedy: { 896 if (!backTrack->matchAmount) 897 return false; 898 899 ParenthesesDisjunctionContext* context = backTrack->lastContext; 900 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { 901 while (backTrack->matchAmount < term.atom.quantityCount) { 902 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 903 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) 904 appendParenthesesDisjunctionContext(backTrack, context); 905 else { 906 resetMatches(term, context); 907 freeParenthesesDisjunctionContext(context); 908 break; 909 } 910 } 911 } else { 912 resetMatches(term, context); 913 popParenthesesDisjunctionContext(backTrack); 914 freeParenthesesDisjunctionContext(context); 915 } 916 917 if (backTrack->matchAmount) { 918 ParenthesesDisjunctionContext* context = backTrack->lastContext; 919 recordParenthesesMatch(term, context); 920 } 921 return true; 922 } 923 924 case QuantifierNonGreedy: { 925 // If we've not reached the limit, try to add one more match. 926 if (backTrack->matchAmount < term.atom.quantityCount) { 927 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); 928 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { 929 appendParenthesesDisjunctionContext(backTrack, context); 930 recordParenthesesMatch(term, context); 931 return true; 932 } else { 933 resetMatches(term, context); 934 freeParenthesesDisjunctionContext(context); 935 } 936 } 937 938 // Nope - okay backtrack looking for an alternative. 939 while (backTrack->matchAmount) { 940 ParenthesesDisjunctionContext* context = backTrack->lastContext; 941 if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { 942 // successful backtrack! we're back in the game! 943 if (backTrack->matchAmount) { 944 context = backTrack->lastContext; 945 recordParenthesesMatch(term, context); 946 } 947 return true; 948 } 949 950 // pop a match off the stack 951 resetMatches(term, context); 952 popParenthesesDisjunctionContext(backTrack); 953 freeParenthesesDisjunctionContext(context); 954 } 955 956 return false; 957 } 958 } 959 960 ASSERT_NOT_REACHED(); 961 return false; 962 } 963 964 #define MATCH_NEXT() { ++context->term; goto matchAgain; } 965 #define BACKTRACK() { --context->term; goto backtrack; } 966 #define currentTerm() (disjunction->terms[context->term]) 967 bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) 968 { 969 if (btrack) 970 BACKTRACK(); 971 972 context->matchBegin = input.getPos(); 973 context->term = 0; 974 975 matchAgain: 976 ASSERT(context->term < static_cast<int>(disjunction->terms.size())); 977 978 switch (currentTerm().type) { 979 case ByteTerm::TypeSubpatternBegin: 980 MATCH_NEXT(); 981 case ByteTerm::TypeSubpatternEnd: 982 context->matchEnd = input.getPos(); 983 return true; 984 985 case ByteTerm::TypeBodyAlternativeBegin: 986 MATCH_NEXT(); 987 case ByteTerm::TypeBodyAlternativeDisjunction: 988 case ByteTerm::TypeBodyAlternativeEnd: 989 context->matchEnd = input.getPos(); 990 return true; 991 992 case ByteTerm::TypeAlternativeBegin: 993 MATCH_NEXT(); 994 case ByteTerm::TypeAlternativeDisjunction: 995 case ByteTerm::TypeAlternativeEnd: { 996 int offset = currentTerm().alternative.end; 997 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); 998 backTrack->offset = offset; 999 context->term += offset; 1000 MATCH_NEXT(); 1001 } 1002 1003 case ByteTerm::TypeAssertionBOL: 1004 if (matchAssertionBOL(currentTerm())) 1005 MATCH_NEXT(); 1006 BACKTRACK(); 1007 case ByteTerm::TypeAssertionEOL: 1008 if (matchAssertionEOL(currentTerm())) 1009 MATCH_NEXT(); 1010 BACKTRACK(); 1011 case ByteTerm::TypeAssertionWordBoundary: 1012 if (matchAssertionWordBoundary(currentTerm())) 1013 MATCH_NEXT(); 1014 BACKTRACK(); 1015 1016 case ByteTerm::TypePatternCharacterOnce: 1017 case ByteTerm::TypePatternCharacterFixed: { 1018 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1019 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) 1020 BACKTRACK(); 1021 } 1022 MATCH_NEXT(); 1023 } 1024 case ByteTerm::TypePatternCharacterGreedy: { 1025 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1026 unsigned matchAmount = 0; 1027 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { 1028 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { 1029 input.uncheckInput(1); 1030 break; 1031 } 1032 ++matchAmount; 1033 } 1034 backTrack->matchAmount = matchAmount; 1035 1036 MATCH_NEXT(); 1037 } 1038 case ByteTerm::TypePatternCharacterNonGreedy: { 1039 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1040 backTrack->matchAmount = 0; 1041 MATCH_NEXT(); 1042 } 1043 1044 case ByteTerm::TypePatternCasedCharacterOnce: 1045 case ByteTerm::TypePatternCasedCharacterFixed: { 1046 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { 1047 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) 1048 BACKTRACK(); 1049 } 1050 MATCH_NEXT(); 1051 } 1052 case ByteTerm::TypePatternCasedCharacterGreedy: { 1053 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1054 unsigned matchAmount = 0; 1055 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { 1056 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { 1057 input.uncheckInput(1); 1058 break; 1059 } 1060 ++matchAmount; 1061 } 1062 backTrack->matchAmount = matchAmount; 1063 1064 MATCH_NEXT(); 1065 } 1066 case ByteTerm::TypePatternCasedCharacterNonGreedy: { 1067 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); 1068 backTrack->matchAmount = 0; 1069 MATCH_NEXT(); 1070 } 1071 1072 case ByteTerm::TypeCharacterClass: 1073 if (matchCharacterClass(currentTerm(), context)) 1074 MATCH_NEXT(); 1075 BACKTRACK(); 1076 case ByteTerm::TypeBackReference: 1077 if (matchBackReference(currentTerm(), context)) 1078 MATCH_NEXT(); 1079 BACKTRACK(); 1080 case ByteTerm::TypeParenthesesSubpattern: 1081 if (matchParentheses(currentTerm(), context)) 1082 MATCH_NEXT(); 1083 BACKTRACK(); 1084 case ByteTerm::TypeParenthesesSubpatternOnceBegin: 1085 if (matchParenthesesOnceBegin(currentTerm(), context)) 1086 MATCH_NEXT(); 1087 BACKTRACK(); 1088 case ByteTerm::TypeParenthesesSubpatternOnceEnd: 1089 if (matchParenthesesOnceEnd(currentTerm(), context)) 1090 MATCH_NEXT(); 1091 BACKTRACK(); 1092 case ByteTerm::TypeParentheticalAssertionBegin: 1093 if (matchParentheticalAssertionBegin(currentTerm(), context)) 1094 MATCH_NEXT(); 1095 BACKTRACK(); 1096 case ByteTerm::TypeParentheticalAssertionEnd: 1097 if (matchParentheticalAssertionEnd(currentTerm(), context)) 1098 MATCH_NEXT(); 1099 BACKTRACK(); 1100 1101 case ByteTerm::TypeCheckInput: 1102 if (input.checkInput(currentTerm().checkInputCount)) 1103 MATCH_NEXT(); 1104 BACKTRACK(); 1105 } 1106 1107 // We should never fall-through to here. 1108 ASSERT_NOT_REACHED(); 1109 1110 backtrack: 1111 ASSERT(context->term < static_cast<int>(disjunction->terms.size())); 1112 1113 switch (currentTerm().type) { 1114 case ByteTerm::TypeSubpatternBegin: 1115 return false; 1116 case ByteTerm::TypeSubpatternEnd: 1117 ASSERT_NOT_REACHED(); 1118 1119 case ByteTerm::TypeBodyAlternativeBegin: 1120 case ByteTerm::TypeBodyAlternativeDisjunction: { 1121 int offset = currentTerm().alternative.next; 1122 context->term += offset; 1123 if (offset > 0) 1124 MATCH_NEXT(); 1125 1126 if (input.atEnd()) 1127 return false; 1128 1129 input.next(); 1130 context->matchBegin = input.getPos(); 1131 MATCH_NEXT(); 1132 } 1133 case ByteTerm::TypeBodyAlternativeEnd: 1134 ASSERT_NOT_REACHED(); 1135 1136 case ByteTerm::TypeAlternativeBegin: 1137 case ByteTerm::TypeAlternativeDisjunction: { 1138 int offset = currentTerm().alternative.next; 1139 context->term += offset; 1140 if (offset > 0) 1141 MATCH_NEXT(); 1142 BACKTRACK(); 1143 } 1144 case ByteTerm::TypeAlternativeEnd: { 1145 // We should never backtrack back into an alternative of the main body of the regex. 1146 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); 1147 unsigned offset = backTrack->offset; 1148 context->term -= offset; 1149 BACKTRACK(); 1150 } 1151 1152 case ByteTerm::TypeAssertionBOL: 1153 case ByteTerm::TypeAssertionEOL: 1154 case ByteTerm::TypeAssertionWordBoundary: 1155 BACKTRACK(); 1156 1157 case ByteTerm::TypePatternCharacterOnce: 1158 case ByteTerm::TypePatternCharacterFixed: 1159 case ByteTerm::TypePatternCharacterGreedy: 1160 case ByteTerm::TypePatternCharacterNonGreedy: 1161 if (backtrackPatternCharacter(currentTerm(), context)) 1162 MATCH_NEXT(); 1163 BACKTRACK(); 1164 case ByteTerm::TypePatternCasedCharacterOnce: 1165 case ByteTerm::TypePatternCasedCharacterFixed: 1166 case ByteTerm::TypePatternCasedCharacterGreedy: 1167 case ByteTerm::TypePatternCasedCharacterNonGreedy: 1168 if (backtrackPatternCasedCharacter(currentTerm(), context)) 1169 MATCH_NEXT(); 1170 BACKTRACK(); 1171 case ByteTerm::TypeCharacterClass: 1172 if (backtrackCharacterClass(currentTerm(), context)) 1173 MATCH_NEXT(); 1174 BACKTRACK(); 1175 case ByteTerm::TypeBackReference: 1176 if (backtrackBackReference(currentTerm(), context)) 1177 MATCH_NEXT(); 1178 BACKTRACK(); 1179 case ByteTerm::TypeParenthesesSubpattern: 1180 if (backtrackParentheses(currentTerm(), context)) 1181 MATCH_NEXT(); 1182 BACKTRACK(); 1183 case ByteTerm::TypeParenthesesSubpatternOnceBegin: 1184 if (backtrackParenthesesOnceBegin(currentTerm(), context)) 1185 MATCH_NEXT(); 1186 BACKTRACK(); 1187 case ByteTerm::TypeParenthesesSubpatternOnceEnd: 1188 if (backtrackParenthesesOnceEnd(currentTerm(), context)) 1189 MATCH_NEXT(); 1190 BACKTRACK(); 1191 case ByteTerm::TypeParentheticalAssertionBegin: 1192 if (backtrackParentheticalAssertionBegin(currentTerm(), context)) 1193 MATCH_NEXT(); 1194 BACKTRACK(); 1195 case ByteTerm::TypeParentheticalAssertionEnd: 1196 if (backtrackParentheticalAssertionEnd(currentTerm(), context)) 1197 MATCH_NEXT(); 1198 BACKTRACK(); 1199 1200 case ByteTerm::TypeCheckInput: 1201 input.uncheckInput(currentTerm().checkInputCount); 1202 BACKTRACK(); 1203 } 1204 1205 ASSERT_NOT_REACHED(); 1206 return false; 1207 } 1208 1209 bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) 1210 { 1211 if (matchDisjunction(disjunction, context, btrack)) { 1212 while (context->matchBegin == context->matchEnd) { 1213 if (!matchDisjunction(disjunction, context, true)) 1214 return false; 1215 } 1216 return true; 1217 } 1218 1219 return false; 1220 } 1221 1222 int interpret() 1223 { 1224 for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) 1225 output[i] = -1; 1226 1227 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); 1228 1229 if (matchDisjunction(pattern->m_body.get(), context)) { 1230 output[0] = context->matchBegin; 1231 output[1] = context->matchEnd; 1232 } 1233 1234 freeDisjunctionContext(context); 1235 1236 return output[0]; 1237 } 1238 1239 Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) 1240 : pattern(pattern) 1241 , output(output) 1242 , input(inputChar, start, length) 1243 { 1244 } 1245 1246 private: 1247 BytecodePattern *pattern; 1248 int* output; 1249 InputStream input; 1250 }; 1251 1252 1253 1254 class ByteCompiler { 1255 struct ParenthesesStackEntry { 1256 unsigned beginTerm; 1257 unsigned savedAlternativeIndex; 1258 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) 1259 : beginTerm(beginTerm) 1260 , savedAlternativeIndex(savedAlternativeIndex) 1261 { 1262 } 1263 }; 1264 1265 public: 1266 ByteCompiler(RegexPattern& pattern) 1267 : m_pattern(pattern) 1268 { 1269 m_bodyDisjunction = 0; 1270 m_currentAlternativeIndex = 0; 1271 } 1272 1273 BytecodePattern* compile() 1274 { 1275 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); 1276 emitDisjunction(m_pattern.m_body); 1277 regexEnd(); 1278 1279 return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); 1280 } 1281 1282 void checkInput(unsigned count) 1283 { 1284 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); 1285 } 1286 1287 void assertionBOL(int inputPosition) 1288 { 1289 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); 1290 } 1291 1292 void assertionEOL(int inputPosition) 1293 { 1294 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); 1295 } 1296 1297 void assertionWordBoundary(bool invert, int inputPosition) 1298 { 1299 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); 1300 } 1301 1302 void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1303 { 1304 if (m_pattern.m_ignoreCase) { 1305 UChar lo = Unicode::toLower(ch); 1306 UChar hi = Unicode::toUpper(ch); 1307 1308 if (lo != hi) { 1309 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); 1310 return; 1311 } 1312 } 1313 1314 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); 1315 } 1316 1317 void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1318 { 1319 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); 1320 1321 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; 1322 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; 1323 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1324 } 1325 1326 void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 1327 { 1328 ASSERT(subpatternId); 1329 1330 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); 1331 1332 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; 1333 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; 1334 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1335 } 1336 1337 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) 1338 { 1339 int beginTerm = m_bodyDisjunction->terms.size(); 1340 1341 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); 1342 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1343 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1344 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1345 1346 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1347 m_currentAlternativeIndex = beginTerm + 1; 1348 } 1349 1350 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) 1351 { 1352 int beginTerm = m_bodyDisjunction->terms.size(); 1353 1354 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); 1355 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; 1356 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); 1357 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; 1358 1359 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); 1360 m_currentAlternativeIndex = beginTerm + 1; 1361 } 1362 1363 unsigned popParenthesesStack() 1364 { 1365 ASSERT(m_parenthesesStack.size()); 1366 int stackEnd = m_parenthesesStack.size() - 1; 1367 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; 1368 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; 1369 m_parenthesesStack.shrink(stackEnd); 1370 1371 ASSERT(beginTerm < m_bodyDisjunction->terms.size()); 1372 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); 1373 1374 return beginTerm; 1375 } 1376 1377 #ifndef NDEBUG 1378 void dumpDisjunction(ByteDisjunction* disjunction) 1379 { 1380 printf("ByteDisjunction(%p):\n\t", disjunction); 1381 for (unsigned i = 0; i < disjunction->terms.size(); ++i) 1382 printf("{ %d } ", disjunction->terms[i].type); 1383 printf("\n"); 1384 } 1385 #endif 1386 1387 void closeAlternative(int beginTerm) 1388 { 1389 int origBeginTerm = beginTerm; 1390 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); 1391 int endIndex = m_bodyDisjunction->terms.size(); 1392 1393 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; 1394 1395 if (!m_bodyDisjunction->terms[beginTerm].alternative.next) 1396 m_bodyDisjunction->terms.remove(beginTerm); 1397 else { 1398 while (m_bodyDisjunction->terms[beginTerm].alternative.next) { 1399 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; 1400 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); 1401 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; 1402 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1403 } 1404 1405 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; 1406 1407 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); 1408 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; 1409 } 1410 } 1411 1412 void closeBodyAlternative() 1413 { 1414 int beginTerm = 0; 1415 int origBeginTerm = 0; 1416 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); 1417 int endIndex = m_bodyDisjunction->terms.size(); 1418 1419 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; 1420 1421 while (m_bodyDisjunction->terms[beginTerm].alternative.next) { 1422 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; 1423 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); 1424 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; 1425 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1426 } 1427 1428 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; 1429 1430 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); 1431 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; 1432 } 1433 1434 void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) 1435 { 1436 unsigned beginTerm = popParenthesesStack(); 1437 closeAlternative(beginTerm + 1); 1438 unsigned endTerm = m_bodyDisjunction->terms.size(); 1439 1440 bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; 1441 bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; 1442 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; 1443 1444 m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); 1445 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; 1446 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 1447 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; 1448 1449 if (doInline) { 1450 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; 1451 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1452 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; 1453 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; 1454 } else { 1455 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; 1456 ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); 1457 1458 bool invertOrCapture = parenthesesBegin.invertOrCapture; 1459 unsigned subpatternId = parenthesesBegin.atom.subpatternId; 1460 1461 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; 1462 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); 1463 1464 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); 1465 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) 1466 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); 1467 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); 1468 1469 m_bodyDisjunction->terms.shrink(beginTerm); 1470 1471 m_allParenthesesInfo.append(parenthesesDisjunction); 1472 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); 1473 1474 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; 1475 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; 1476 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; 1477 } 1478 } 1479 1480 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) 1481 { 1482 m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); 1483 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); 1484 m_bodyDisjunction->terms[0].frameLocation = 0; 1485 m_currentAlternativeIndex = 0; 1486 } 1487 1488 void regexEnd() 1489 { 1490 closeBodyAlternative(); 1491 } 1492 1493 void alternativeBodyDisjunction() 1494 { 1495 int newAlternativeIndex = m_bodyDisjunction->terms.size(); 1496 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; 1497 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); 1498 1499 m_currentAlternativeIndex = newAlternativeIndex; 1500 } 1501 1502 void alternativeDisjunction() 1503 { 1504 int newAlternativeIndex = m_bodyDisjunction->terms.size(); 1505 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; 1506 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); 1507 1508 m_currentAlternativeIndex = newAlternativeIndex; 1509 } 1510 1511 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) 1512 { 1513 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 1514 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; 1515 1516 if (alt) { 1517 if (disjunction == m_pattern.m_body) 1518 alternativeBodyDisjunction(); 1519 else 1520 alternativeDisjunction(); 1521 } 1522 1523 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 1524 unsigned minimumSize = alternative->m_minimumSize; 1525 1526 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); 1527 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; 1528 if (countToCheck) 1529 checkInput(countToCheck); 1530 currentCountAlreadyChecked += countToCheck; 1531 1532 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { 1533 PatternTerm& term = alternative->m_terms[i]; 1534 1535 switch (term.type) { 1536 case PatternTerm::TypeAssertionBOL: 1537 assertionBOL(term.inputPosition - currentCountAlreadyChecked); 1538 break; 1539 1540 case PatternTerm::TypeAssertionEOL: 1541 assertionEOL(term.inputPosition - currentCountAlreadyChecked); 1542 break; 1543 1544 case PatternTerm::TypeAssertionWordBoundary: 1545 assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked); 1546 break; 1547 1548 case PatternTerm::TypePatternCharacter: 1549 atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); 1550 break; 1551 1552 case PatternTerm::TypeCharacterClass: 1553 atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); 1554 break; 1555 1556 case PatternTerm::TypeBackReference: 1557 atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); 1558 break; 1559 1560 case PatternTerm::TypeForwardReference: 1561 break; 1562 1563 case PatternTerm::TypeParenthesesSubpattern: { 1564 unsigned disjunctionAlreadyCheckedCount = 0; 1565 if ((term.quantityCount == 1) && !term.parentheses.isCopy) { 1566 if (term.quantityType == QuantifierFixedCount) { 1567 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; 1568 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1569 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); 1570 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); 1571 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); 1572 } else { 1573 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1574 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); 1575 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); 1576 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); 1577 } 1578 } else { 1579 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; 1580 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); 1581 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); 1582 atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); 1583 } 1584 break; 1585 } 1586 1587 case PatternTerm::TypeParentheticalAssertion: { 1588 unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion; 1589 1590 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); 1591 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); 1592 atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); 1593 break; 1594 } 1595 } 1596 } 1597 } 1598 } 1599 1600 private: 1601 RegexPattern& m_pattern; 1602 ByteDisjunction* m_bodyDisjunction; 1603 unsigned m_currentAlternativeIndex; 1604 Vector<ParenthesesStackEntry> m_parenthesesStack; 1605 Vector<ByteDisjunction*> m_allParenthesesInfo; 1606 }; 1607 1608 1609 BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) 1610 { 1611 RegexPattern pattern(ignoreCase, multiline); 1612 1613 if ((error = compileRegex(patternString, pattern))) 1614 return 0; 1615 1616 numSubpatterns = pattern.m_numSubpatterns; 1617 1618 return ByteCompiler(pattern).compile(); 1619 } 1620 1621 int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) 1622 { 1623 return Interpreter(regex, output, input, start, length).interpret(); 1624 } 1625 1626 1627 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); 1628 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); 1629 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); 1630 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); 1631 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); 1632 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); 1633 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); 1634 1635 1636 } } 1637 1638 #endif 1639