1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "RegexCompiler.h" 28 29 #include "RegexInterpreter.h" 30 #include "RegexPattern.h" 31 #include <wtf/Vector.h> 32 33 #if ENABLE(YARR) 34 35 using namespace WTF; 36 37 namespace JSC { namespace Yarr { 38 39 class CharacterClassConstructor { 40 public: 41 CharacterClassConstructor(bool isCaseInsensitive = false) 42 : m_isCaseInsensitive(isCaseInsensitive) 43 { 44 } 45 46 void reset() 47 { 48 m_matches.clear(); 49 m_ranges.clear(); 50 m_matchesUnicode.clear(); 51 m_rangesUnicode.clear(); 52 } 53 54 void append(const CharacterClass* other) 55 { 56 for (size_t i = 0; i < other->m_matches.size(); ++i) 57 addSorted(m_matches, other->m_matches[i]); 58 for (size_t i = 0; i < other->m_ranges.size(); ++i) 59 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); 60 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) 61 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); 62 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) 63 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); 64 } 65 66 void putChar(UChar ch) 67 { 68 if (ch <= 0x7f) { 69 if (m_isCaseInsensitive && isASCIIAlpha(ch)) { 70 addSorted(m_matches, toASCIIUpper(ch)); 71 addSorted(m_matches, toASCIILower(ch)); 72 } else 73 addSorted(m_matches, ch); 74 } else { 75 UChar upper, lower; 76 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { 77 addSorted(m_matchesUnicode, upper); 78 addSorted(m_matchesUnicode, lower); 79 } else 80 addSorted(m_matchesUnicode, ch); 81 } 82 } 83 84 // returns true if this character has another case, and 'ch' is the upper case form. 85 static inline bool isUnicodeUpper(UChar ch) 86 { 87 return ch != Unicode::toLower(ch); 88 } 89 90 // returns true if this character has another case, and 'ch' is the lower case form. 91 static inline bool isUnicodeLower(UChar ch) 92 { 93 return ch != Unicode::toUpper(ch); 94 } 95 96 void putRange(UChar lo, UChar hi) 97 { 98 if (lo <= 0x7f) { 99 char asciiLo = lo; 100 char asciiHi = std::min(hi, (UChar)0x7f); 101 addSortedRange(m_ranges, lo, asciiHi); 102 103 if (m_isCaseInsensitive) { 104 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) 105 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); 106 if ((asciiLo <= 'z') && (asciiHi >= 'a')) 107 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); 108 } 109 } 110 if (hi >= 0x80) { 111 uint32_t unicodeCurr = std::max(lo, (UChar)0x80); 112 addSortedRange(m_rangesUnicode, unicodeCurr, hi); 113 114 if (m_isCaseInsensitive) { 115 while (unicodeCurr <= hi) { 116 // If the upper bound of the range (hi) is 0xffff, the increments to 117 // unicodeCurr in this loop may take it to 0x10000. This is fine 118 // (if so we won't re-enter the loop, since the loop condition above 119 // will definitely fail) - but this does mean we cannot use a UChar 120 // to represent unicodeCurr, we must use a 32-bit value instead. 121 ASSERT(unicodeCurr <= 0xffff); 122 123 if (isUnicodeUpper(unicodeCurr)) { 124 UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); 125 UChar lowerCaseRangeEnd = lowerCaseRangeBegin; 126 while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) 127 lowerCaseRangeEnd++; 128 addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); 129 } else if (isUnicodeLower(unicodeCurr)) { 130 UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); 131 UChar upperCaseRangeEnd = upperCaseRangeBegin; 132 while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) 133 upperCaseRangeEnd++; 134 addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); 135 } else 136 ++unicodeCurr; 137 } 138 } 139 } 140 } 141 142 CharacterClass* charClass() 143 { 144 CharacterClass* characterClass = new CharacterClass(); 145 146 characterClass->m_matches.append(m_matches); 147 characterClass->m_ranges.append(m_ranges); 148 characterClass->m_matchesUnicode.append(m_matchesUnicode); 149 characterClass->m_rangesUnicode.append(m_rangesUnicode); 150 151 reset(); 152 153 return characterClass; 154 } 155 156 private: 157 void addSorted(Vector<UChar>& matches, UChar ch) 158 { 159 unsigned pos = 0; 160 unsigned range = matches.size(); 161 162 // binary chop, find position to insert char. 163 while (range) { 164 unsigned index = range >> 1; 165 166 int val = matches[pos+index] - ch; 167 if (!val) 168 return; 169 else if (val > 0) 170 range = index; 171 else { 172 pos += (index+1); 173 range -= (index+1); 174 } 175 } 176 177 if (pos == matches.size()) 178 matches.append(ch); 179 else 180 matches.insert(pos, ch); 181 } 182 183 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) 184 { 185 unsigned end = ranges.size(); 186 187 // Simple linear scan - I doubt there are that many ranges anyway... 188 // feel free to fix this with something faster (eg binary chop). 189 for (unsigned i = 0; i < end; ++i) { 190 // does the new range fall before the current position in the array 191 if (hi < ranges[i].begin) { 192 // optional optimization: concatenate appending ranges? - may not be worthwhile. 193 if (hi == (ranges[i].begin - 1)) { 194 ranges[i].begin = lo; 195 return; 196 } 197 ranges.insert(i, CharacterRange(lo, hi)); 198 return; 199 } 200 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining 201 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the 202 // end of the last range they concatenate, which is just as good. 203 if (lo <= (ranges[i].end + 1)) { 204 // found an intersect! we'll replace this entry in the array. 205 ranges[i].begin = std::min(ranges[i].begin, lo); 206 ranges[i].end = std::max(ranges[i].end, hi); 207 208 // now check if the new range can subsume any subsequent ranges. 209 unsigned next = i+1; 210 // each iteration of the loop we will either remove something from the list, or break the loop. 211 while (next < ranges.size()) { 212 if (ranges[next].begin <= (ranges[i].end + 1)) { 213 // the next entry now overlaps / concatenates this one. 214 ranges[i].end = std::max(ranges[i].end, ranges[next].end); 215 ranges.remove(next); 216 } else 217 break; 218 } 219 220 return; 221 } 222 } 223 224 // CharacterRange comes after all existing ranges. 225 ranges.append(CharacterRange(lo, hi)); 226 } 227 228 bool m_isCaseInsensitive; 229 230 Vector<UChar> m_matches; 231 Vector<CharacterRange> m_ranges; 232 Vector<UChar> m_matchesUnicode; 233 Vector<CharacterRange> m_rangesUnicode; 234 }; 235 236 237 CharacterClass* newlineCreate() 238 { 239 CharacterClass* characterClass = new CharacterClass(); 240 241 characterClass->m_matches.append('\n'); 242 characterClass->m_matches.append('\r'); 243 characterClass->m_matchesUnicode.append(0x2028); 244 characterClass->m_matchesUnicode.append(0x2029); 245 246 return characterClass; 247 } 248 249 CharacterClass* digitsCreate() 250 { 251 CharacterClass* characterClass = new CharacterClass(); 252 253 characterClass->m_ranges.append(CharacterRange('0', '9')); 254 255 return characterClass; 256 } 257 258 CharacterClass* spacesCreate() 259 { 260 CharacterClass* characterClass = new CharacterClass(); 261 262 characterClass->m_matches.append(' '); 263 characterClass->m_ranges.append(CharacterRange('\t', '\r')); 264 characterClass->m_matchesUnicode.append(0x00a0); 265 characterClass->m_matchesUnicode.append(0x1680); 266 characterClass->m_matchesUnicode.append(0x180e); 267 characterClass->m_matchesUnicode.append(0x2028); 268 characterClass->m_matchesUnicode.append(0x2029); 269 characterClass->m_matchesUnicode.append(0x202f); 270 characterClass->m_matchesUnicode.append(0x205f); 271 characterClass->m_matchesUnicode.append(0x3000); 272 characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a)); 273 274 return characterClass; 275 } 276 277 CharacterClass* wordcharCreate() 278 { 279 CharacterClass* characterClass = new CharacterClass(); 280 281 characterClass->m_matches.append('_'); 282 characterClass->m_ranges.append(CharacterRange('0', '9')); 283 characterClass->m_ranges.append(CharacterRange('A', 'Z')); 284 characterClass->m_ranges.append(CharacterRange('a', 'z')); 285 286 return characterClass; 287 } 288 289 CharacterClass* nondigitsCreate() 290 { 291 CharacterClass* characterClass = new CharacterClass(); 292 293 characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); 294 characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f)); 295 characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); 296 297 return characterClass; 298 } 299 300 CharacterClass* nonspacesCreate() 301 { 302 CharacterClass* characterClass = new CharacterClass(); 303 304 characterClass->m_ranges.append(CharacterRange(0, '\t' - 1)); 305 characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1)); 306 characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f)); 307 characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f)); 308 characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f)); 309 characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d)); 310 characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff)); 311 characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027)); 312 characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e)); 313 characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e)); 314 characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff)); 315 characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff)); 316 317 return characterClass; 318 } 319 320 CharacterClass* nonwordcharCreate() 321 { 322 CharacterClass* characterClass = new CharacterClass(); 323 324 characterClass->m_matches.append('`'); 325 characterClass->m_ranges.append(CharacterRange(0, '0' - 1)); 326 characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1)); 327 characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1)); 328 characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f)); 329 characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff)); 330 331 return characterClass; 332 } 333 334 335 class RegexPatternConstructor { 336 public: 337 RegexPatternConstructor(RegexPattern& pattern) 338 : m_pattern(pattern) 339 , m_characterClassConstructor(pattern.m_ignoreCase) 340 { 341 } 342 343 ~RegexPatternConstructor() 344 { 345 } 346 347 void reset() 348 { 349 m_pattern.reset(); 350 m_characterClassConstructor.reset(); 351 } 352 353 void assertionBOL() 354 { 355 m_alternative->m_terms.append(PatternTerm::BOL()); 356 } 357 void assertionEOL() 358 { 359 m_alternative->m_terms.append(PatternTerm::EOL()); 360 } 361 void assertionWordBoundary(bool invert) 362 { 363 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); 364 } 365 366 void atomPatternCharacter(UChar ch) 367 { 368 // We handle case-insensitive checking of unicode characters which do have both 369 // cases by handling them as if they were defined using a CharacterClass. 370 if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { 371 atomCharacterClassBegin(); 372 atomCharacterClassAtom(ch); 373 atomCharacterClassEnd(); 374 } else 375 m_alternative->m_terms.append(PatternTerm(ch)); 376 } 377 378 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 379 { 380 switch (classID) { 381 case DigitClassID: 382 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); 383 break; 384 case SpaceClassID: 385 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); 386 break; 387 case WordClassID: 388 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); 389 break; 390 case NewlineClassID: 391 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); 392 break; 393 } 394 } 395 396 void atomCharacterClassBegin(bool invert = false) 397 { 398 m_invertCharacterClass = invert; 399 } 400 401 void atomCharacterClassAtom(UChar ch) 402 { 403 m_characterClassConstructor.putChar(ch); 404 } 405 406 void atomCharacterClassRange(UChar begin, UChar end) 407 { 408 m_characterClassConstructor.putRange(begin, end); 409 } 410 411 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 412 { 413 ASSERT(classID != NewlineClassID); 414 415 switch (classID) { 416 case DigitClassID: 417 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); 418 break; 419 420 case SpaceClassID: 421 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); 422 break; 423 424 case WordClassID: 425 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); 426 break; 427 428 default: 429 ASSERT_NOT_REACHED(); 430 } 431 } 432 433 void atomCharacterClassEnd() 434 { 435 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); 436 m_pattern.m_userCharacterClasses.append(newCharacterClass); 437 m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); 438 } 439 440 void atomParenthesesSubpatternBegin(bool capture = true) 441 { 442 unsigned subpatternId = m_pattern.m_numSubpatterns + 1; 443 if (capture) 444 m_pattern.m_numSubpatterns++; 445 446 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); 447 m_pattern.m_disjunctions.append(parenthesesDisjunction); 448 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); 449 m_alternative = parenthesesDisjunction->addNewAlternative(); 450 } 451 452 void atomParentheticalAssertionBegin(bool invert = false) 453 { 454 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); 455 m_pattern.m_disjunctions.append(parenthesesDisjunction); 456 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); 457 m_alternative = parenthesesDisjunction->addNewAlternative(); 458 } 459 460 void atomParenthesesEnd() 461 { 462 ASSERT(m_alternative->m_parent); 463 ASSERT(m_alternative->m_parent->m_parent); 464 m_alternative = m_alternative->m_parent->m_parent; 465 466 m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; 467 } 468 469 void atomBackReference(unsigned subpatternId) 470 { 471 ASSERT(subpatternId); 472 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); 473 474 if (subpatternId > m_pattern.m_numSubpatterns) { 475 m_alternative->m_terms.append(PatternTerm::ForwardReference()); 476 return; 477 } 478 479 PatternAlternative* currentAlternative = m_alternative; 480 ASSERT(currentAlternative); 481 482 // Note to self: if we waited until the AST was baked, we could also remove forwards refs 483 while ((currentAlternative = currentAlternative->m_parent->m_parent)) { 484 PatternTerm& term = currentAlternative->lastTerm(); 485 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); 486 487 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { 488 m_alternative->m_terms.append(PatternTerm::ForwardReference()); 489 return; 490 } 491 } 492 493 m_alternative->m_terms.append(PatternTerm(subpatternId)); 494 } 495 496 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) 497 { 498 PatternDisjunction* newDisjunction = new PatternDisjunction(); 499 500 newDisjunction->m_parent = disjunction->m_parent; 501 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 502 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 503 PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); 504 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) 505 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i])); 506 } 507 508 m_pattern.m_disjunctions.append(newDisjunction); 509 return newDisjunction; 510 } 511 512 PatternTerm copyTerm(PatternTerm& term) 513 { 514 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) 515 return PatternTerm(term); 516 517 PatternTerm termCopy = term; 518 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction); 519 return termCopy; 520 } 521 522 void quantifyAtom(unsigned min, unsigned max, bool greedy) 523 { 524 ASSERT(min <= max); 525 ASSERT(m_alternative->m_terms.size()); 526 527 if (!max) { 528 m_alternative->removeLastTerm(); 529 return; 530 } 531 532 PatternTerm& term = m_alternative->lastTerm(); 533 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); 534 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); 535 536 // For any assertion with a zero minimum, not matching is valid and has no effect, 537 // remove it. Otherwise, we need to match as least once, but there is no point 538 // matching more than once, so remove the quantifier. It is not entirely clear 539 // from the spec whether or not this behavior is correct, but I believe this 540 // matches Firefox. :-/ 541 if (term.type == PatternTerm::TypeParentheticalAssertion) { 542 if (!min) 543 m_alternative->removeLastTerm(); 544 return; 545 } 546 547 if (min == 0) 548 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); 549 else if (min == max) 550 term.quantify(min, QuantifierFixedCount); 551 else { 552 term.quantify(min, QuantifierFixedCount); 553 m_alternative->m_terms.append(copyTerm(term)); 554 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... 555 m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); 556 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) 557 m_alternative->lastTerm().parentheses.isCopy = true; 558 } 559 } 560 561 void disjunction() 562 { 563 m_alternative = m_alternative->m_parent->addNewAlternative(); 564 } 565 566 void regexBegin() 567 { 568 m_pattern.m_body = new PatternDisjunction(); 569 m_alternative = m_pattern.m_body->addNewAlternative(); 570 m_pattern.m_disjunctions.append(m_pattern.m_body); 571 } 572 void regexEnd() 573 { 574 } 575 void regexError() 576 { 577 } 578 579 unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) 580 { 581 alternative->m_hasFixedSize = true; 582 unsigned currentInputPosition = initialInputPosition; 583 584 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { 585 PatternTerm& term = alternative->m_terms[i]; 586 587 switch (term.type) { 588 case PatternTerm::TypeAssertionBOL: 589 case PatternTerm::TypeAssertionEOL: 590 case PatternTerm::TypeAssertionWordBoundary: 591 term.inputPosition = currentInputPosition; 592 break; 593 594 case PatternTerm::TypeBackReference: 595 term.inputPosition = currentInputPosition; 596 term.frameLocation = currentCallFrameSize; 597 currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; 598 alternative->m_hasFixedSize = false; 599 break; 600 601 case PatternTerm::TypeForwardReference: 602 break; 603 604 case PatternTerm::TypePatternCharacter: 605 term.inputPosition = currentInputPosition; 606 if (term.quantityType != QuantifierFixedCount) { 607 term.frameLocation = currentCallFrameSize; 608 currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; 609 alternative->m_hasFixedSize = false; 610 } else 611 currentInputPosition += term.quantityCount; 612 break; 613 614 case PatternTerm::TypeCharacterClass: 615 term.inputPosition = currentInputPosition; 616 if (term.quantityType != QuantifierFixedCount) { 617 term.frameLocation = currentCallFrameSize; 618 currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; 619 alternative->m_hasFixedSize = false; 620 } else 621 currentInputPosition += term.quantityCount; 622 break; 623 624 case PatternTerm::TypeParenthesesSubpattern: 625 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. 626 term.frameLocation = currentCallFrameSize; 627 if ((term.quantityCount == 1) && !term.parentheses.isCopy) { 628 if (term.quantityType == QuantifierFixedCount) { 629 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); 630 currentInputPosition += term.parentheses.disjunction->m_minimumSize; 631 } else { 632 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; 633 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); 634 } 635 term.inputPosition = currentInputPosition; 636 } else { 637 term.inputPosition = currentInputPosition; 638 setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); 639 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; 640 } 641 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. 642 alternative->m_hasFixedSize = false; 643 break; 644 645 case PatternTerm::TypeParentheticalAssertion: 646 term.inputPosition = currentInputPosition; 647 term.frameLocation = currentCallFrameSize; 648 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); 649 break; 650 } 651 } 652 653 alternative->m_minimumSize = currentInputPosition - initialInputPosition; 654 return currentCallFrameSize; 655 } 656 657 unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) 658 { 659 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) 660 initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; 661 662 unsigned minimumInputSize = UINT_MAX; 663 unsigned maximumCallFrameSize = 0; 664 bool hasFixedSize = true; 665 666 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 667 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 668 unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); 669 minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); 670 maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); 671 hasFixedSize &= alternative->m_hasFixedSize; 672 } 673 674 ASSERT(minimumInputSize != UINT_MAX); 675 ASSERT(maximumCallFrameSize >= initialCallFrameSize); 676 677 disjunction->m_hasFixedSize = hasFixedSize; 678 disjunction->m_minimumSize = minimumInputSize; 679 disjunction->m_callFrameSize = maximumCallFrameSize; 680 return maximumCallFrameSize; 681 } 682 683 void setupOffsets() 684 { 685 setupDisjunctionOffsets(m_pattern.m_body, 0, 0); 686 } 687 688 private: 689 RegexPattern& m_pattern; 690 PatternAlternative* m_alternative; 691 CharacterClassConstructor m_characterClassConstructor; 692 bool m_invertCharacterClass; 693 }; 694 695 696 const char* compileRegex(const UString& patternString, RegexPattern& pattern) 697 { 698 RegexPatternConstructor constructor(pattern); 699 700 if (const char* error = parse(constructor, patternString)) 701 return error; 702 703 // If the pattern contains illegal backreferences reset & reparse. 704 // Quoting Netscape's "What's new in JavaScript 1.2", 705 // "Note: if the number of left parentheses is less than the number specified 706 // in \#, the \# is taken as an octal escape as described in the next row." 707 if (pattern.containsIllegalBackReference()) { 708 unsigned numSubpatterns = pattern.m_numSubpatterns; 709 710 constructor.reset(); 711 #if !ASSERT_DISABLED 712 const char* error = 713 #endif 714 parse(constructor, patternString, numSubpatterns); 715 716 ASSERT(!error); 717 ASSERT(numSubpatterns == pattern.m_numSubpatterns); 718 } 719 720 constructor.setupOffsets(); 721 722 return false; 723 }; 724 725 726 } } 727 728 #endif 729