1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga (pvarga (at) inf.u-szeged.hu), University of Szeged 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "YarrPattern.h" 29 30 #include "Yarr.h" 31 #include "YarrParser.h" 32 #include <wtf/Vector.h> 33 34 using namespace WTF; 35 36 namespace JSC { namespace Yarr { 37 38 #include "RegExpJitTables.h" 39 40 class CharacterClassConstructor { 41 public: 42 CharacterClassConstructor(bool isCaseInsensitive = false) 43 : m_isCaseInsensitive(isCaseInsensitive) 44 { 45 } 46 47 void reset() 48 { 49 m_matches.clear(); 50 m_ranges.clear(); 51 m_matchesUnicode.clear(); 52 m_rangesUnicode.clear(); 53 } 54 55 void append(const CharacterClass* other) 56 { 57 for (size_t i = 0; i < other->m_matches.size(); ++i) 58 addSorted(m_matches, other->m_matches[i]); 59 for (size_t i = 0; i < other->m_ranges.size(); ++i) 60 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); 61 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) 62 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); 63 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) 64 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); 65 } 66 67 void putChar(UChar ch) 68 { 69 if (ch <= 0x7f) { 70 if (m_isCaseInsensitive && isASCIIAlpha(ch)) { 71 addSorted(m_matches, toASCIIUpper(ch)); 72 addSorted(m_matches, toASCIILower(ch)); 73 } else 74 addSorted(m_matches, ch); 75 } else { 76 UChar upper, lower; 77 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { 78 addSorted(m_matchesUnicode, upper); 79 addSorted(m_matchesUnicode, lower); 80 } else 81 addSorted(m_matchesUnicode, ch); 82 } 83 } 84 85 // returns true if this character has another case, and 'ch' is the upper case form. 86 static inline bool isUnicodeUpper(UChar ch) 87 { 88 return ch != Unicode::toLower(ch); 89 } 90 91 // returns true if this character has another case, and 'ch' is the lower case form. 92 static inline bool isUnicodeLower(UChar ch) 93 { 94 return ch != Unicode::toUpper(ch); 95 } 96 97 void putRange(UChar lo, UChar hi) 98 { 99 if (lo <= 0x7f) { 100 char asciiLo = lo; 101 char asciiHi = std::min(hi, (UChar)0x7f); 102 addSortedRange(m_ranges, lo, asciiHi); 103 104 if (m_isCaseInsensitive) { 105 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) 106 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); 107 if ((asciiLo <= 'z') && (asciiHi >= 'a')) 108 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); 109 } 110 } 111 if (hi >= 0x80) { 112 uint32_t unicodeCurr = std::max(lo, (UChar)0x80); 113 addSortedRange(m_rangesUnicode, unicodeCurr, hi); 114 115 if (m_isCaseInsensitive) { 116 while (unicodeCurr <= hi) { 117 // If the upper bound of the range (hi) is 0xffff, the increments to 118 // unicodeCurr in this loop may take it to 0x10000. This is fine 119 // (if so we won't re-enter the loop, since the loop condition above 120 // will definitely fail) - but this does mean we cannot use a UChar 121 // to represent unicodeCurr, we must use a 32-bit value instead. 122 ASSERT(unicodeCurr <= 0xffff); 123 124 if (isUnicodeUpper(unicodeCurr)) { 125 UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); 126 UChar lowerCaseRangeEnd = lowerCaseRangeBegin; 127 while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) 128 lowerCaseRangeEnd++; 129 addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); 130 } else if (isUnicodeLower(unicodeCurr)) { 131 UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); 132 UChar upperCaseRangeEnd = upperCaseRangeBegin; 133 while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) 134 upperCaseRangeEnd++; 135 addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); 136 } else 137 ++unicodeCurr; 138 } 139 } 140 } 141 } 142 143 CharacterClass* charClass() 144 { 145 CharacterClass* characterClass = new CharacterClass(0); 146 147 characterClass->m_matches.append(m_matches); 148 characterClass->m_ranges.append(m_ranges); 149 characterClass->m_matchesUnicode.append(m_matchesUnicode); 150 characterClass->m_rangesUnicode.append(m_rangesUnicode); 151 152 reset(); 153 154 return characterClass; 155 } 156 157 private: 158 void addSorted(Vector<UChar>& matches, UChar ch) 159 { 160 unsigned pos = 0; 161 unsigned range = matches.size(); 162 163 // binary chop, find position to insert char. 164 while (range) { 165 unsigned index = range >> 1; 166 167 int val = matches[pos+index] - ch; 168 if (!val) 169 return; 170 else if (val > 0) 171 range = index; 172 else { 173 pos += (index+1); 174 range -= (index+1); 175 } 176 } 177 178 if (pos == matches.size()) 179 matches.append(ch); 180 else 181 matches.insert(pos, ch); 182 } 183 184 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) 185 { 186 unsigned end = ranges.size(); 187 188 // Simple linear scan - I doubt there are that many ranges anyway... 189 // feel free to fix this with something faster (eg binary chop). 190 for (unsigned i = 0; i < end; ++i) { 191 // does the new range fall before the current position in the array 192 if (hi < ranges[i].begin) { 193 // optional optimization: concatenate appending ranges? - may not be worthwhile. 194 if (hi == (ranges[i].begin - 1)) { 195 ranges[i].begin = lo; 196 return; 197 } 198 ranges.insert(i, CharacterRange(lo, hi)); 199 return; 200 } 201 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining 202 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the 203 // end of the last range they concatenate, which is just as good. 204 if (lo <= (ranges[i].end + 1)) { 205 // found an intersect! we'll replace this entry in the array. 206 ranges[i].begin = std::min(ranges[i].begin, lo); 207 ranges[i].end = std::max(ranges[i].end, hi); 208 209 // now check if the new range can subsume any subsequent ranges. 210 unsigned next = i+1; 211 // each iteration of the loop we will either remove something from the list, or break the loop. 212 while (next < ranges.size()) { 213 if (ranges[next].begin <= (ranges[i].end + 1)) { 214 // the next entry now overlaps / concatenates this one. 215 ranges[i].end = std::max(ranges[i].end, ranges[next].end); 216 ranges.remove(next); 217 } else 218 break; 219 } 220 221 return; 222 } 223 } 224 225 // CharacterRange comes after all existing ranges. 226 ranges.append(CharacterRange(lo, hi)); 227 } 228 229 bool m_isCaseInsensitive; 230 231 Vector<UChar> m_matches; 232 Vector<CharacterRange> m_ranges; 233 Vector<UChar> m_matchesUnicode; 234 Vector<CharacterRange> m_rangesUnicode; 235 }; 236 237 struct BeginCharHelper { 238 BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false) 239 : m_beginChars(beginChars) 240 , m_isCaseInsensitive(isCaseInsensitive) 241 {} 242 243 void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount) 244 { 245 if (quantityType == QuantifierFixedCount && quantityCount > 1) { 246 // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/ 247 beginChar.value |= beginChar.value << 16; 248 beginChar.mask |= beginChar.mask << 16; 249 addCharacter(beginChar); 250 } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size()) 251 // In case of characters with fixed quantifier we should check the next character as well. 252 linkHotTerms(beginChar, hotTerms); 253 else 254 // In case of greedy matching the next character checking is unnecessary therefore we just store 255 // the first character. 256 addCharacter(beginChar); 257 } 258 259 // Merge two following BeginChars in the vector to reduce the number of character checks. 260 void merge(unsigned size) 261 { 262 for (unsigned i = 0; i < size; i++) { 263 BeginChar* curr = &m_beginChars->at(i); 264 BeginChar* next = &m_beginChars->at(i + 1); 265 266 // If the current and the next size of value is different we should skip the merge process 267 // because the 16bit and 32bit values are unmergable. 268 if (curr->value <= 0xFFFF && next->value > 0xFFFF) 269 continue; 270 271 unsigned diff = curr->value ^ next->value; 272 273 curr->mask |= diff; 274 curr->value |= curr->mask; 275 276 m_beginChars->remove(i + 1); 277 size--; 278 } 279 } 280 281 private: 282 void addCharacter(BeginChar beginChar) 283 { 284 unsigned pos = 0; 285 unsigned range = m_beginChars->size(); 286 287 // binary chop, find position to insert char. 288 while (range) { 289 unsigned index = range >> 1; 290 291 int val = m_beginChars->at(pos+index).value - beginChar.value; 292 if (!val) 293 return; 294 if (val < 0) 295 range = index; 296 else { 297 pos += (index+1); 298 range -= (index+1); 299 } 300 } 301 302 if (pos == m_beginChars->size()) 303 m_beginChars->append(beginChar); 304 else 305 m_beginChars->insert(pos, beginChar); 306 } 307 308 // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object. 309 void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms) 310 { 311 for (unsigned i = 0; i < hotTerms->size(); i++) { 312 PatternTerm hotTerm = hotTerms->at(i).term; 313 ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter); 314 315 UChar characterNext = hotTerm.patternCharacter; 316 317 // Append a character to an existing BeginChar object. 318 if (characterNext <= 0x7f) { 319 unsigned mask = 0; 320 321 if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) { 322 mask = 32; 323 characterNext = toASCIILower(characterNext); 324 } 325 326 addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16))); 327 } else { 328 UChar upper, lower; 329 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) { 330 addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask)); 331 addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask)); 332 } else 333 addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask)); 334 } 335 } 336 } 337 338 Vector<BeginChar>* m_beginChars; 339 bool m_isCaseInsensitive; 340 }; 341 342 class YarrPatternConstructor { 343 public: 344 YarrPatternConstructor(YarrPattern& pattern) 345 : m_pattern(pattern) 346 , m_characterClassConstructor(pattern.m_ignoreCase) 347 , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase) 348 , m_invertParentheticalAssertion(false) 349 { 350 m_pattern.m_body = new PatternDisjunction(); 351 m_alternative = m_pattern.m_body->addNewAlternative(); 352 m_pattern.m_disjunctions.append(m_pattern.m_body); 353 } 354 355 ~YarrPatternConstructor() 356 { 357 } 358 359 void reset() 360 { 361 m_pattern.reset(); 362 m_characterClassConstructor.reset(); 363 364 m_pattern.m_body = new PatternDisjunction(); 365 m_alternative = m_pattern.m_body->addNewAlternative(); 366 m_pattern.m_disjunctions.append(m_pattern.m_body); 367 } 368 369 void assertionBOL() 370 { 371 if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { 372 m_alternative->m_startsWithBOL = true; 373 m_alternative->m_containsBOL = true; 374 m_pattern.m_containsBOL = true; 375 } 376 m_alternative->m_terms.append(PatternTerm::BOL()); 377 } 378 void assertionEOL() 379 { 380 m_alternative->m_terms.append(PatternTerm::EOL()); 381 } 382 void assertionWordBoundary(bool invert) 383 { 384 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); 385 } 386 387 void atomPatternCharacter(UChar ch) 388 { 389 // We handle case-insensitive checking of unicode characters which do have both 390 // cases by handling them as if they were defined using a CharacterClass. 391 if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { 392 atomCharacterClassBegin(); 393 atomCharacterClassAtom(ch); 394 atomCharacterClassEnd(); 395 } else 396 m_alternative->m_terms.append(PatternTerm(ch)); 397 } 398 399 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 400 { 401 switch (classID) { 402 case DigitClassID: 403 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); 404 break; 405 case SpaceClassID: 406 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); 407 break; 408 case WordClassID: 409 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); 410 break; 411 case NewlineClassID: 412 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); 413 break; 414 } 415 } 416 417 void atomCharacterClassBegin(bool invert = false) 418 { 419 m_invertCharacterClass = invert; 420 } 421 422 void atomCharacterClassAtom(UChar ch) 423 { 424 m_characterClassConstructor.putChar(ch); 425 } 426 427 void atomCharacterClassRange(UChar begin, UChar end) 428 { 429 m_characterClassConstructor.putRange(begin, end); 430 } 431 432 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 433 { 434 ASSERT(classID != NewlineClassID); 435 436 switch (classID) { 437 case DigitClassID: 438 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); 439 break; 440 441 case SpaceClassID: 442 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); 443 break; 444 445 case WordClassID: 446 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); 447 break; 448 449 default: 450 ASSERT_NOT_REACHED(); 451 } 452 } 453 454 void atomCharacterClassEnd() 455 { 456 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); 457 m_pattern.m_userCharacterClasses.append(newCharacterClass); 458 m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); 459 } 460 461 void atomParenthesesSubpatternBegin(bool capture = true) 462 { 463 unsigned subpatternId = m_pattern.m_numSubpatterns + 1; 464 if (capture) 465 m_pattern.m_numSubpatterns++; 466 467 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); 468 m_pattern.m_disjunctions.append(parenthesesDisjunction); 469 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); 470 m_alternative = parenthesesDisjunction->addNewAlternative(); 471 } 472 473 void atomParentheticalAssertionBegin(bool invert = false) 474 { 475 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); 476 m_pattern.m_disjunctions.append(parenthesesDisjunction); 477 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); 478 m_alternative = parenthesesDisjunction->addNewAlternative(); 479 m_invertParentheticalAssertion = invert; 480 } 481 482 void atomParenthesesEnd() 483 { 484 ASSERT(m_alternative->m_parent); 485 ASSERT(m_alternative->m_parent->m_parent); 486 487 PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; 488 m_alternative = m_alternative->m_parent->m_parent; 489 490 PatternTerm& lastTerm = m_alternative->lastTerm(); 491 492 unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); 493 unsigned numBOLAnchoredAlts = 0; 494 bool containsEmptyAlternative = false; 495 496 for (unsigned i = 0; i < numParenAlternatives; i++) { 497 if (!parenthesesDisjunction->m_alternatives[i]->m_terms.size() && numParenAlternatives > 1) { 498 PatternAlternative* altToRemove = parenthesesDisjunction->m_alternatives[i]; 499 parenthesesDisjunction->m_alternatives.remove(i); 500 delete altToRemove; 501 --numParenAlternatives; 502 503 containsEmptyAlternative = true; 504 continue; 505 } 506 507 // Bubble up BOL flags 508 if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) 509 numBOLAnchoredAlts++; 510 } 511 512 if (numBOLAnchoredAlts) { 513 m_alternative->m_containsBOL = true; 514 // If all the alternatives in parens start with BOL, then so does this one 515 if (numBOLAnchoredAlts == numParenAlternatives) 516 m_alternative->m_startsWithBOL = true; 517 } 518 519 lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; 520 m_invertParentheticalAssertion = false; 521 522 if (containsEmptyAlternative) { 523 // Backup and remove the current disjunction's alternatives. 524 Vector<PatternAlternative*> alternatives; 525 alternatives.append(parenthesesDisjunction->m_alternatives); 526 parenthesesDisjunction->m_alternatives.clear(); 527 PatternAlternative* alternative = parenthesesDisjunction->addNewAlternative(); 528 529 // Insert a new non-capturing parentheses. 530 unsigned subpatternId = m_pattern.m_numSubpatterns + 1; 531 PatternDisjunction* newDisjunction = new PatternDisjunction(alternative); 532 m_pattern.m_disjunctions.append(newDisjunction); 533 alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, newDisjunction, false, false)); 534 newDisjunction->m_alternatives.append(alternatives); 535 536 // Set the quantifier of the new parentheses to '?' and set the inherited properties. 537 PatternTerm& disjunctionTerm = alternative->lastTerm(); 538 disjunctionTerm.quantify(1, QuantifierGreedy); 539 disjunctionTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; 540 alternative->m_containsBOL = m_alternative->m_containsBOL; 541 alternative->m_startsWithBOL = m_alternative->m_startsWithBOL; 542 } 543 } 544 545 void atomBackReference(unsigned subpatternId) 546 { 547 ASSERT(subpatternId); 548 m_pattern.m_containsBackreferences = true; 549 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); 550 551 if (subpatternId > m_pattern.m_numSubpatterns) { 552 m_alternative->m_terms.append(PatternTerm::ForwardReference()); 553 return; 554 } 555 556 PatternAlternative* currentAlternative = m_alternative; 557 ASSERT(currentAlternative); 558 559 // Note to self: if we waited until the AST was baked, we could also remove forwards refs 560 while ((currentAlternative = currentAlternative->m_parent->m_parent)) { 561 PatternTerm& term = currentAlternative->lastTerm(); 562 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); 563 564 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { 565 m_alternative->m_terms.append(PatternTerm::ForwardReference()); 566 return; 567 } 568 } 569 570 m_alternative->m_terms.append(PatternTerm(subpatternId)); 571 } 572 573 // deep copy the argument disjunction. If filterStartsWithBOL is true, 574 // skip alternatives with m_startsWithBOL set true. 575 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) 576 { 577 PatternDisjunction* newDisjunction = 0; 578 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 579 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 580 if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { 581 if (!newDisjunction) { 582 newDisjunction = new PatternDisjunction(); 583 newDisjunction->m_parent = disjunction->m_parent; 584 } 585 PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); 586 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) 587 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); 588 } 589 } 590 591 if (newDisjunction) 592 m_pattern.m_disjunctions.append(newDisjunction); 593 return newDisjunction; 594 } 595 596 PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) 597 { 598 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) 599 return PatternTerm(term); 600 601 PatternTerm termCopy = term; 602 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); 603 return termCopy; 604 } 605 606 void quantifyAtom(unsigned min, unsigned max, bool greedy) 607 { 608 ASSERT(min <= max); 609 ASSERT(m_alternative->m_terms.size()); 610 611 if (!max) { 612 m_alternative->removeLastTerm(); 613 return; 614 } 615 616 PatternTerm& term = m_alternative->lastTerm(); 617 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); 618 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); 619 620 // For any assertion with a zero minimum, not matching is valid and has no effect, 621 // remove it. Otherwise, we need to match as least once, but there is no point 622 // matching more than once, so remove the quantifier. It is not entirely clear 623 // from the spec whether or not this behavior is correct, but I believe this 624 // matches Firefox. :-/ 625 if (term.type == PatternTerm::TypeParentheticalAssertion) { 626 if (!min) 627 m_alternative->removeLastTerm(); 628 return; 629 } 630 631 if (min == 0) 632 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); 633 else if (min == max) 634 term.quantify(min, QuantifierFixedCount); 635 else { 636 term.quantify(min, QuantifierFixedCount); 637 m_alternative->m_terms.append(copyTerm(term)); 638 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... 639 m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); 640 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) 641 m_alternative->lastTerm().parentheses.isCopy = true; 642 } 643 } 644 645 void disjunction() 646 { 647 m_alternative = m_alternative->m_parent->addNewAlternative(); 648 } 649 650 unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) 651 { 652 alternative->m_hasFixedSize = true; 653 unsigned currentInputPosition = initialInputPosition; 654 655 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { 656 PatternTerm& term = alternative->m_terms[i]; 657 658 switch (term.type) { 659 case PatternTerm::TypeAssertionBOL: 660 case PatternTerm::TypeAssertionEOL: 661 case PatternTerm::TypeAssertionWordBoundary: 662 term.inputPosition = currentInputPosition; 663 break; 664 665 case PatternTerm::TypeBackReference: 666 term.inputPosition = currentInputPosition; 667 term.frameLocation = currentCallFrameSize; 668 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; 669 alternative->m_hasFixedSize = false; 670 break; 671 672 case PatternTerm::TypeForwardReference: 673 break; 674 675 case PatternTerm::TypePatternCharacter: 676 term.inputPosition = currentInputPosition; 677 if (term.quantityType != QuantifierFixedCount) { 678 term.frameLocation = currentCallFrameSize; 679 currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; 680 alternative->m_hasFixedSize = false; 681 } else 682 currentInputPosition += term.quantityCount; 683 break; 684 685 case PatternTerm::TypeCharacterClass: 686 term.inputPosition = currentInputPosition; 687 if (term.quantityType != QuantifierFixedCount) { 688 term.frameLocation = currentCallFrameSize; 689 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; 690 alternative->m_hasFixedSize = false; 691 } else 692 currentInputPosition += term.quantityCount; 693 break; 694 695 case PatternTerm::TypeParenthesesSubpattern: 696 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. 697 term.frameLocation = currentCallFrameSize; 698 if (term.quantityCount == 1 && !term.parentheses.isCopy) { 699 if (term.quantityType != QuantifierFixedCount) 700 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; 701 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); 702 // If quantity is fixed, then pre-check its minimum size. 703 if (term.quantityType == QuantifierFixedCount) 704 currentInputPosition += term.parentheses.disjunction->m_minimumSize; 705 term.inputPosition = currentInputPosition; 706 } else if (term.parentheses.isTerminal) { 707 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; 708 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); 709 term.inputPosition = currentInputPosition; 710 } else { 711 term.inputPosition = currentInputPosition; 712 setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); 713 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; 714 } 715 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. 716 alternative->m_hasFixedSize = false; 717 break; 718 719 case PatternTerm::TypeParentheticalAssertion: 720 term.inputPosition = currentInputPosition; 721 term.frameLocation = currentCallFrameSize; 722 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); 723 break; 724 } 725 } 726 727 alternative->m_minimumSize = currentInputPosition - initialInputPosition; 728 return currentCallFrameSize; 729 } 730 731 unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) 732 { 733 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) 734 initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; 735 736 unsigned minimumInputSize = UINT_MAX; 737 unsigned maximumCallFrameSize = 0; 738 bool hasFixedSize = true; 739 740 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 741 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 742 unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); 743 minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); 744 maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); 745 hasFixedSize &= alternative->m_hasFixedSize; 746 } 747 748 ASSERT(minimumInputSize != UINT_MAX); 749 ASSERT(maximumCallFrameSize >= initialCallFrameSize); 750 751 disjunction->m_hasFixedSize = hasFixedSize; 752 disjunction->m_minimumSize = minimumInputSize; 753 disjunction->m_callFrameSize = maximumCallFrameSize; 754 return maximumCallFrameSize; 755 } 756 757 void setupOffsets() 758 { 759 setupDisjunctionOffsets(m_pattern.m_body, 0, 0); 760 } 761 762 // This optimization identifies sets of parentheses that we will never need to backtrack. 763 // In these cases we do not need to store state from prior iterations. 764 // We can presently avoid backtracking for: 765 // * where the parens are at the end of the regular expression (last term in any of the 766 // alternatives of the main body disjunction). 767 // * where the parens are non-capturing, and quantified unbounded greedy (*). 768 // * where the parens do not contain any capturing subpatterns. 769 void checkForTerminalParentheses() 770 { 771 // This check is much too crude; should be just checking whether the candidate 772 // node contains nested capturing subpatterns, not the whole expression! 773 if (m_pattern.m_numSubpatterns) 774 return; 775 776 Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; 777 for (size_t i = 0; i < alternatives.size(); ++i) { 778 Vector<PatternTerm>& terms = alternatives[i]->m_terms; 779 if (terms.size()) { 780 PatternTerm& term = terms.last(); 781 if (term.type == PatternTerm::TypeParenthesesSubpattern 782 && term.quantityType == QuantifierGreedy 783 && term.quantityCount == quantifyInfinite 784 && !term.capture()) 785 term.parentheses.isTerminal = true; 786 } 787 } 788 } 789 790 void optimizeBOL() 791 { 792 // Look for expressions containing beginning of line (^) anchoring and unroll them. 793 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops 794 // This code relies on the parsing code tagging alternatives with m_containsBOL and 795 // m_startsWithBOL and rolling those up to containing alternatives. 796 // At this point, this is only valid for non-multiline expressions. 797 PatternDisjunction* disjunction = m_pattern.m_body; 798 799 if (!m_pattern.m_containsBOL || m_pattern.m_multiline) 800 return; 801 802 PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); 803 804 // Set alternatives in disjunction to "onceThrough" 805 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) 806 disjunction->m_alternatives[alt]->setOnceThrough(); 807 808 if (loopDisjunction) { 809 // Move alternatives from loopDisjunction to disjunction 810 for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) 811 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); 812 813 loopDisjunction->m_alternatives.clear(); 814 } 815 } 816 817 // This function collects the terms which are potentially matching the first number of depth characters in the result. 818 // If this function returns false then it found at least one term which makes the beginning character 819 // look-up optimization inefficient. 820 bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth) 821 { 822 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 823 PatternAlternative* alternative = disjunction->m_alternatives[alt]; 824 825 if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth)) 826 return false; 827 } 828 829 return true; 830 } 831 832 bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth) 833 { 834 bool checkNext = true; 835 unsigned numTerms = alternative->m_terms.size(); 836 837 while (checkNext && termIndex < numTerms) { 838 PatternTerm term = alternative->m_terms[termIndex]; 839 checkNext = false; 840 841 switch (term.type) { 842 case PatternTerm::TypeAssertionBOL: 843 case PatternTerm::TypeAssertionEOL: 844 case PatternTerm::TypeAssertionWordBoundary: 845 return false; 846 847 case PatternTerm::TypeBackReference: 848 case PatternTerm::TypeForwardReference: 849 return false; 850 851 case PatternTerm::TypePatternCharacter: 852 if (termIndex != numTerms - 1) { 853 beginTerms->append(TermChain(term)); 854 termIndex++; 855 checkNext = true; 856 } else if (term.quantityType == QuantifierFixedCount) { 857 beginTerms->append(TermChain(term)); 858 if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1) 859 if (!setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1)) 860 return false; 861 } 862 863 break; 864 865 case PatternTerm::TypeCharacterClass: 866 return false; 867 868 case PatternTerm::TypeParentheticalAssertion: 869 if (term.invert()) 870 return false; 871 872 case PatternTerm::TypeParenthesesSubpattern: 873 if (term.quantityType != QuantifierFixedCount) { 874 if (termIndex == numTerms - 1) 875 break; 876 877 termIndex++; 878 checkNext = true; 879 } 880 881 if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth)) 882 return false; 883 884 break; 885 } 886 } 887 888 return true; 889 } 890 891 void setupBeginChars() 892 { 893 Vector<TermChain> beginTerms; 894 bool containsFixedCharacter = false; 895 896 if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1) 897 && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) { 898 unsigned size = beginTerms.size(); 899 900 // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization. 901 if (!size) 902 return; 903 904 m_pattern.m_containsBeginChars = true; 905 906 for (unsigned i = 0; i < size; i++) { 907 PatternTerm term = beginTerms[i].term; 908 909 // We have just collected PatternCharacter terms, other terms are not allowed. 910 ASSERT(term.type == PatternTerm::TypePatternCharacter); 911 912 if (term.quantityType == QuantifierFixedCount) 913 containsFixedCharacter = true; 914 915 UChar character = term.patternCharacter; 916 unsigned mask = 0; 917 918 if (character <= 0x7f) { 919 if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) { 920 mask = 32; 921 character = toASCIILower(character); 922 } 923 924 m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 925 } else { 926 UChar upper, lower; 927 if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) { 928 m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 929 m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 930 } else 931 m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount); 932 } 933 } 934 935 // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient. 936 if (!containsFixedCharacter) { 937 m_pattern.m_containsBeginChars = false; 938 return; 939 } 940 941 size = m_pattern.m_beginChars.size(); 942 943 if (size > 2) 944 m_beginCharHelper.merge(size - 1); 945 else if (size <= 1) 946 m_pattern.m_containsBeginChars = false; 947 } 948 } 949 950 private: 951 YarrPattern& m_pattern; 952 PatternAlternative* m_alternative; 953 CharacterClassConstructor m_characterClassConstructor; 954 BeginCharHelper m_beginCharHelper; 955 bool m_invertCharacterClass; 956 bool m_invertParentheticalAssertion; 957 }; 958 959 const char* YarrPattern::compile(const UString& patternString) 960 { 961 YarrPatternConstructor constructor(*this); 962 963 if (const char* error = parse(constructor, patternString)) 964 return error; 965 966 // If the pattern contains illegal backreferences reset & reparse. 967 // Quoting Netscape's "What's new in JavaScript 1.2", 968 // "Note: if the number of left parentheses is less than the number specified 969 // in \#, the \# is taken as an octal escape as described in the next row." 970 if (containsIllegalBackReference()) { 971 unsigned numSubpatterns = m_numSubpatterns; 972 973 constructor.reset(); 974 #if !ASSERT_DISABLED 975 const char* error = 976 #endif 977 parse(constructor, patternString, numSubpatterns); 978 979 ASSERT(!error); 980 ASSERT(numSubpatterns == m_numSubpatterns); 981 } 982 983 constructor.checkForTerminalParentheses(); 984 constructor.optimizeBOL(); 985 986 constructor.setupOffsets(); 987 constructor.setupBeginChars(); 988 989 return 0; 990 } 991 992 YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error) 993 : m_ignoreCase(ignoreCase) 994 , m_multiline(multiline) 995 , m_containsBackreferences(false) 996 , m_containsBeginChars(false) 997 , m_containsBOL(false) 998 , m_numSubpatterns(0) 999 , m_maxBackReference(0) 1000 , newlineCached(0) 1001 , digitsCached(0) 1002 , spacesCached(0) 1003 , wordcharCached(0) 1004 , nondigitsCached(0) 1005 , nonspacesCached(0) 1006 , nonwordcharCached(0) 1007 { 1008 *error = compile(pattern); 1009 } 1010 1011 } } 1012