Home | History | Annotate | Download | only in yarr
      1 /*
      2  * Copyright (C) 2009 Apple Inc. All rights reserved.
      3  * Copyright (C) 2010 Peter Varga (pvarga (at) inf.u-szeged.hu), University of Szeged
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "YarrPattern.h"
     29 
     30 #include "Yarr.h"
     31 #include "YarrParser.h"
     32 #include <wtf/Vector.h>
     33 
     34 using namespace WTF;
     35 
     36 namespace JSC { namespace Yarr {
     37 
     38 #include "RegExpJitTables.h"
     39 
     40 class CharacterClassConstructor {
     41 public:
     42     CharacterClassConstructor(bool isCaseInsensitive = false)
     43         : m_isCaseInsensitive(isCaseInsensitive)
     44     {
     45     }
     46 
     47     void reset()
     48     {
     49         m_matches.clear();
     50         m_ranges.clear();
     51         m_matchesUnicode.clear();
     52         m_rangesUnicode.clear();
     53     }
     54 
     55     void append(const CharacterClass* other)
     56     {
     57         for (size_t i = 0; i < other->m_matches.size(); ++i)
     58             addSorted(m_matches, other->m_matches[i]);
     59         for (size_t i = 0; i < other->m_ranges.size(); ++i)
     60             addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
     61         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
     62             addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
     63         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
     64             addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
     65     }
     66 
     67     void putChar(UChar ch)
     68     {
     69         if (ch <= 0x7f) {
     70             if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
     71                 addSorted(m_matches, toASCIIUpper(ch));
     72                 addSorted(m_matches, toASCIILower(ch));
     73             } else
     74                 addSorted(m_matches, ch);
     75         } else {
     76             UChar upper, lower;
     77             if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
     78                 addSorted(m_matchesUnicode, upper);
     79                 addSorted(m_matchesUnicode, lower);
     80             } else
     81                 addSorted(m_matchesUnicode, ch);
     82         }
     83     }
     84 
     85     // returns true if this character has another case, and 'ch' is the upper case form.
     86     static inline bool isUnicodeUpper(UChar ch)
     87     {
     88         return ch != Unicode::toLower(ch);
     89     }
     90 
     91     // returns true if this character has another case, and 'ch' is the lower case form.
     92     static inline bool isUnicodeLower(UChar ch)
     93     {
     94         return ch != Unicode::toUpper(ch);
     95     }
     96 
     97     void putRange(UChar lo, UChar hi)
     98     {
     99         if (lo <= 0x7f) {
    100             char asciiLo = lo;
    101             char asciiHi = std::min(hi, (UChar)0x7f);
    102             addSortedRange(m_ranges, lo, asciiHi);
    103 
    104             if (m_isCaseInsensitive) {
    105                 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
    106                     addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
    107                 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
    108                     addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
    109             }
    110         }
    111         if (hi >= 0x80) {
    112             uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
    113             addSortedRange(m_rangesUnicode, unicodeCurr, hi);
    114 
    115             if (m_isCaseInsensitive) {
    116                 while (unicodeCurr <= hi) {
    117                     // If the upper bound of the range (hi) is 0xffff, the increments to
    118                     // unicodeCurr in this loop may take it to 0x10000.  This is fine
    119                     // (if so we won't re-enter the loop, since the loop condition above
    120                     // will definitely fail) - but this does mean we cannot use a UChar
    121                     // to represent unicodeCurr, we must use a 32-bit value instead.
    122                     ASSERT(unicodeCurr <= 0xffff);
    123 
    124                     if (isUnicodeUpper(unicodeCurr)) {
    125                         UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
    126                         UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
    127                         while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
    128                             lowerCaseRangeEnd++;
    129                         addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
    130                     } else if (isUnicodeLower(unicodeCurr)) {
    131                         UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
    132                         UChar upperCaseRangeEnd = upperCaseRangeBegin;
    133                         while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
    134                             upperCaseRangeEnd++;
    135                         addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
    136                     } else
    137                         ++unicodeCurr;
    138                 }
    139             }
    140         }
    141     }
    142 
    143     CharacterClass* charClass()
    144     {
    145         CharacterClass* characterClass = new CharacterClass(0);
    146 
    147         characterClass->m_matches.append(m_matches);
    148         characterClass->m_ranges.append(m_ranges);
    149         characterClass->m_matchesUnicode.append(m_matchesUnicode);
    150         characterClass->m_rangesUnicode.append(m_rangesUnicode);
    151 
    152         reset();
    153 
    154         return characterClass;
    155     }
    156 
    157 private:
    158     void addSorted(Vector<UChar>& matches, UChar ch)
    159     {
    160         unsigned pos = 0;
    161         unsigned range = matches.size();
    162 
    163         // binary chop, find position to insert char.
    164         while (range) {
    165             unsigned index = range >> 1;
    166 
    167             int val = matches[pos+index] - ch;
    168             if (!val)
    169                 return;
    170             else if (val > 0)
    171                 range = index;
    172             else {
    173                 pos += (index+1);
    174                 range -= (index+1);
    175             }
    176         }
    177 
    178         if (pos == matches.size())
    179             matches.append(ch);
    180         else
    181             matches.insert(pos, ch);
    182     }
    183 
    184     void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
    185     {
    186         unsigned end = ranges.size();
    187 
    188         // Simple linear scan - I doubt there are that many ranges anyway...
    189         // feel free to fix this with something faster (eg binary chop).
    190         for (unsigned i = 0; i < end; ++i) {
    191             // does the new range fall before the current position in the array
    192             if (hi < ranges[i].begin) {
    193                 // optional optimization: concatenate appending ranges? - may not be worthwhile.
    194                 if (hi == (ranges[i].begin - 1)) {
    195                     ranges[i].begin = lo;
    196                     return;
    197                 }
    198                 ranges.insert(i, CharacterRange(lo, hi));
    199                 return;
    200             }
    201             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
    202             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
    203             // end of the last range they concatenate, which is just as good.
    204             if (lo <= (ranges[i].end + 1)) {
    205                 // found an intersect! we'll replace this entry in the array.
    206                 ranges[i].begin = std::min(ranges[i].begin, lo);
    207                 ranges[i].end = std::max(ranges[i].end, hi);
    208 
    209                 // now check if the new range can subsume any subsequent ranges.
    210                 unsigned next = i+1;
    211                 // each iteration of the loop we will either remove something from the list, or break the loop.
    212                 while (next < ranges.size()) {
    213                     if (ranges[next].begin <= (ranges[i].end + 1)) {
    214                         // the next entry now overlaps / concatenates this one.
    215                         ranges[i].end = std::max(ranges[i].end, ranges[next].end);
    216                         ranges.remove(next);
    217                     } else
    218                         break;
    219                 }
    220 
    221                 return;
    222             }
    223         }
    224 
    225         // CharacterRange comes after all existing ranges.
    226         ranges.append(CharacterRange(lo, hi));
    227     }
    228 
    229     bool m_isCaseInsensitive;
    230 
    231     Vector<UChar> m_matches;
    232     Vector<CharacterRange> m_ranges;
    233     Vector<UChar> m_matchesUnicode;
    234     Vector<CharacterRange> m_rangesUnicode;
    235 };
    236 
    237 struct BeginCharHelper {
    238     BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
    239         : m_beginChars(beginChars)
    240         , m_isCaseInsensitive(isCaseInsensitive)
    241     {}
    242 
    243     void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
    244     {
    245         if (quantityType == QuantifierFixedCount && quantityCount > 1) {
    246             // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
    247             beginChar.value |= beginChar.value << 16;
    248             beginChar.mask |= beginChar.mask << 16;
    249             addCharacter(beginChar);
    250         } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
    251             // In case of characters with fixed quantifier we should check the next character as well.
    252             linkHotTerms(beginChar, hotTerms);
    253         else
    254             // In case of greedy matching the next character checking is unnecessary therefore we just store
    255             // the first character.
    256             addCharacter(beginChar);
    257     }
    258 
    259     // Merge two following BeginChars in the vector to reduce the number of character checks.
    260     void merge(unsigned size)
    261     {
    262         for (unsigned i = 0; i < size; i++) {
    263             BeginChar* curr = &m_beginChars->at(i);
    264             BeginChar* next = &m_beginChars->at(i + 1);
    265 
    266             // If the current and the next size of value is different we should skip the merge process
    267             // because the 16bit and 32bit values are unmergable.
    268             if (curr->value <= 0xFFFF && next->value > 0xFFFF)
    269                 continue;
    270 
    271             unsigned diff = curr->value ^ next->value;
    272 
    273             curr->mask |= diff;
    274             curr->value |= curr->mask;
    275 
    276             m_beginChars->remove(i + 1);
    277             size--;
    278         }
    279     }
    280 
    281 private:
    282     void addCharacter(BeginChar beginChar)
    283     {
    284         unsigned pos = 0;
    285         unsigned range = m_beginChars->size();
    286 
    287         // binary chop, find position to insert char.
    288         while (range) {
    289             unsigned index = range >> 1;
    290 
    291             int val = m_beginChars->at(pos+index).value - beginChar.value;
    292             if (!val)
    293                 return;
    294             if (val < 0)
    295                 range = index;
    296             else {
    297                 pos += (index+1);
    298                 range -= (index+1);
    299             }
    300         }
    301 
    302         if (pos == m_beginChars->size())
    303             m_beginChars->append(beginChar);
    304         else
    305             m_beginChars->insert(pos, beginChar);
    306     }
    307 
    308     // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
    309     void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
    310     {
    311         for (unsigned i = 0; i < hotTerms->size(); i++) {
    312             PatternTerm hotTerm = hotTerms->at(i).term;
    313             ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
    314 
    315             UChar characterNext = hotTerm.patternCharacter;
    316 
    317             // Append a character to an existing BeginChar object.
    318             if (characterNext <= 0x7f) {
    319                 unsigned mask = 0;
    320 
    321                 if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
    322                     mask = 32;
    323                     characterNext = toASCIILower(characterNext);
    324                 }
    325 
    326                 addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
    327             } else {
    328                 UChar upper, lower;
    329                 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
    330                     addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
    331                     addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
    332                 } else
    333                     addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
    334             }
    335         }
    336     }
    337 
    338     Vector<BeginChar>* m_beginChars;
    339     bool m_isCaseInsensitive;
    340 };
    341 
    342 class YarrPatternConstructor {
    343 public:
    344     YarrPatternConstructor(YarrPattern& pattern)
    345         : m_pattern(pattern)
    346         , m_characterClassConstructor(pattern.m_ignoreCase)
    347         , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
    348         , m_invertParentheticalAssertion(false)
    349     {
    350         m_pattern.m_body = new PatternDisjunction();
    351         m_alternative = m_pattern.m_body->addNewAlternative();
    352         m_pattern.m_disjunctions.append(m_pattern.m_body);
    353     }
    354 
    355     ~YarrPatternConstructor()
    356     {
    357     }
    358 
    359     void reset()
    360     {
    361         m_pattern.reset();
    362         m_characterClassConstructor.reset();
    363 
    364         m_pattern.m_body = new PatternDisjunction();
    365         m_alternative = m_pattern.m_body->addNewAlternative();
    366         m_pattern.m_disjunctions.append(m_pattern.m_body);
    367     }
    368 
    369     void assertionBOL()
    370     {
    371         if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
    372             m_alternative->m_startsWithBOL = true;
    373             m_alternative->m_containsBOL = true;
    374             m_pattern.m_containsBOL = true;
    375         }
    376         m_alternative->m_terms.append(PatternTerm::BOL());
    377     }
    378     void assertionEOL()
    379     {
    380         m_alternative->m_terms.append(PatternTerm::EOL());
    381     }
    382     void assertionWordBoundary(bool invert)
    383     {
    384         m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
    385     }
    386 
    387     void atomPatternCharacter(UChar ch)
    388     {
    389         // We handle case-insensitive checking of unicode characters which do have both
    390         // cases by handling them as if they were defined using a CharacterClass.
    391         if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
    392             atomCharacterClassBegin();
    393             atomCharacterClassAtom(ch);
    394             atomCharacterClassEnd();
    395         } else
    396             m_alternative->m_terms.append(PatternTerm(ch));
    397     }
    398 
    399     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
    400     {
    401         switch (classID) {
    402         case DigitClassID:
    403             m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
    404             break;
    405         case SpaceClassID:
    406             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
    407             break;
    408         case WordClassID:
    409             m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
    410             break;
    411         case NewlineClassID:
    412             m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
    413             break;
    414         }
    415     }
    416 
    417     void atomCharacterClassBegin(bool invert = false)
    418     {
    419         m_invertCharacterClass = invert;
    420     }
    421 
    422     void atomCharacterClassAtom(UChar ch)
    423     {
    424         m_characterClassConstructor.putChar(ch);
    425     }
    426 
    427     void atomCharacterClassRange(UChar begin, UChar end)
    428     {
    429         m_characterClassConstructor.putRange(begin, end);
    430     }
    431 
    432     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
    433     {
    434         ASSERT(classID != NewlineClassID);
    435 
    436         switch (classID) {
    437         case DigitClassID:
    438             m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
    439             break;
    440 
    441         case SpaceClassID:
    442             m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
    443             break;
    444 
    445         case WordClassID:
    446             m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
    447             break;
    448 
    449         default:
    450             ASSERT_NOT_REACHED();
    451         }
    452     }
    453 
    454     void atomCharacterClassEnd()
    455     {
    456         CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
    457         m_pattern.m_userCharacterClasses.append(newCharacterClass);
    458         m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
    459     }
    460 
    461     void atomParenthesesSubpatternBegin(bool capture = true)
    462     {
    463         unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
    464         if (capture)
    465             m_pattern.m_numSubpatterns++;
    466 
    467         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
    468         m_pattern.m_disjunctions.append(parenthesesDisjunction);
    469         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
    470         m_alternative = parenthesesDisjunction->addNewAlternative();
    471     }
    472 
    473     void atomParentheticalAssertionBegin(bool invert = false)
    474     {
    475         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
    476         m_pattern.m_disjunctions.append(parenthesesDisjunction);
    477         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
    478         m_alternative = parenthesesDisjunction->addNewAlternative();
    479         m_invertParentheticalAssertion = invert;
    480     }
    481 
    482     void atomParenthesesEnd()
    483     {
    484         ASSERT(m_alternative->m_parent);
    485         ASSERT(m_alternative->m_parent->m_parent);
    486 
    487         PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
    488         m_alternative = m_alternative->m_parent->m_parent;
    489 
    490         PatternTerm& lastTerm = m_alternative->lastTerm();
    491 
    492         unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
    493         unsigned numBOLAnchoredAlts = 0;
    494         bool containsEmptyAlternative = false;
    495 
    496         for (unsigned i = 0; i < numParenAlternatives; i++) {
    497             if (!parenthesesDisjunction->m_alternatives[i]->m_terms.size() && numParenAlternatives > 1) {
    498                 PatternAlternative* altToRemove = parenthesesDisjunction->m_alternatives[i];
    499                 parenthesesDisjunction->m_alternatives.remove(i);
    500                 delete altToRemove;
    501                 --numParenAlternatives;
    502 
    503                 containsEmptyAlternative = true;
    504                 continue;
    505             }
    506 
    507             // Bubble up BOL flags
    508             if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
    509                 numBOLAnchoredAlts++;
    510         }
    511 
    512         if (numBOLAnchoredAlts) {
    513             m_alternative->m_containsBOL = true;
    514             // If all the alternatives in parens start with BOL, then so does this one
    515             if (numBOLAnchoredAlts == numParenAlternatives)
    516                 m_alternative->m_startsWithBOL = true;
    517         }
    518 
    519         lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
    520         m_invertParentheticalAssertion = false;
    521 
    522         if (containsEmptyAlternative) {
    523             // Backup and remove the current disjunction's alternatives.
    524             Vector<PatternAlternative*> alternatives;
    525             alternatives.append(parenthesesDisjunction->m_alternatives);
    526             parenthesesDisjunction->m_alternatives.clear();
    527             PatternAlternative* alternative = parenthesesDisjunction->addNewAlternative();
    528 
    529             // Insert a new non-capturing parentheses.
    530             unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
    531             PatternDisjunction* newDisjunction = new PatternDisjunction(alternative);
    532             m_pattern.m_disjunctions.append(newDisjunction);
    533             alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, newDisjunction, false, false));
    534             newDisjunction->m_alternatives.append(alternatives);
    535 
    536             // Set the quantifier of the new parentheses to '?' and set the inherited properties.
    537             PatternTerm& disjunctionTerm = alternative->lastTerm();
    538             disjunctionTerm.quantify(1, QuantifierGreedy);
    539             disjunctionTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
    540             alternative->m_containsBOL = m_alternative->m_containsBOL;
    541             alternative->m_startsWithBOL = m_alternative->m_startsWithBOL;
    542         }
    543     }
    544 
    545     void atomBackReference(unsigned subpatternId)
    546     {
    547         ASSERT(subpatternId);
    548         m_pattern.m_containsBackreferences = true;
    549         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
    550 
    551         if (subpatternId > m_pattern.m_numSubpatterns) {
    552             m_alternative->m_terms.append(PatternTerm::ForwardReference());
    553             return;
    554         }
    555 
    556         PatternAlternative* currentAlternative = m_alternative;
    557         ASSERT(currentAlternative);
    558 
    559         // Note to self: if we waited until the AST was baked, we could also remove forwards refs
    560         while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
    561             PatternTerm& term = currentAlternative->lastTerm();
    562             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
    563 
    564             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
    565                 m_alternative->m_terms.append(PatternTerm::ForwardReference());
    566                 return;
    567             }
    568         }
    569 
    570         m_alternative->m_terms.append(PatternTerm(subpatternId));
    571     }
    572 
    573     // deep copy the argument disjunction.  If filterStartsWithBOL is true,
    574     // skip alternatives with m_startsWithBOL set true.
    575     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
    576     {
    577         PatternDisjunction* newDisjunction = 0;
    578         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
    579             PatternAlternative* alternative = disjunction->m_alternatives[alt];
    580             if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
    581                 if (!newDisjunction) {
    582                     newDisjunction = new PatternDisjunction();
    583                     newDisjunction->m_parent = disjunction->m_parent;
    584                 }
    585                 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
    586                 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
    587                     newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
    588             }
    589         }
    590 
    591         if (newDisjunction)
    592             m_pattern.m_disjunctions.append(newDisjunction);
    593         return newDisjunction;
    594     }
    595 
    596     PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
    597     {
    598         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
    599             return PatternTerm(term);
    600 
    601         PatternTerm termCopy = term;
    602         termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
    603         return termCopy;
    604     }
    605 
    606     void quantifyAtom(unsigned min, unsigned max, bool greedy)
    607     {
    608         ASSERT(min <= max);
    609         ASSERT(m_alternative->m_terms.size());
    610 
    611         if (!max) {
    612             m_alternative->removeLastTerm();
    613             return;
    614         }
    615 
    616         PatternTerm& term = m_alternative->lastTerm();
    617         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
    618         ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
    619 
    620         // For any assertion with a zero minimum, not matching is valid and has no effect,
    621         // remove it.  Otherwise, we need to match as least once, but there is no point
    622         // matching more than once, so remove the quantifier.  It is not entirely clear
    623         // from the spec whether or not this behavior is correct, but I believe this
    624         // matches Firefox. :-/
    625         if (term.type == PatternTerm::TypeParentheticalAssertion) {
    626             if (!min)
    627                 m_alternative->removeLastTerm();
    628             return;
    629         }
    630 
    631         if (min == 0)
    632             term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
    633         else if (min == max)
    634             term.quantify(min, QuantifierFixedCount);
    635         else {
    636             term.quantify(min, QuantifierFixedCount);
    637             m_alternative->m_terms.append(copyTerm(term));
    638             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
    639             m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
    640             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
    641                 m_alternative->lastTerm().parentheses.isCopy = true;
    642         }
    643     }
    644 
    645     void disjunction()
    646     {
    647         m_alternative = m_alternative->m_parent->addNewAlternative();
    648     }
    649 
    650     unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
    651     {
    652         alternative->m_hasFixedSize = true;
    653         unsigned currentInputPosition = initialInputPosition;
    654 
    655         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
    656             PatternTerm& term = alternative->m_terms[i];
    657 
    658             switch (term.type) {
    659             case PatternTerm::TypeAssertionBOL:
    660             case PatternTerm::TypeAssertionEOL:
    661             case PatternTerm::TypeAssertionWordBoundary:
    662                 term.inputPosition = currentInputPosition;
    663                 break;
    664 
    665             case PatternTerm::TypeBackReference:
    666                 term.inputPosition = currentInputPosition;
    667                 term.frameLocation = currentCallFrameSize;
    668                 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
    669                 alternative->m_hasFixedSize = false;
    670                 break;
    671 
    672             case PatternTerm::TypeForwardReference:
    673                 break;
    674 
    675             case PatternTerm::TypePatternCharacter:
    676                 term.inputPosition = currentInputPosition;
    677                 if (term.quantityType != QuantifierFixedCount) {
    678                     term.frameLocation = currentCallFrameSize;
    679                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
    680                     alternative->m_hasFixedSize = false;
    681                 } else
    682                     currentInputPosition += term.quantityCount;
    683                 break;
    684 
    685             case PatternTerm::TypeCharacterClass:
    686                 term.inputPosition = currentInputPosition;
    687                 if (term.quantityType != QuantifierFixedCount) {
    688                     term.frameLocation = currentCallFrameSize;
    689                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
    690                     alternative->m_hasFixedSize = false;
    691                 } else
    692                     currentInputPosition += term.quantityCount;
    693                 break;
    694 
    695             case PatternTerm::TypeParenthesesSubpattern:
    696                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
    697                 term.frameLocation = currentCallFrameSize;
    698                 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
    699                     if (term.quantityType != QuantifierFixedCount)
    700                         currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
    701                     currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
    702                     // If quantity is fixed, then pre-check its minimum size.
    703                     if (term.quantityType == QuantifierFixedCount)
    704                         currentInputPosition += term.parentheses.disjunction->m_minimumSize;
    705                     term.inputPosition = currentInputPosition;
    706                 } else if (term.parentheses.isTerminal) {
    707                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
    708                     currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
    709                     term.inputPosition = currentInputPosition;
    710                 } else {
    711                     term.inputPosition = currentInputPosition;
    712                     setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
    713                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
    714                 }
    715                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
    716                 alternative->m_hasFixedSize = false;
    717                 break;
    718 
    719             case PatternTerm::TypeParentheticalAssertion:
    720                 term.inputPosition = currentInputPosition;
    721                 term.frameLocation = currentCallFrameSize;
    722                 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
    723                 break;
    724             }
    725         }
    726 
    727         alternative->m_minimumSize = currentInputPosition - initialInputPosition;
    728         return currentCallFrameSize;
    729     }
    730 
    731     unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
    732     {
    733         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
    734             initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
    735 
    736         unsigned minimumInputSize = UINT_MAX;
    737         unsigned maximumCallFrameSize = 0;
    738         bool hasFixedSize = true;
    739 
    740         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
    741             PatternAlternative* alternative = disjunction->m_alternatives[alt];
    742             unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
    743             minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
    744             maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
    745             hasFixedSize &= alternative->m_hasFixedSize;
    746         }
    747 
    748         ASSERT(minimumInputSize != UINT_MAX);
    749         ASSERT(maximumCallFrameSize >= initialCallFrameSize);
    750 
    751         disjunction->m_hasFixedSize = hasFixedSize;
    752         disjunction->m_minimumSize = minimumInputSize;
    753         disjunction->m_callFrameSize = maximumCallFrameSize;
    754         return maximumCallFrameSize;
    755     }
    756 
    757     void setupOffsets()
    758     {
    759         setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
    760     }
    761 
    762     // This optimization identifies sets of parentheses that we will never need to backtrack.
    763     // In these cases we do not need to store state from prior iterations.
    764     // We can presently avoid backtracking for:
    765     //   * where the parens are at the end of the regular expression (last term in any of the
    766     //     alternatives of the main body disjunction).
    767     //   * where the parens are non-capturing, and quantified unbounded greedy (*).
    768     //   * where the parens do not contain any capturing subpatterns.
    769     void checkForTerminalParentheses()
    770     {
    771         // This check is much too crude; should be just checking whether the candidate
    772         // node contains nested capturing subpatterns, not the whole expression!
    773         if (m_pattern.m_numSubpatterns)
    774             return;
    775 
    776         Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
    777         for (size_t i = 0; i < alternatives.size(); ++i) {
    778             Vector<PatternTerm>& terms = alternatives[i]->m_terms;
    779             if (terms.size()) {
    780                 PatternTerm& term = terms.last();
    781                 if (term.type == PatternTerm::TypeParenthesesSubpattern
    782                     && term.quantityType == QuantifierGreedy
    783                     && term.quantityCount == quantifyInfinite
    784                     && !term.capture())
    785                     term.parentheses.isTerminal = true;
    786             }
    787         }
    788     }
    789 
    790     void optimizeBOL()
    791     {
    792         // Look for expressions containing beginning of line (^) anchoring and unroll them.
    793         // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
    794         // This code relies on the parsing code tagging alternatives with m_containsBOL and
    795         // m_startsWithBOL and rolling those up to containing alternatives.
    796         // At this point, this is only valid for non-multiline expressions.
    797         PatternDisjunction* disjunction = m_pattern.m_body;
    798 
    799         if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
    800             return;
    801 
    802         PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
    803 
    804         // Set alternatives in disjunction to "onceThrough"
    805         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
    806             disjunction->m_alternatives[alt]->setOnceThrough();
    807 
    808         if (loopDisjunction) {
    809             // Move alternatives from loopDisjunction to disjunction
    810             for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
    811                 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
    812 
    813             loopDisjunction->m_alternatives.clear();
    814         }
    815     }
    816 
    817     // This function collects the terms which are potentially matching the first number of depth characters in the result.
    818     // If this function returns false then it found at least one term which makes the beginning character
    819     // look-up optimization inefficient.
    820     bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
    821     {
    822         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
    823             PatternAlternative* alternative = disjunction->m_alternatives[alt];
    824 
    825             if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
    826                 return false;
    827         }
    828 
    829         return true;
    830     }
    831 
    832     bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
    833     {
    834         bool checkNext = true;
    835         unsigned numTerms = alternative->m_terms.size();
    836 
    837         while (checkNext && termIndex < numTerms) {
    838             PatternTerm term = alternative->m_terms[termIndex];
    839             checkNext = false;
    840 
    841             switch (term.type) {
    842             case PatternTerm::TypeAssertionBOL:
    843             case PatternTerm::TypeAssertionEOL:
    844             case PatternTerm::TypeAssertionWordBoundary:
    845                 return false;
    846 
    847             case PatternTerm::TypeBackReference:
    848             case PatternTerm::TypeForwardReference:
    849                 return false;
    850 
    851             case PatternTerm::TypePatternCharacter:
    852                 if (termIndex != numTerms - 1) {
    853                     beginTerms->append(TermChain(term));
    854                     termIndex++;
    855                     checkNext = true;
    856                 } else if (term.quantityType == QuantifierFixedCount) {
    857                     beginTerms->append(TermChain(term));
    858                     if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
    859                         if (!setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1))
    860                             return false;
    861                 }
    862 
    863                 break;
    864 
    865             case PatternTerm::TypeCharacterClass:
    866                 return false;
    867 
    868             case PatternTerm::TypeParentheticalAssertion:
    869                 if (term.invert())
    870                     return false;
    871 
    872             case PatternTerm::TypeParenthesesSubpattern:
    873                 if (term.quantityType != QuantifierFixedCount) {
    874                     if (termIndex == numTerms - 1)
    875                         break;
    876 
    877                     termIndex++;
    878                     checkNext = true;
    879                 }
    880 
    881                 if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
    882                     return false;
    883 
    884                 break;
    885             }
    886         }
    887 
    888         return true;
    889     }
    890 
    891     void setupBeginChars()
    892     {
    893         Vector<TermChain> beginTerms;
    894         bool containsFixedCharacter = false;
    895 
    896         if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
    897                 && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
    898             unsigned size = beginTerms.size();
    899 
    900             // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
    901             if (!size)
    902                 return;
    903 
    904             m_pattern.m_containsBeginChars = true;
    905 
    906             for (unsigned i = 0; i < size; i++) {
    907                 PatternTerm term = beginTerms[i].term;
    908 
    909                 // We have just collected PatternCharacter terms, other terms are not allowed.
    910                 ASSERT(term.type == PatternTerm::TypePatternCharacter);
    911 
    912                 if (term.quantityType == QuantifierFixedCount)
    913                     containsFixedCharacter = true;
    914 
    915                 UChar character = term.patternCharacter;
    916                 unsigned mask = 0;
    917 
    918                 if (character <= 0x7f) {
    919                     if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
    920                         mask = 32;
    921                         character = toASCIILower(character);
    922                     }
    923 
    924                     m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    925                 } else {
    926                     UChar upper, lower;
    927                     if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
    928                         m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    929                         m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    930                     } else
    931                         m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
    932                 }
    933             }
    934 
    935             // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
    936             if (!containsFixedCharacter) {
    937                 m_pattern.m_containsBeginChars = false;
    938                 return;
    939             }
    940 
    941             size = m_pattern.m_beginChars.size();
    942 
    943             if (size > 2)
    944                 m_beginCharHelper.merge(size - 1);
    945             else if (size <= 1)
    946                 m_pattern.m_containsBeginChars = false;
    947         }
    948     }
    949 
    950 private:
    951     YarrPattern& m_pattern;
    952     PatternAlternative* m_alternative;
    953     CharacterClassConstructor m_characterClassConstructor;
    954     BeginCharHelper m_beginCharHelper;
    955     bool m_invertCharacterClass;
    956     bool m_invertParentheticalAssertion;
    957 };
    958 
    959 const char* YarrPattern::compile(const UString& patternString)
    960 {
    961     YarrPatternConstructor constructor(*this);
    962 
    963     if (const char* error = parse(constructor, patternString))
    964         return error;
    965 
    966     // If the pattern contains illegal backreferences reset & reparse.
    967     // Quoting Netscape's "What's new in JavaScript 1.2",
    968     //      "Note: if the number of left parentheses is less than the number specified
    969     //       in \#, the \# is taken as an octal escape as described in the next row."
    970     if (containsIllegalBackReference()) {
    971         unsigned numSubpatterns = m_numSubpatterns;
    972 
    973         constructor.reset();
    974 #if !ASSERT_DISABLED
    975         const char* error =
    976 #endif
    977             parse(constructor, patternString, numSubpatterns);
    978 
    979         ASSERT(!error);
    980         ASSERT(numSubpatterns == m_numSubpatterns);
    981     }
    982 
    983     constructor.checkForTerminalParentheses();
    984     constructor.optimizeBOL();
    985 
    986     constructor.setupOffsets();
    987     constructor.setupBeginChars();
    988 
    989     return 0;
    990 }
    991 
    992 YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error)
    993     : m_ignoreCase(ignoreCase)
    994     , m_multiline(multiline)
    995     , m_containsBackreferences(false)
    996     , m_containsBeginChars(false)
    997     , m_containsBOL(false)
    998     , m_numSubpatterns(0)
    999     , m_maxBackReference(0)
   1000     , newlineCached(0)
   1001     , digitsCached(0)
   1002     , spacesCached(0)
   1003     , wordcharCached(0)
   1004     , nondigitsCached(0)
   1005     , nonspacesCached(0)
   1006     , nonwordcharCached(0)
   1007 {
   1008     *error = compile(pattern);
   1009 }
   1010 
   1011 } }
   1012