Home | History | Annotate | Download | only in yarr
      1 /*
      2  * Copyright (C) 2009 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef RegexPattern_h
     27 #define RegexPattern_h
     28 
     29 #include <wtf/Platform.h>
     30 
     31 #if ENABLE(YARR)
     32 
     33 #include <wtf/Vector.h>
     34 #include <wtf/unicode/Unicode.h>
     35 
     36 
     37 namespace JSC { namespace Yarr {
     38 
     39 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
     40 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
     41 #define RegexStackSpaceForBackTrackInfoBackReference 2
     42 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
     43 #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
     44 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
     45 #define RegexStackSpaceForBackTrackInfoParentheses 4
     46 
     47 struct PatternDisjunction;
     48 
     49 struct CharacterRange {
     50     UChar begin;
     51     UChar end;
     52 
     53     CharacterRange(UChar begin, UChar end)
     54         : begin(begin)
     55         , end(end)
     56     {
     57     }
     58 };
     59 
     60 struct CharacterClass : FastAllocBase {
     61     Vector<UChar> m_matches;
     62     Vector<CharacterRange> m_ranges;
     63     Vector<UChar> m_matchesUnicode;
     64     Vector<CharacterRange> m_rangesUnicode;
     65 };
     66 
     67 enum QuantifierType {
     68     QuantifierFixedCount,
     69     QuantifierGreedy,
     70     QuantifierNonGreedy,
     71 };
     72 
     73 struct PatternTerm {
     74     enum Type {
     75         TypeAssertionBOL,
     76         TypeAssertionEOL,
     77         TypeAssertionWordBoundary,
     78         TypePatternCharacter,
     79         TypeCharacterClass,
     80         TypeBackReference,
     81         TypeForwardReference,
     82         TypeParenthesesSubpattern,
     83         TypeParentheticalAssertion,
     84     } type;
     85     bool invertOrCapture;
     86     union {
     87         UChar patternCharacter;
     88         CharacterClass* characterClass;
     89         unsigned subpatternId;
     90         struct {
     91             PatternDisjunction* disjunction;
     92             unsigned subpatternId;
     93             unsigned lastSubpatternId;
     94             bool isCopy;
     95         } parentheses;
     96     };
     97     QuantifierType quantityType;
     98     unsigned quantityCount;
     99     int inputPosition;
    100     unsigned frameLocation;
    101 
    102     PatternTerm(UChar ch)
    103         : type(PatternTerm::TypePatternCharacter)
    104     {
    105         patternCharacter = ch;
    106         quantityType = QuantifierFixedCount;
    107         quantityCount = 1;
    108     }
    109 
    110     PatternTerm(CharacterClass* charClass, bool invert)
    111         : type(PatternTerm::TypeCharacterClass)
    112         , invertOrCapture(invert)
    113     {
    114         characterClass = charClass;
    115         quantityType = QuantifierFixedCount;
    116         quantityCount = 1;
    117     }
    118 
    119     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
    120         : type(type)
    121         , invertOrCapture(invertOrCapture)
    122     {
    123         parentheses.disjunction = disjunction;
    124         parentheses.subpatternId = subpatternId;
    125         parentheses.isCopy = false;
    126         quantityType = QuantifierFixedCount;
    127         quantityCount = 1;
    128     }
    129 
    130     PatternTerm(Type type, bool invert = false)
    131         : type(type)
    132         , invertOrCapture(invert)
    133     {
    134         quantityType = QuantifierFixedCount;
    135         quantityCount = 1;
    136     }
    137 
    138     PatternTerm(unsigned spatternId)
    139         : type(TypeBackReference)
    140         , invertOrCapture(false)
    141     {
    142         subpatternId = spatternId;
    143         quantityType = QuantifierFixedCount;
    144         quantityCount = 1;
    145     }
    146 
    147     static PatternTerm ForwardReference()
    148     {
    149         return PatternTerm(TypeForwardReference);
    150     }
    151 
    152     static PatternTerm BOL()
    153     {
    154         return PatternTerm(TypeAssertionBOL);
    155     }
    156 
    157     static PatternTerm EOL()
    158     {
    159         return PatternTerm(TypeAssertionEOL);
    160     }
    161 
    162     static PatternTerm WordBoundary(bool invert)
    163     {
    164         return PatternTerm(TypeAssertionWordBoundary, invert);
    165     }
    166 
    167     bool invert()
    168     {
    169         return invertOrCapture;
    170     }
    171 
    172     bool capture()
    173     {
    174         return invertOrCapture;
    175     }
    176 
    177     void quantify(unsigned count, QuantifierType type)
    178     {
    179         quantityCount = count;
    180         quantityType = type;
    181     }
    182 };
    183 
    184 struct PatternAlternative : FastAllocBase {
    185     PatternAlternative(PatternDisjunction* disjunction)
    186         : m_parent(disjunction)
    187     {
    188     }
    189 
    190     PatternTerm& lastTerm()
    191     {
    192         ASSERT(m_terms.size());
    193         return m_terms[m_terms.size() - 1];
    194     }
    195 
    196     void removeLastTerm()
    197     {
    198         ASSERT(m_terms.size());
    199         m_terms.shrink(m_terms.size() - 1);
    200     }
    201 
    202     Vector<PatternTerm> m_terms;
    203     PatternDisjunction* m_parent;
    204     unsigned m_minimumSize;
    205     bool m_hasFixedSize;
    206 };
    207 
    208 struct PatternDisjunction : FastAllocBase {
    209     PatternDisjunction(PatternAlternative* parent = 0)
    210         : m_parent(parent)
    211     {
    212     }
    213 
    214     ~PatternDisjunction()
    215     {
    216         deleteAllValues(m_alternatives);
    217     }
    218 
    219     PatternAlternative* addNewAlternative()
    220     {
    221         PatternAlternative* alternative = new PatternAlternative(this);
    222         m_alternatives.append(alternative);
    223         return alternative;
    224     }
    225 
    226     Vector<PatternAlternative*> m_alternatives;
    227     PatternAlternative* m_parent;
    228     unsigned m_minimumSize;
    229     unsigned m_callFrameSize;
    230     bool m_hasFixedSize;
    231 };
    232 
    233 // You probably don't want to be calling these functions directly
    234 // (please to be calling newlineCharacterClass() et al on your
    235 // friendly neighborhood RegexPattern instance to get nicely
    236 // cached copies).
    237 CharacterClass* newlineCreate();
    238 CharacterClass* digitsCreate();
    239 CharacterClass* spacesCreate();
    240 CharacterClass* wordcharCreate();
    241 CharacterClass* nondigitsCreate();
    242 CharacterClass* nonspacesCreate();
    243 CharacterClass* nonwordcharCreate();
    244 
    245 struct RegexPattern {
    246     RegexPattern(bool ignoreCase, bool multiline)
    247         : m_ignoreCase(ignoreCase)
    248         , m_multiline(multiline)
    249         , m_numSubpatterns(0)
    250         , m_maxBackReference(0)
    251         , newlineCached(0)
    252         , digitsCached(0)
    253         , spacesCached(0)
    254         , wordcharCached(0)
    255         , nondigitsCached(0)
    256         , nonspacesCached(0)
    257         , nonwordcharCached(0)
    258     {
    259     }
    260 
    261     ~RegexPattern()
    262     {
    263         deleteAllValues(m_disjunctions);
    264         deleteAllValues(m_userCharacterClasses);
    265     }
    266 
    267     void reset()
    268     {
    269         m_numSubpatterns = 0;
    270         m_maxBackReference = 0;
    271 
    272         newlineCached = 0;
    273         digitsCached = 0;
    274         spacesCached = 0;
    275         wordcharCached = 0;
    276         nondigitsCached = 0;
    277         nonspacesCached = 0;
    278         nonwordcharCached = 0;
    279 
    280         deleteAllValues(m_disjunctions);
    281         m_disjunctions.clear();
    282         deleteAllValues(m_userCharacterClasses);
    283         m_userCharacterClasses.clear();
    284     }
    285 
    286     bool containsIllegalBackReference()
    287     {
    288         return m_maxBackReference > m_numSubpatterns;
    289     }
    290 
    291     CharacterClass* newlineCharacterClass()
    292     {
    293         if (!newlineCached)
    294             m_userCharacterClasses.append(newlineCached = newlineCreate());
    295         return newlineCached;
    296     }
    297     CharacterClass* digitsCharacterClass()
    298     {
    299         if (!digitsCached)
    300             m_userCharacterClasses.append(digitsCached = digitsCreate());
    301         return digitsCached;
    302     }
    303     CharacterClass* spacesCharacterClass()
    304     {
    305         if (!spacesCached)
    306             m_userCharacterClasses.append(spacesCached = spacesCreate());
    307         return spacesCached;
    308     }
    309     CharacterClass* wordcharCharacterClass()
    310     {
    311         if (!wordcharCached)
    312             m_userCharacterClasses.append(wordcharCached = wordcharCreate());
    313         return wordcharCached;
    314     }
    315     CharacterClass* nondigitsCharacterClass()
    316     {
    317         if (!nondigitsCached)
    318             m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
    319         return nondigitsCached;
    320     }
    321     CharacterClass* nonspacesCharacterClass()
    322     {
    323         if (!nonspacesCached)
    324             m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
    325         return nonspacesCached;
    326     }
    327     CharacterClass* nonwordcharCharacterClass()
    328     {
    329         if (!nonwordcharCached)
    330             m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
    331         return nonwordcharCached;
    332     }
    333 
    334     bool m_ignoreCase;
    335     bool m_multiline;
    336     unsigned m_numSubpatterns;
    337     unsigned m_maxBackReference;
    338     PatternDisjunction* m_body;
    339     Vector<PatternDisjunction*, 4> m_disjunctions;
    340     Vector<CharacterClass*> m_userCharacterClasses;
    341 
    342 private:
    343     CharacterClass* newlineCached;
    344     CharacterClass* digitsCached;
    345     CharacterClass* spacesCached;
    346     CharacterClass* wordcharCached;
    347     CharacterClass* nondigitsCached;
    348     CharacterClass* nonspacesCached;
    349     CharacterClass* nonwordcharCached;
    350 };
    351 
    352 } } // namespace JSC::Yarr
    353 
    354 #endif
    355 
    356 #endif // RegexPattern_h
    357