Home | History | Annotate | Download | only in yarr
      1 /*
      2  * Copyright (C) 2009 Apple Inc. All rights reserved.
      3  * Copyright (C) 2010 Peter Varga (pvarga (at) inf.u-szeged.hu), University of Szeged
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #ifndef YarrPattern_h
     28 #define YarrPattern_h
     29 
     30 #include <runtime/UString.h>
     31 #include <wtf/Vector.h>
     32 #include <wtf/unicode/Unicode.h>
     33 
     34 namespace JSC { namespace Yarr {
     35 
     36 struct PatternDisjunction;
     37 
     38 struct CharacterRange {
     39     UChar begin;
     40     UChar end;
     41 
     42     CharacterRange(UChar begin, UChar end)
     43         : begin(begin)
     44         , end(end)
     45     {
     46     }
     47 };
     48 
     49 struct CharacterClassTable : RefCounted<CharacterClassTable> {
     50     const char* m_table;
     51     bool m_inverted;
     52     static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
     53     {
     54         return adoptRef(new CharacterClassTable(table, inverted));
     55     }
     56 
     57 private:
     58     CharacterClassTable(const char* table, bool inverted)
     59         : m_table(table)
     60         , m_inverted(inverted)
     61     {
     62     }
     63 };
     64 
     65 struct CharacterClass {
     66     WTF_MAKE_FAST_ALLOCATED;
     67 public:
     68     // All CharacterClass instances have to have the full set of matches and ranges,
     69     // they may have an optional table for faster lookups (which must match the
     70     // specified matches and ranges)
     71     CharacterClass(PassRefPtr<CharacterClassTable> table)
     72         : m_table(table)
     73     {
     74     }
     75     Vector<UChar> m_matches;
     76     Vector<CharacterRange> m_ranges;
     77     Vector<UChar> m_matchesUnicode;
     78     Vector<CharacterRange> m_rangesUnicode;
     79     RefPtr<CharacterClassTable> m_table;
     80 };
     81 
     82 enum QuantifierType {
     83     QuantifierFixedCount,
     84     QuantifierGreedy,
     85     QuantifierNonGreedy,
     86 };
     87 
     88 struct PatternTerm {
     89     enum Type {
     90         TypeAssertionBOL,
     91         TypeAssertionEOL,
     92         TypeAssertionWordBoundary,
     93         TypePatternCharacter,
     94         TypeCharacterClass,
     95         TypeBackReference,
     96         TypeForwardReference,
     97         TypeParenthesesSubpattern,
     98         TypeParentheticalAssertion,
     99     } type;
    100     bool m_capture :1;
    101     bool m_invert :1;
    102     union {
    103         UChar patternCharacter;
    104         CharacterClass* characterClass;
    105         unsigned backReferenceSubpatternId;
    106         struct {
    107             PatternDisjunction* disjunction;
    108             unsigned subpatternId;
    109             unsigned lastSubpatternId;
    110             bool isCopy;
    111             bool isTerminal;
    112         } parentheses;
    113     };
    114     QuantifierType quantityType;
    115     unsigned quantityCount;
    116     int inputPosition;
    117     unsigned frameLocation;
    118 
    119     PatternTerm(UChar ch)
    120         : type(PatternTerm::TypePatternCharacter)
    121         , m_capture(false)
    122         , m_invert(false)
    123     {
    124         patternCharacter = ch;
    125         quantityType = QuantifierFixedCount;
    126         quantityCount = 1;
    127     }
    128 
    129     PatternTerm(CharacterClass* charClass, bool invert)
    130         : type(PatternTerm::TypeCharacterClass)
    131         , m_capture(false)
    132         , m_invert(invert)
    133     {
    134         characterClass = charClass;
    135         quantityType = QuantifierFixedCount;
    136         quantityCount = 1;
    137     }
    138 
    139     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
    140         : type(type)
    141         , m_capture(capture)
    142         , m_invert(invert)
    143     {
    144         parentheses.disjunction = disjunction;
    145         parentheses.subpatternId = subpatternId;
    146         parentheses.isCopy = false;
    147         parentheses.isTerminal = false;
    148         quantityType = QuantifierFixedCount;
    149         quantityCount = 1;
    150     }
    151 
    152     PatternTerm(Type type, bool invert = false)
    153         : type(type)
    154         , m_capture(false)
    155         , m_invert(invert)
    156     {
    157         quantityType = QuantifierFixedCount;
    158         quantityCount = 1;
    159     }
    160 
    161     PatternTerm(unsigned spatternId)
    162         : type(TypeBackReference)
    163         , m_capture(false)
    164         , m_invert(false)
    165     {
    166         backReferenceSubpatternId = spatternId;
    167         quantityType = QuantifierFixedCount;
    168         quantityCount = 1;
    169     }
    170 
    171     static PatternTerm ForwardReference()
    172     {
    173         return PatternTerm(TypeForwardReference);
    174     }
    175 
    176     static PatternTerm BOL()
    177     {
    178         return PatternTerm(TypeAssertionBOL);
    179     }
    180 
    181     static PatternTerm EOL()
    182     {
    183         return PatternTerm(TypeAssertionEOL);
    184     }
    185 
    186     static PatternTerm WordBoundary(bool invert)
    187     {
    188         return PatternTerm(TypeAssertionWordBoundary, invert);
    189     }
    190 
    191     bool invert()
    192     {
    193         return m_invert;
    194     }
    195 
    196     bool capture()
    197     {
    198         return m_capture;
    199     }
    200 
    201     void quantify(unsigned count, QuantifierType type)
    202     {
    203         quantityCount = count;
    204         quantityType = type;
    205     }
    206 };
    207 
    208 struct PatternAlternative {
    209     WTF_MAKE_FAST_ALLOCATED;
    210 public:
    211     PatternAlternative(PatternDisjunction* disjunction)
    212         : m_parent(disjunction)
    213         , m_onceThrough(false)
    214         , m_hasFixedSize(false)
    215         , m_startsWithBOL(false)
    216         , m_containsBOL(false)
    217     {
    218     }
    219 
    220     PatternTerm& lastTerm()
    221     {
    222         ASSERT(m_terms.size());
    223         return m_terms[m_terms.size() - 1];
    224     }
    225 
    226     void removeLastTerm()
    227     {
    228         ASSERT(m_terms.size());
    229         m_terms.shrink(m_terms.size() - 1);
    230     }
    231 
    232     void setOnceThrough()
    233     {
    234         m_onceThrough = true;
    235     }
    236 
    237     bool onceThrough()
    238     {
    239         return m_onceThrough;
    240     }
    241 
    242     Vector<PatternTerm> m_terms;
    243     PatternDisjunction* m_parent;
    244     unsigned m_minimumSize;
    245     bool m_onceThrough : 1;
    246     bool m_hasFixedSize : 1;
    247     bool m_startsWithBOL : 1;
    248     bool m_containsBOL : 1;
    249 };
    250 
    251 struct PatternDisjunction {
    252     WTF_MAKE_FAST_ALLOCATED;
    253 public:
    254     PatternDisjunction(PatternAlternative* parent = 0)
    255         : m_parent(parent)
    256         , m_hasFixedSize(false)
    257     {
    258     }
    259 
    260     ~PatternDisjunction()
    261     {
    262         deleteAllValues(m_alternatives);
    263     }
    264 
    265     PatternAlternative* addNewAlternative()
    266     {
    267         PatternAlternative* alternative = new PatternAlternative(this);
    268         m_alternatives.append(alternative);
    269         return alternative;
    270     }
    271 
    272     Vector<PatternAlternative*> m_alternatives;
    273     PatternAlternative* m_parent;
    274     unsigned m_minimumSize;
    275     unsigned m_callFrameSize;
    276     bool m_hasFixedSize;
    277 };
    278 
    279 // You probably don't want to be calling these functions directly
    280 // (please to be calling newlineCharacterClass() et al on your
    281 // friendly neighborhood YarrPattern instance to get nicely
    282 // cached copies).
    283 CharacterClass* newlineCreate();
    284 CharacterClass* digitsCreate();
    285 CharacterClass* spacesCreate();
    286 CharacterClass* wordcharCreate();
    287 CharacterClass* nondigitsCreate();
    288 CharacterClass* nonspacesCreate();
    289 CharacterClass* nonwordcharCreate();
    290 
    291 struct TermChain {
    292     TermChain(PatternTerm term)
    293         : term(term)
    294     {}
    295 
    296     PatternTerm term;
    297     Vector<TermChain> hotTerms;
    298 };
    299 
    300 struct BeginChar {
    301     BeginChar()
    302         : value(0)
    303         , mask(0)
    304     {}
    305 
    306     BeginChar(unsigned value, unsigned mask)
    307         : value(value)
    308         , mask(mask)
    309     {}
    310 
    311     unsigned value;
    312     unsigned mask;
    313 };
    314 
    315 struct YarrPattern {
    316     YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error);
    317 
    318     ~YarrPattern()
    319     {
    320         deleteAllValues(m_disjunctions);
    321         deleteAllValues(m_userCharacterClasses);
    322     }
    323 
    324     void reset()
    325     {
    326         m_numSubpatterns = 0;
    327         m_maxBackReference = 0;
    328 
    329         m_containsBackreferences = false;
    330         m_containsBeginChars = false;
    331         m_containsBOL = false;
    332 
    333         newlineCached = 0;
    334         digitsCached = 0;
    335         spacesCached = 0;
    336         wordcharCached = 0;
    337         nondigitsCached = 0;
    338         nonspacesCached = 0;
    339         nonwordcharCached = 0;
    340 
    341         deleteAllValues(m_disjunctions);
    342         m_disjunctions.clear();
    343         deleteAllValues(m_userCharacterClasses);
    344         m_userCharacterClasses.clear();
    345         m_beginChars.clear();
    346     }
    347 
    348     bool containsIllegalBackReference()
    349     {
    350         return m_maxBackReference > m_numSubpatterns;
    351     }
    352 
    353     CharacterClass* newlineCharacterClass()
    354     {
    355         if (!newlineCached)
    356             m_userCharacterClasses.append(newlineCached = newlineCreate());
    357         return newlineCached;
    358     }
    359     CharacterClass* digitsCharacterClass()
    360     {
    361         if (!digitsCached)
    362             m_userCharacterClasses.append(digitsCached = digitsCreate());
    363         return digitsCached;
    364     }
    365     CharacterClass* spacesCharacterClass()
    366     {
    367         if (!spacesCached)
    368             m_userCharacterClasses.append(spacesCached = spacesCreate());
    369         return spacesCached;
    370     }
    371     CharacterClass* wordcharCharacterClass()
    372     {
    373         if (!wordcharCached)
    374             m_userCharacterClasses.append(wordcharCached = wordcharCreate());
    375         return wordcharCached;
    376     }
    377     CharacterClass* nondigitsCharacterClass()
    378     {
    379         if (!nondigitsCached)
    380             m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
    381         return nondigitsCached;
    382     }
    383     CharacterClass* nonspacesCharacterClass()
    384     {
    385         if (!nonspacesCached)
    386             m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
    387         return nonspacesCached;
    388     }
    389     CharacterClass* nonwordcharCharacterClass()
    390     {
    391         if (!nonwordcharCached)
    392             m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
    393         return nonwordcharCached;
    394     }
    395 
    396     bool m_ignoreCase : 1;
    397     bool m_multiline : 1;
    398     bool m_containsBackreferences : 1;
    399     bool m_containsBeginChars : 1;
    400     bool m_containsBOL : 1;
    401     unsigned m_numSubpatterns;
    402     unsigned m_maxBackReference;
    403     PatternDisjunction* m_body;
    404     Vector<PatternDisjunction*, 4> m_disjunctions;
    405     Vector<CharacterClass*> m_userCharacterClasses;
    406     Vector<BeginChar> m_beginChars;
    407 
    408 private:
    409     const char* compile(const UString& patternString);
    410 
    411     CharacterClass* newlineCached;
    412     CharacterClass* digitsCached;
    413     CharacterClass* spacesCached;
    414     CharacterClass* wordcharCached;
    415     CharacterClass* nondigitsCached;
    416     CharacterClass* nonspacesCached;
    417     CharacterClass* nonwordcharCached;
    418 };
    419 
    420 } } // namespace JSC::Yarr
    421 
    422 #endif // YarrPattern_h
    423