Home | History | Annotate | Download | only in yarr
      1 /*
      2  * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef YarrInterpreter_h
     27 #define YarrInterpreter_h
     28 
     29 #include "YarrPattern.h"
     30 #include <wtf/PassOwnPtr.h>
     31 #include <wtf/unicode/Unicode.h>
     32 
     33 namespace WTF {
     34 class BumpPointerAllocator;
     35 }
     36 using WTF::BumpPointerAllocator;
     37 
     38 namespace JSC { namespace Yarr {
     39 
     40 class ByteDisjunction;
     41 
     42 struct ByteTerm {
     43     enum Type {
     44         TypeBodyAlternativeBegin,
     45         TypeBodyAlternativeDisjunction,
     46         TypeBodyAlternativeEnd,
     47         TypeAlternativeBegin,
     48         TypeAlternativeDisjunction,
     49         TypeAlternativeEnd,
     50         TypeSubpatternBegin,
     51         TypeSubpatternEnd,
     52         TypeAssertionBOL,
     53         TypeAssertionEOL,
     54         TypeAssertionWordBoundary,
     55         TypePatternCharacterOnce,
     56         TypePatternCharacterFixed,
     57         TypePatternCharacterGreedy,
     58         TypePatternCharacterNonGreedy,
     59         TypePatternCasedCharacterOnce,
     60         TypePatternCasedCharacterFixed,
     61         TypePatternCasedCharacterGreedy,
     62         TypePatternCasedCharacterNonGreedy,
     63         TypeCharacterClass,
     64         TypeBackReference,
     65         TypeParenthesesSubpattern,
     66         TypeParenthesesSubpatternOnceBegin,
     67         TypeParenthesesSubpatternOnceEnd,
     68         TypeParenthesesSubpatternTerminalBegin,
     69         TypeParenthesesSubpatternTerminalEnd,
     70         TypeParentheticalAssertionBegin,
     71         TypeParentheticalAssertionEnd,
     72         TypeCheckInput,
     73         TypeUncheckInput,
     74     } type;
     75     union {
     76         struct {
     77             union {
     78                 UChar patternCharacter;
     79                 struct {
     80                     UChar lo;
     81                     UChar hi;
     82                 } casedCharacter;
     83                 CharacterClass* characterClass;
     84                 unsigned subpatternId;
     85             };
     86             union {
     87                 ByteDisjunction* parenthesesDisjunction;
     88                 unsigned parenthesesWidth;
     89             };
     90             QuantifierType quantityType;
     91             unsigned quantityCount;
     92         } atom;
     93         struct {
     94             int next;
     95             int end;
     96             bool onceThrough;
     97         } alternative;
     98         unsigned checkInputCount;
     99     };
    100     unsigned frameLocation;
    101     bool m_capture : 1;
    102     bool m_invert : 1;
    103     int inputPosition;
    104 
    105     ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
    106         : frameLocation(frameLocation)
    107         , m_capture(false)
    108         , m_invert(false)
    109     {
    110         switch (quantityType) {
    111         case QuantifierFixedCount:
    112             type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
    113             break;
    114         case QuantifierGreedy:
    115             type = ByteTerm::TypePatternCharacterGreedy;
    116             break;
    117         case QuantifierNonGreedy:
    118             type = ByteTerm::TypePatternCharacterNonGreedy;
    119             break;
    120         }
    121 
    122         atom.patternCharacter = ch;
    123         atom.quantityType = quantityType;
    124         atom.quantityCount = quantityCount;
    125         inputPosition = inputPos;
    126     }
    127 
    128     ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
    129         : frameLocation(frameLocation)
    130         , m_capture(false)
    131         , m_invert(false)
    132     {
    133         switch (quantityType) {
    134         case QuantifierFixedCount:
    135             type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
    136             break;
    137         case QuantifierGreedy:
    138             type = ByteTerm::TypePatternCasedCharacterGreedy;
    139             break;
    140         case QuantifierNonGreedy:
    141             type = ByteTerm::TypePatternCasedCharacterNonGreedy;
    142             break;
    143         }
    144 
    145         atom.casedCharacter.lo = lo;
    146         atom.casedCharacter.hi = hi;
    147         atom.quantityType = quantityType;
    148         atom.quantityCount = quantityCount;
    149         inputPosition = inputPos;
    150     }
    151 
    152     ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
    153         : type(ByteTerm::TypeCharacterClass)
    154         , m_capture(false)
    155         , m_invert(invert)
    156     {
    157         atom.characterClass = characterClass;
    158         atom.quantityType = QuantifierFixedCount;
    159         atom.quantityCount = 1;
    160         inputPosition = inputPos;
    161     }
    162 
    163     ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos)
    164         : type(type)
    165         , m_capture(capture)
    166         , m_invert(false)
    167     {
    168         atom.subpatternId = subpatternId;
    169         atom.parenthesesDisjunction = parenthesesInfo;
    170         atom.quantityType = QuantifierFixedCount;
    171         atom.quantityCount = 1;
    172         inputPosition = inputPos;
    173     }
    174 
    175     ByteTerm(Type type, bool invert = false)
    176         : type(type)
    177         , m_capture(false)
    178         , m_invert(invert)
    179     {
    180         atom.quantityType = QuantifierFixedCount;
    181         atom.quantityCount = 1;
    182     }
    183 
    184     ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos)
    185         : type(type)
    186         , m_capture(capture)
    187         , m_invert(invert)
    188     {
    189         atom.subpatternId = subpatternId;
    190         atom.quantityType = QuantifierFixedCount;
    191         atom.quantityCount = 1;
    192         inputPosition = inputPos;
    193     }
    194 
    195     static ByteTerm BOL(int inputPos)
    196     {
    197         ByteTerm term(TypeAssertionBOL);
    198         term.inputPosition = inputPos;
    199         return term;
    200     }
    201 
    202     static ByteTerm CheckInput(unsigned count)
    203     {
    204         ByteTerm term(TypeCheckInput);
    205         term.checkInputCount = count;
    206         return term;
    207     }
    208 
    209     static ByteTerm UncheckInput(unsigned count)
    210     {
    211         ByteTerm term(TypeUncheckInput);
    212         term.checkInputCount = count;
    213         return term;
    214     }
    215 
    216     static ByteTerm EOL(int inputPos)
    217     {
    218         ByteTerm term(TypeAssertionEOL);
    219         term.inputPosition = inputPos;
    220         return term;
    221     }
    222 
    223     static ByteTerm WordBoundary(bool invert, int inputPos)
    224     {
    225         ByteTerm term(TypeAssertionWordBoundary, invert);
    226         term.inputPosition = inputPos;
    227         return term;
    228     }
    229 
    230     static ByteTerm BackReference(unsigned subpatternId, int inputPos)
    231     {
    232         return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
    233     }
    234 
    235     static ByteTerm BodyAlternativeBegin(bool onceThrough)
    236     {
    237         ByteTerm term(TypeBodyAlternativeBegin);
    238         term.alternative.next = 0;
    239         term.alternative.end = 0;
    240         term.alternative.onceThrough = onceThrough;
    241         return term;
    242     }
    243 
    244     static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
    245     {
    246         ByteTerm term(TypeBodyAlternativeDisjunction);
    247         term.alternative.next = 0;
    248         term.alternative.end = 0;
    249         term.alternative.onceThrough = onceThrough;
    250         return term;
    251     }
    252 
    253     static ByteTerm BodyAlternativeEnd()
    254     {
    255         ByteTerm term(TypeBodyAlternativeEnd);
    256         term.alternative.next = 0;
    257         term.alternative.end = 0;
    258         term.alternative.onceThrough = false;
    259         return term;
    260     }
    261 
    262     static ByteTerm AlternativeBegin()
    263     {
    264         ByteTerm term(TypeAlternativeBegin);
    265         term.alternative.next = 0;
    266         term.alternative.end = 0;
    267         term.alternative.onceThrough = false;
    268         return term;
    269     }
    270 
    271     static ByteTerm AlternativeDisjunction()
    272     {
    273         ByteTerm term(TypeAlternativeDisjunction);
    274         term.alternative.next = 0;
    275         term.alternative.end = 0;
    276         term.alternative.onceThrough = false;
    277         return term;
    278     }
    279 
    280     static ByteTerm AlternativeEnd()
    281     {
    282         ByteTerm term(TypeAlternativeEnd);
    283         term.alternative.next = 0;
    284         term.alternative.end = 0;
    285         term.alternative.onceThrough = false;
    286         return term;
    287     }
    288 
    289     static ByteTerm SubpatternBegin()
    290     {
    291         return ByteTerm(TypeSubpatternBegin);
    292     }
    293 
    294     static ByteTerm SubpatternEnd()
    295     {
    296         return ByteTerm(TypeSubpatternEnd);
    297     }
    298 
    299     bool invert()
    300     {
    301         return m_invert;
    302     }
    303 
    304     bool capture()
    305     {
    306         return m_capture;
    307     }
    308 };
    309 
    310 class ByteDisjunction {
    311     WTF_MAKE_FAST_ALLOCATED;
    312 public:
    313     ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
    314         : m_numSubpatterns(numSubpatterns)
    315         , m_frameSize(frameSize)
    316     {
    317     }
    318 
    319     Vector<ByteTerm> terms;
    320     unsigned m_numSubpatterns;
    321     unsigned m_frameSize;
    322 };
    323 
    324 struct BytecodePattern {
    325     WTF_MAKE_FAST_ALLOCATED;
    326 public:
    327     BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator)
    328         : m_body(body)
    329         , m_ignoreCase(pattern.m_ignoreCase)
    330         , m_multiline(pattern.m_multiline)
    331         , m_containsBeginChars(pattern.m_containsBeginChars)
    332         , m_allocator(allocator)
    333     {
    334         newlineCharacterClass = pattern.newlineCharacterClass();
    335         wordcharCharacterClass = pattern.wordcharCharacterClass();
    336 
    337         m_allParenthesesInfo.append(allParenthesesInfo);
    338         m_userCharacterClasses.append(pattern.m_userCharacterClasses);
    339         // 'Steal' the YarrPattern's CharacterClasses!  We clear its
    340         // array, so that it won't delete them on destruction.  We'll
    341         // take responsibility for that.
    342         pattern.m_userCharacterClasses.clear();
    343 
    344         m_beginChars.append(pattern.m_beginChars);
    345     }
    346 
    347     ~BytecodePattern()
    348     {
    349         deleteAllValues(m_allParenthesesInfo);
    350         deleteAllValues(m_userCharacterClasses);
    351     }
    352 
    353     OwnPtr<ByteDisjunction> m_body;
    354     bool m_ignoreCase;
    355     bool m_multiline;
    356     bool m_containsBeginChars;
    357     // Each BytecodePattern is associated with a RegExp, each RegExp is associated
    358     // with a JSGlobalData.  Cache a pointer to out JSGlobalData's m_regExpAllocator.
    359     BumpPointerAllocator* m_allocator;
    360 
    361     CharacterClass* newlineCharacterClass;
    362     CharacterClass* wordcharCharacterClass;
    363 
    364     Vector<BeginChar> m_beginChars;
    365 
    366 private:
    367     Vector<ByteDisjunction*> m_allParenthesesInfo;
    368     Vector<CharacterClass*> m_userCharacterClasses;
    369 };
    370 
    371 } } // namespace JSC::Yarr
    372 
    373 #endif // YarrInterpreter_h
    374