Home | History | Annotate | Download | only in yarr
      1 /*
      2  * Copyright (C) 2009 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef RegexInterpreter_h
     27 #define RegexInterpreter_h
     28 
     29 #include <wtf/Platform.h>
     30 
     31 #if ENABLE(YARR)
     32 
     33 #include <wtf/unicode/Unicode.h>
     34 #include "RegexParser.h"
     35 #include "RegexPattern.h"
     36 
     37 namespace JSC { namespace Yarr {
     38 
     39 class ByteDisjunction;
     40 
     41 struct ByteTerm {
     42     enum Type {
     43         TypeBodyAlternativeBegin,
     44         TypeBodyAlternativeDisjunction,
     45         TypeBodyAlternativeEnd,
     46         TypeAlternativeBegin,
     47         TypeAlternativeDisjunction,
     48         TypeAlternativeEnd,
     49         TypeSubpatternBegin,
     50         TypeSubpatternEnd,
     51         TypeAssertionBOL,
     52         TypeAssertionEOL,
     53         TypeAssertionWordBoundary,
     54         TypePatternCharacterOnce,
     55         TypePatternCharacterFixed,
     56         TypePatternCharacterGreedy,
     57         TypePatternCharacterNonGreedy,
     58         TypePatternCasedCharacterOnce,
     59         TypePatternCasedCharacterFixed,
     60         TypePatternCasedCharacterGreedy,
     61         TypePatternCasedCharacterNonGreedy,
     62         TypeCharacterClass,
     63         TypeBackReference,
     64         TypeParenthesesSubpattern,
     65         TypeParenthesesSubpatternOnceBegin,
     66         TypeParenthesesSubpatternOnceEnd,
     67         TypeParentheticalAssertionBegin,
     68         TypeParentheticalAssertionEnd,
     69         TypeCheckInput,
     70     } type;
     71     bool invertOrCapture;
     72     union {
     73         struct {
     74             union {
     75                 UChar patternCharacter;
     76                 struct {
     77                     UChar lo;
     78                     UChar hi;
     79                 } casedCharacter;
     80                 CharacterClass* characterClass;
     81                 unsigned subpatternId;
     82             };
     83             union {
     84                 ByteDisjunction* parenthesesDisjunction;
     85                 unsigned parenthesesWidth;
     86             };
     87             QuantifierType quantityType;
     88             unsigned quantityCount;
     89         } atom;
     90         struct {
     91             int next;
     92             int end;
     93         } alternative;
     94         unsigned checkInputCount;
     95     };
     96     unsigned frameLocation;
     97     int inputPosition;
     98 
     99     ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
    100         : frameLocation(frameLocation)
    101     {
    102         switch (quantityType) {
    103         case QuantifierFixedCount:
    104             type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
    105             break;
    106         case QuantifierGreedy:
    107             type = ByteTerm::TypePatternCharacterGreedy;
    108             break;
    109         case QuantifierNonGreedy:
    110             type = ByteTerm::TypePatternCharacterNonGreedy;
    111             break;
    112         }
    113 
    114         atom.patternCharacter = ch;
    115         atom.quantityType = quantityType;
    116         atom.quantityCount = quantityCount;
    117         inputPosition = inputPos;
    118     }
    119 
    120     ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
    121         : frameLocation(frameLocation)
    122     {
    123         switch (quantityType) {
    124         case QuantifierFixedCount:
    125             type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
    126             break;
    127         case QuantifierGreedy:
    128             type = ByteTerm::TypePatternCasedCharacterGreedy;
    129             break;
    130         case QuantifierNonGreedy:
    131             type = ByteTerm::TypePatternCasedCharacterNonGreedy;
    132             break;
    133         }
    134 
    135         atom.casedCharacter.lo = lo;
    136         atom.casedCharacter.hi = hi;
    137         atom.quantityType = quantityType;
    138         atom.quantityCount = quantityCount;
    139         inputPosition = inputPos;
    140     }
    141 
    142     ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
    143         : type(ByteTerm::TypeCharacterClass)
    144         , invertOrCapture(invert)
    145     {
    146         atom.characterClass = characterClass;
    147         atom.quantityType = QuantifierFixedCount;
    148         atom.quantityCount = 1;
    149         inputPosition = inputPos;
    150     }
    151 
    152     ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos)
    153         : type(type)
    154         , invertOrCapture(invertOrCapture)
    155     {
    156         atom.subpatternId = subpatternId;
    157         atom.parenthesesDisjunction = parenthesesInfo;
    158         atom.quantityType = QuantifierFixedCount;
    159         atom.quantityCount = 1;
    160         inputPosition = inputPos;
    161     }
    162 
    163     ByteTerm(Type type, bool invert = false)
    164         : type(type)
    165         , invertOrCapture(invert)
    166     {
    167         atom.quantityType = QuantifierFixedCount;
    168         atom.quantityCount = 1;
    169     }
    170 
    171     ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos)
    172         : type(type)
    173         , invertOrCapture(invertOrCapture)
    174     {
    175         atom.subpatternId = subpatternId;
    176         atom.quantityType = QuantifierFixedCount;
    177         atom.quantityCount = 1;
    178         inputPosition = inputPos;
    179     }
    180 
    181     static ByteTerm BOL(int inputPos)
    182     {
    183         ByteTerm term(TypeAssertionBOL);
    184         term.inputPosition = inputPos;
    185         return term;
    186     }
    187 
    188     static ByteTerm CheckInput(unsigned count)
    189     {
    190         ByteTerm term(TypeCheckInput);
    191         term.checkInputCount = count;
    192         return term;
    193     }
    194 
    195     static ByteTerm EOL(int inputPos)
    196     {
    197         ByteTerm term(TypeAssertionEOL);
    198         term.inputPosition = inputPos;
    199         return term;
    200     }
    201 
    202     static ByteTerm WordBoundary(bool invert, int inputPos)
    203     {
    204         ByteTerm term(TypeAssertionWordBoundary, invert);
    205         term.inputPosition = inputPos;
    206         return term;
    207     }
    208 
    209     static ByteTerm BackReference(unsigned subpatternId, int inputPos)
    210     {
    211         return ByteTerm(TypeBackReference, subpatternId, false, inputPos);
    212     }
    213 
    214     static ByteTerm BodyAlternativeBegin()
    215     {
    216         ByteTerm term(TypeBodyAlternativeBegin);
    217         term.alternative.next = 0;
    218         term.alternative.end = 0;
    219         return term;
    220     }
    221 
    222     static ByteTerm BodyAlternativeDisjunction()
    223     {
    224         ByteTerm term(TypeBodyAlternativeDisjunction);
    225         term.alternative.next = 0;
    226         term.alternative.end = 0;
    227         return term;
    228     }
    229 
    230     static ByteTerm BodyAlternativeEnd()
    231     {
    232         ByteTerm term(TypeBodyAlternativeEnd);
    233         term.alternative.next = 0;
    234         term.alternative.end = 0;
    235         return term;
    236     }
    237 
    238     static ByteTerm AlternativeBegin()
    239     {
    240         ByteTerm term(TypeAlternativeBegin);
    241         term.alternative.next = 0;
    242         term.alternative.end = 0;
    243         return term;
    244     }
    245 
    246     static ByteTerm AlternativeDisjunction()
    247     {
    248         ByteTerm term(TypeAlternativeDisjunction);
    249         term.alternative.next = 0;
    250         term.alternative.end = 0;
    251         return term;
    252     }
    253 
    254     static ByteTerm AlternativeEnd()
    255     {
    256         ByteTerm term(TypeAlternativeEnd);
    257         term.alternative.next = 0;
    258         term.alternative.end = 0;
    259         return term;
    260     }
    261 
    262     static ByteTerm SubpatternBegin()
    263     {
    264         return ByteTerm(TypeSubpatternBegin);
    265     }
    266 
    267     static ByteTerm SubpatternEnd()
    268     {
    269         return ByteTerm(TypeSubpatternEnd);
    270     }
    271 
    272     bool invert()
    273     {
    274         return invertOrCapture;
    275     }
    276 
    277     bool capture()
    278     {
    279         return invertOrCapture;
    280     }
    281 };
    282 
    283 class ByteDisjunction : public FastAllocBase {
    284 public:
    285     ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
    286         : m_numSubpatterns(numSubpatterns)
    287         , m_frameSize(frameSize)
    288     {
    289     }
    290 
    291     Vector<ByteTerm> terms;
    292     unsigned m_numSubpatterns;
    293     unsigned m_frameSize;
    294 };
    295 
    296 struct BytecodePattern : FastAllocBase {
    297     BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern)
    298         : m_body(body)
    299         , m_ignoreCase(pattern.m_ignoreCase)
    300         , m_multiline(pattern.m_multiline)
    301     {
    302         newlineCharacterClass = pattern.newlineCharacterClass();
    303         wordcharCharacterClass = pattern.wordcharCharacterClass();
    304 
    305         m_allParenthesesInfo.append(allParenthesesInfo);
    306         m_userCharacterClasses.append(pattern.m_userCharacterClasses);
    307         // 'Steal' the RegexPattern's CharacterClasses!  We clear its
    308         // array, so that it won't delete them on destruction.  We'll
    309         // take responsibility for that.
    310         pattern.m_userCharacterClasses.clear();
    311     }
    312 
    313     ~BytecodePattern()
    314     {
    315         deleteAllValues(m_allParenthesesInfo);
    316         deleteAllValues(m_userCharacterClasses);
    317     }
    318 
    319     OwnPtr<ByteDisjunction> m_body;
    320     bool m_ignoreCase;
    321     bool m_multiline;
    322 
    323     CharacterClass* newlineCharacterClass;
    324     CharacterClass* wordcharCharacterClass;
    325 private:
    326     Vector<ByteDisjunction*> m_allParenthesesInfo;
    327     Vector<CharacterClass*> m_userCharacterClasses;
    328 };
    329 
    330 BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
    331 int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output);
    332 
    333 } } // namespace JSC::Yarr
    334 
    335 #endif
    336 
    337 #endif // RegexInterpreter_h
    338