Home | History | Annotate | Download | only in wrec
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "WRECParser.h"
     28 
     29 #if ENABLE(WREC)
     30 
     31 #include "CharacterClassConstructor.h"
     32 #include "WRECFunctors.h"
     33 
     34 using namespace WTF;
     35 
     36 namespace JSC { namespace WREC {
     37 
     38 // These error messages match the error messages used by PCRE.
     39 const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier";
     40 const char* Parser::QuantifierWithoutAtom = "nothing to repeat";
     41 const char* Parser::ParenthesesUnmatched = "unmatched parentheses";
     42 const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?";
     43 const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet.
     44 const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class";
     45 const char* Parser::CharacterClassOutOfOrder = "range out of order in character class";
     46 const char* Parser::EscapeUnterminated = "\\ at end of pattern";
     47 
     48 class PatternCharacterSequence {
     49 typedef Generator::JumpList JumpList;
     50 
     51 public:
     52     PatternCharacterSequence(Generator& generator, JumpList& failures)
     53         : m_generator(generator)
     54         , m_failures(failures)
     55     {
     56     }
     57 
     58     size_t size() { return m_sequence.size(); }
     59 
     60     void append(int ch)
     61     {
     62         m_sequence.append(ch);
     63     }
     64 
     65     void flush()
     66     {
     67         if (!m_sequence.size())
     68             return;
     69 
     70         m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size());
     71         m_sequence.clear();
     72     }
     73 
     74     void flush(const Quantifier& quantifier)
     75     {
     76         if (!m_sequence.size())
     77             return;
     78 
     79         m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1);
     80 
     81         switch (quantifier.type) {
     82         case Quantifier::None:
     83         case Quantifier::Error:
     84             ASSERT_NOT_REACHED();
     85             break;
     86 
     87         case Quantifier::Greedy: {
     88             GeneratePatternCharacterFunctor functor(m_sequence.last());
     89             m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
     90             break;
     91         }
     92 
     93         case Quantifier::NonGreedy: {
     94             GeneratePatternCharacterFunctor functor(m_sequence.last());
     95             m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
     96             break;
     97         }
     98         }
     99 
    100         m_sequence.clear();
    101     }
    102 
    103 private:
    104     Generator& m_generator;
    105     JumpList& m_failures;
    106     Vector<int, 8> m_sequence;
    107 };
    108 
    109 ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier()
    110 {
    111     switch (peek()) {
    112         case '?':
    113             consume();
    114             return Quantifier(Quantifier::Greedy, 0, 1);
    115 
    116         case '*':
    117             consume();
    118             return Quantifier(Quantifier::Greedy, 0);
    119 
    120         case '+':
    121             consume();
    122             return Quantifier(Quantifier::Greedy, 1);
    123 
    124         case '{': {
    125             SavedState state(*this);
    126             consume();
    127 
    128             // Accept: {n}, {n,}, {n,m}.
    129             // Reject: {n,m} where n > m.
    130             // Ignore: Anything else, such as {n, m}.
    131 
    132             if (!peekIsDigit()) {
    133                 state.restore();
    134                 return Quantifier();
    135             }
    136 
    137             unsigned min = consumeNumber();
    138             unsigned max = min;
    139 
    140             if (peek() == ',') {
    141                 consume();
    142                 max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity;
    143             }
    144 
    145             if (peek() != '}') {
    146                 state.restore();
    147                 return Quantifier();
    148             }
    149             consume();
    150 
    151             if (min > max) {
    152                 setError(QuantifierOutOfOrder);
    153                 return Quantifier(Quantifier::Error);
    154             }
    155 
    156             return Quantifier(Quantifier::Greedy, min, max);
    157          }
    158 
    159          default:
    160             return Quantifier(); // No quantifier.
    161     }
    162 }
    163 
    164 Quantifier Parser::consumeQuantifier()
    165 {
    166     Quantifier q = consumeGreedyQuantifier();
    167 
    168     if ((q.type == Quantifier::Greedy) && (peek() == '?')) {
    169         consume();
    170         q.type = Quantifier::NonGreedy;
    171     }
    172 
    173     return q;
    174 }
    175 
    176 bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert)
    177 {
    178     Quantifier q = consumeQuantifier();
    179 
    180     switch (q.type) {
    181     case Quantifier::None: {
    182         m_generator.generateCharacterClass(failures, charClass, invert);
    183         break;
    184     }
    185 
    186     case Quantifier::Greedy: {
    187         GenerateCharacterClassFunctor functor(&charClass, invert);
    188         m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max);
    189         break;
    190     }
    191 
    192     case Quantifier::NonGreedy: {
    193         GenerateCharacterClassFunctor functor(&charClass, invert);
    194         m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max);
    195         break;
    196     }
    197 
    198     case Quantifier::Error:
    199         return false;
    200     }
    201 
    202     return true;
    203 }
    204 
    205 bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId)
    206 {
    207     Quantifier q = consumeQuantifier();
    208 
    209     switch (q.type) {
    210     case Quantifier::None: {
    211         m_generator.generateBackreference(failures, subpatternId);
    212         break;
    213     }
    214 
    215     case Quantifier::Greedy:
    216     case Quantifier::NonGreedy:
    217         m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max);
    218         return true;
    219 
    220     case Quantifier::Error:
    221         return false;
    222     }
    223 
    224     return true;
    225 }
    226 
    227 bool Parser::parseParentheses(JumpList& failures)
    228 {
    229     ParenthesesType type = consumeParenthesesType();
    230 
    231     // FIXME: WREC originally failed to backtrack correctly in cases such as
    232     // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For
    233     // unsupported parentheses, we fall back on PCRE.
    234 
    235     switch (type) {
    236         case Generator::Assertion: {
    237             m_generator.generateParenthesesAssertion(failures);
    238 
    239             if (consume() != ')') {
    240                 setError(ParenthesesUnmatched);
    241                 return false;
    242             }
    243 
    244             Quantifier quantifier = consumeQuantifier();
    245             if (quantifier.type != Quantifier::None && quantifier.min == 0) {
    246                 setError(ParenthesesNotSupported);
    247                 return false;
    248             }
    249 
    250             return true;
    251         }
    252         case Generator::InvertedAssertion: {
    253             m_generator.generateParenthesesInvertedAssertion(failures);
    254 
    255             if (consume() != ')') {
    256                 setError(ParenthesesUnmatched);
    257                 return false;
    258             }
    259 
    260             Quantifier quantifier = consumeQuantifier();
    261             if (quantifier.type != Quantifier::None && quantifier.min == 0) {
    262                 setError(ParenthesesNotSupported);
    263                 return false;
    264             }
    265 
    266             return true;
    267         }
    268         default:
    269             setError(ParenthesesNotSupported);
    270             return false;
    271     }
    272 }
    273 
    274 bool Parser::parseCharacterClass(JumpList& failures)
    275 {
    276     bool invert = false;
    277     if (peek() == '^') {
    278         consume();
    279         invert = true;
    280     }
    281 
    282     CharacterClassConstructor constructor(m_ignoreCase);
    283 
    284     int ch;
    285     while ((ch = peek()) != ']') {
    286         switch (ch) {
    287         case EndOfPattern:
    288             setError(CharacterClassUnmatched);
    289             return false;
    290 
    291         case '\\': {
    292             consume();
    293             Escape escape = consumeEscape(true);
    294 
    295             switch (escape.type()) {
    296                 case Escape::PatternCharacter: {
    297                     int character = PatternCharacterEscape::cast(escape).character();
    298                     if (character == '-')
    299                         constructor.flushBeforeEscapedHyphen();
    300                     constructor.put(character);
    301                     break;
    302                 }
    303                 case Escape::CharacterClass: {
    304                     const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape);
    305                     ASSERT(!characterClassEscape.invert());
    306                     constructor.append(characterClassEscape.characterClass());
    307                     break;
    308                 }
    309                 case Escape::Error:
    310                     return false;
    311                 case Escape::Backreference:
    312                 case Escape::WordBoundaryAssertion: {
    313                     ASSERT_NOT_REACHED();
    314                     break;
    315                 }
    316             }
    317             break;
    318         }
    319 
    320         default:
    321             consume();
    322             constructor.put(ch);
    323         }
    324     }
    325     consume();
    326 
    327     // lazily catch reversed ranges ([z-a])in character classes
    328     if (constructor.isUpsideDown()) {
    329         setError(CharacterClassOutOfOrder);
    330         return false;
    331     }
    332 
    333     constructor.flush();
    334     CharacterClass charClass = constructor.charClass();
    335     return parseCharacterClassQuantifier(failures, charClass, invert);
    336 }
    337 
    338 bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape)
    339 {
    340     switch (escape.type()) {
    341         case Escape::PatternCharacter:
    342             ASSERT_NOT_REACHED();
    343             return false;
    344 
    345         case Escape::CharacterClass:
    346             return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert());
    347 
    348         case Escape::Backreference:
    349             return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId());
    350 
    351         case Escape::WordBoundaryAssertion:
    352             m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert());
    353             return true;
    354 
    355         case Escape::Error:
    356             return false;
    357     }
    358 
    359     ASSERT_NOT_REACHED();
    360     return false;
    361 }
    362 
    363 Escape Parser::consumeEscape(bool inCharacterClass)
    364 {
    365     switch (peek()) {
    366     case EndOfPattern:
    367         setError(EscapeUnterminated);
    368         return Escape(Escape::Error);
    369 
    370     // Assertions
    371     case 'b':
    372         consume();
    373         if (inCharacterClass)
    374             return PatternCharacterEscape('\b');
    375         return WordBoundaryAssertionEscape(false); // do not invert
    376     case 'B':
    377         consume();
    378         if (inCharacterClass)
    379             return PatternCharacterEscape('B');
    380         return WordBoundaryAssertionEscape(true); // invert
    381 
    382     // CharacterClassEscape
    383     case 'd':
    384         consume();
    385         return CharacterClassEscape(CharacterClass::digits(), false);
    386     case 's':
    387         consume();
    388         return CharacterClassEscape(CharacterClass::spaces(), false);
    389     case 'w':
    390         consume();
    391         return CharacterClassEscape(CharacterClass::wordchar(), false);
    392     case 'D':
    393         consume();
    394         return inCharacterClass
    395             ? CharacterClassEscape(CharacterClass::nondigits(), false)
    396             : CharacterClassEscape(CharacterClass::digits(), true);
    397     case 'S':
    398         consume();
    399         return inCharacterClass
    400             ? CharacterClassEscape(CharacterClass::nonspaces(), false)
    401             : CharacterClassEscape(CharacterClass::spaces(), true);
    402     case 'W':
    403         consume();
    404         return inCharacterClass
    405             ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
    406             : CharacterClassEscape(CharacterClass::wordchar(), true);
    407 
    408     // DecimalEscape
    409     case '1':
    410     case '2':
    411     case '3':
    412     case '4':
    413     case '5':
    414     case '6':
    415     case '7':
    416     case '8':
    417     case '9': {
    418         if (peekDigit() > m_numSubpatterns || inCharacterClass) {
    419             // To match Firefox, we parse an invalid backreference in the range [1-7]
    420             // as an octal escape.
    421             return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
    422         }
    423 
    424         int value = 0;
    425         do {
    426             unsigned newValue = value * 10 + peekDigit();
    427             if (newValue > m_numSubpatterns)
    428                 break;
    429             value = newValue;
    430             consume();
    431         } while (peekIsDigit());
    432 
    433         return BackreferenceEscape(value);
    434     }
    435 
    436     // Octal escape
    437     case '0':
    438         consume();
    439         return PatternCharacterEscape(consumeOctal());
    440 
    441     // ControlEscape
    442     case 'f':
    443         consume();
    444         return PatternCharacterEscape('\f');
    445     case 'n':
    446         consume();
    447         return PatternCharacterEscape('\n');
    448     case 'r':
    449         consume();
    450         return PatternCharacterEscape('\r');
    451     case 't':
    452         consume();
    453         return PatternCharacterEscape('\t');
    454     case 'v':
    455         consume();
    456         return PatternCharacterEscape('\v');
    457 
    458     // ControlLetter
    459     case 'c': {
    460         SavedState state(*this);
    461         consume();
    462 
    463         int control = consume();
    464         // To match Firefox, inside a character class, we also accept numbers
    465         // and '_' as control characters.
    466         if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
    467             state.restore();
    468             return PatternCharacterEscape('\\');
    469         }
    470         return PatternCharacterEscape(control & 31);
    471     }
    472 
    473     // HexEscape
    474     case 'x': {
    475         consume();
    476 
    477         SavedState state(*this);
    478         int x = consumeHex(2);
    479         if (x == -1) {
    480             state.restore();
    481             return PatternCharacterEscape('x');
    482         }
    483         return PatternCharacterEscape(x);
    484     }
    485 
    486     // UnicodeEscape
    487     case 'u': {
    488         consume();
    489 
    490         SavedState state(*this);
    491         int x = consumeHex(4);
    492         if (x == -1) {
    493             state.restore();
    494             return PatternCharacterEscape('u');
    495         }
    496         return PatternCharacterEscape(x);
    497     }
    498 
    499     // IdentityEscape
    500     default:
    501         return PatternCharacterEscape(consume());
    502     }
    503 }
    504 
    505 void Parser::parseAlternative(JumpList& failures)
    506 {
    507     PatternCharacterSequence sequence(m_generator, failures);
    508 
    509     while (1) {
    510         switch (peek()) {
    511         case EndOfPattern:
    512         case '|':
    513         case ')':
    514             sequence.flush();
    515             return;
    516 
    517         case '*':
    518         case '+':
    519         case '?':
    520         case '{': {
    521             Quantifier q = consumeQuantifier();
    522 
    523             if (q.type == Quantifier::None) {
    524                 sequence.append(consume());
    525                 continue;
    526             }
    527 
    528             if (q.type == Quantifier::Error)
    529                 return;
    530 
    531             if (!sequence.size()) {
    532                 setError(QuantifierWithoutAtom);
    533                 return;
    534             }
    535 
    536             sequence.flush(q);
    537             continue;
    538         }
    539 
    540         case '^':
    541             consume();
    542 
    543             sequence.flush();
    544             m_generator.generateAssertionBOL(failures);
    545             continue;
    546 
    547         case '$':
    548             consume();
    549 
    550             sequence.flush();
    551             m_generator.generateAssertionEOL(failures);
    552             continue;
    553 
    554         case '.':
    555             consume();
    556 
    557             sequence.flush();
    558             if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
    559                 return;
    560             continue;
    561 
    562         case '[':
    563             consume();
    564 
    565             sequence.flush();
    566             if (!parseCharacterClass(failures))
    567                 return;
    568             continue;
    569 
    570         case '(':
    571             consume();
    572 
    573             sequence.flush();
    574             if (!parseParentheses(failures))
    575                 return;
    576             continue;
    577 
    578         case '\\': {
    579             consume();
    580 
    581             Escape escape = consumeEscape(false);
    582             if (escape.type() == Escape::PatternCharacter) {
    583                 sequence.append(PatternCharacterEscape::cast(escape).character());
    584                 continue;
    585             }
    586 
    587             sequence.flush();
    588             if (!parseNonCharacterEscape(failures, escape))
    589                 return;
    590             continue;
    591         }
    592 
    593         default:
    594             sequence.append(consume());
    595             continue;
    596         }
    597     }
    598 }
    599 
    600 /*
    601   TOS holds index.
    602 */
    603 void Parser::parseDisjunction(JumpList& failures)
    604 {
    605     parseAlternative(failures);
    606     if (peek() != '|')
    607         return;
    608 
    609     JumpList successes;
    610     do {
    611         consume();
    612         m_generator.terminateAlternative(successes, failures);
    613         parseAlternative(failures);
    614     } while (peek() == '|');
    615 
    616     m_generator.terminateDisjunction(successes);
    617 }
    618 
    619 Generator::ParenthesesType Parser::consumeParenthesesType()
    620 {
    621     if (peek() != '?')
    622         return Generator::Capturing;
    623     consume();
    624 
    625     switch (consume()) {
    626     case ':':
    627         return Generator::NonCapturing;
    628 
    629     case '=':
    630         return Generator::Assertion;
    631 
    632     case '!':
    633         return Generator::InvertedAssertion;
    634 
    635     default:
    636         setError(ParenthesesTypeInvalid);
    637         return Generator::Error;
    638     }
    639 }
    640 
    641 } } // namespace JSC::WREC
    642 
    643 #endif // ENABLE(WREC)
    644