Home | History | Annotate | Download | only in yarr
      1 /*
      2  * Copyright (C) 2009 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef YarrParser_h
     27 #define YarrParser_h
     28 
     29 #include <runtime/UString.h>
     30 #include "Yarr.h"
     31 #include <wtf/ASCIICType.h>
     32 #include <wtf/unicode/Unicode.h>
     33 
     34 namespace JSC { namespace Yarr {
     35 
     36 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
     37 
     38 enum BuiltInCharacterClassID {
     39     DigitClassID,
     40     SpaceClassID,
     41     WordClassID,
     42     NewlineClassID,
     43 };
     44 
     45 // The Parser class should not be used directly - only via the Yarr::parse() method.
     46 template<class Delegate>
     47 class Parser {
     48 private:
     49     template<class FriendDelegate>
     50     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
     51 
     52     enum ErrorCode {
     53         NoError,
     54         PatternTooLarge,
     55         QuantifierOutOfOrder,
     56         QuantifierWithoutAtom,
     57         MissingParentheses,
     58         ParenthesesUnmatched,
     59         ParenthesesTypeInvalid,
     60         CharacterClassUnmatched,
     61         CharacterClassOutOfOrder,
     62         EscapeUnterminated,
     63         NumberOfErrorCodes
     64     };
     65 
     66     /*
     67      * CharacterClassParserDelegate:
     68      *
     69      * The class CharacterClassParserDelegate is used in the parsing of character
     70      * classes.  This class handles detection of character ranges.  This class
     71      * implements enough of the delegate interface such that it can be passed to
     72      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
     73      * to perform the parsing of escape characters in character sets.
     74      */
     75     class CharacterClassParserDelegate {
     76     public:
     77         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
     78             : m_delegate(delegate)
     79             , m_err(err)
     80             , m_state(Empty)
     81             , m_character(0)
     82         {
     83         }
     84 
     85         /*
     86          * begin():
     87          *
     88          * Called at beginning of construction.
     89          */
     90         void begin(bool invert)
     91         {
     92             m_delegate.atomCharacterClassBegin(invert);
     93         }
     94 
     95         /*
     96          * atomPatternCharacter():
     97          *
     98          * This method is called either from parseCharacterClass() (for an unescaped
     99          * character in a character class), or from parseEscape(). In the former case
    100          * the value true will be passed for the argument 'hyphenIsRange', and in this
    101          * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
    102          * is different to /[a\-z]/).
    103          */
    104         void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
    105         {
    106             switch (m_state) {
    107             case AfterCharacterClass:
    108                 // Following a builtin character class we need look out for a hyphen.
    109                 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
    110                 // If we see a hyphen following a charater class then unlike usual
    111                 // we'll report it to the delegate immediately, and put ourself into
    112                 // a poisoned state. Any following calls to add another character or
    113                 // character class will result in an error. (A hypen following a
    114                 // character-class is itself valid, but only  at the end of a regex).
    115                 if (hyphenIsRange && ch == '-') {
    116                     m_delegate.atomCharacterClassAtom('-');
    117                     m_state = AfterCharacterClassHyphen;
    118                     return;
    119                 }
    120                 // Otherwise just fall through - cached character so treat this as Empty.
    121 
    122             case Empty:
    123                 m_character = ch;
    124                 m_state = CachedCharacter;
    125                 return;
    126 
    127             case CachedCharacter:
    128                 if (hyphenIsRange && ch == '-')
    129                     m_state = CachedCharacterHyphen;
    130                 else {
    131                     m_delegate.atomCharacterClassAtom(m_character);
    132                     m_character = ch;
    133                 }
    134                 return;
    135 
    136             case CachedCharacterHyphen:
    137                 if (ch < m_character) {
    138                     m_err = CharacterClassOutOfOrder;
    139                     return;
    140                 }
    141                 m_delegate.atomCharacterClassRange(m_character, ch);
    142                 m_state = Empty;
    143                 return;
    144 
    145                 // See coment in atomBuiltInCharacterClass below.
    146                 // This too is technically an error, per ECMA-262, and again we
    147                 // we chose to allow this.  Note a subtlely here that while we
    148                 // diverge from the spec's definition of CharacterRange we do
    149                 // remain in compliance with the grammar.  For example, consider
    150                 // the expression /[\d-a-z]/.  We comply with the grammar in
    151                 // this case by not allowing a-z to be matched as a range.
    152             case AfterCharacterClassHyphen:
    153                 m_delegate.atomCharacterClassAtom(ch);
    154                 m_state = Empty;
    155                 return;
    156             }
    157         }
    158 
    159         /*
    160          * atomBuiltInCharacterClass():
    161          *
    162          * Adds a built-in character class, called by parseEscape().
    163          */
    164         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
    165         {
    166             switch (m_state) {
    167             case CachedCharacter:
    168                 // Flush the currently cached character, then fall through.
    169                 m_delegate.atomCharacterClassAtom(m_character);
    170 
    171             case Empty:
    172             case AfterCharacterClass:
    173                 m_state = AfterCharacterClass;
    174                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
    175                 return;
    176 
    177                 // If we hit either of these cases, we have an invalid range that
    178                 // looks something like /[x-\d]/ or /[\d-\d]/.
    179                 // According to ECMA-262 this should be a syntax error, but
    180                 // empirical testing shows this to break teh webz.  Instead we
    181                 // comply with to the ECMA-262 grammar, and assume the grammar to
    182                 // have matched the range correctly, but tweak our interpretation
    183                 // of CharacterRange.  Effectively we implicitly handle the hyphen
    184                 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
    185             case CachedCharacterHyphen:
    186                 m_delegate.atomCharacterClassAtom(m_character);
    187                 m_delegate.atomCharacterClassAtom('-');
    188                 // fall through
    189             case AfterCharacterClassHyphen:
    190                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
    191                 m_state = Empty;
    192                 return;
    193             }
    194         }
    195 
    196         /*
    197          * end():
    198          *
    199          * Called at end of construction.
    200          */
    201         void end()
    202         {
    203             if (m_state == CachedCharacter)
    204                 m_delegate.atomCharacterClassAtom(m_character);
    205             else if (m_state == CachedCharacterHyphen) {
    206                 m_delegate.atomCharacterClassAtom(m_character);
    207                 m_delegate.atomCharacterClassAtom('-');
    208             }
    209             m_delegate.atomCharacterClassEnd();
    210         }
    211 
    212         // parseEscape() should never call these delegate methods when
    213         // invoked with inCharacterClass set.
    214         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
    215         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
    216 
    217     private:
    218         Delegate& m_delegate;
    219         ErrorCode& m_err;
    220         enum CharacterClassConstructionState {
    221             Empty,
    222             CachedCharacter,
    223             CachedCharacterHyphen,
    224             AfterCharacterClass,
    225             AfterCharacterClassHyphen,
    226         } m_state;
    227         UChar m_character;
    228     };
    229 
    230     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
    231         : m_delegate(delegate)
    232         , m_backReferenceLimit(backReferenceLimit)
    233         , m_err(NoError)
    234         , m_data(pattern.characters())
    235         , m_size(pattern.length())
    236         , m_index(0)
    237         , m_parenthesesNestingDepth(0)
    238     {
    239     }
    240 
    241     /*
    242      * parseEscape():
    243      *
    244      * Helper for parseTokens() AND parseCharacterClass().
    245      * Unlike the other parser methods, this function does not report tokens
    246      * directly to the member delegate (m_delegate), instead tokens are
    247      * emitted to the delegate provided as an argument.  In the case of atom
    248      * escapes, parseTokens() will call parseEscape() passing m_delegate as
    249      * an argument, and as such the escape will be reported to the delegate.
    250      *
    251      * However this method may also be used by parseCharacterClass(), in which
    252      * case a CharacterClassParserDelegate will be passed as the delegate that
    253      * tokens should be added to.  A boolean flag is also provided to indicate
    254      * whether that an escape in a CharacterClass is being parsed (some parsing
    255      * rules change in this context).
    256      *
    257      * The boolean value returned by this method indicates whether the token
    258      * parsed was an atom (outside of a characted class \b and \B will be
    259      * interpreted as assertions).
    260      */
    261     template<bool inCharacterClass, class EscapeDelegate>
    262     bool parseEscape(EscapeDelegate& delegate)
    263     {
    264         ASSERT(!m_err);
    265         ASSERT(peek() == '\\');
    266         consume();
    267 
    268         if (atEndOfPattern()) {
    269             m_err = EscapeUnterminated;
    270             return false;
    271         }
    272 
    273         switch (peek()) {
    274         // Assertions
    275         case 'b':
    276             consume();
    277             if (inCharacterClass)
    278                 delegate.atomPatternCharacter('\b');
    279             else {
    280                 delegate.assertionWordBoundary(false);
    281                 return false;
    282             }
    283             break;
    284         case 'B':
    285             consume();
    286             if (inCharacterClass)
    287                 delegate.atomPatternCharacter('B');
    288             else {
    289                 delegate.assertionWordBoundary(true);
    290                 return false;
    291             }
    292             break;
    293 
    294         // CharacterClassEscape
    295         case 'd':
    296             consume();
    297             delegate.atomBuiltInCharacterClass(DigitClassID, false);
    298             break;
    299         case 's':
    300             consume();
    301             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
    302             break;
    303         case 'w':
    304             consume();
    305             delegate.atomBuiltInCharacterClass(WordClassID, false);
    306             break;
    307         case 'D':
    308             consume();
    309             delegate.atomBuiltInCharacterClass(DigitClassID, true);
    310             break;
    311         case 'S':
    312             consume();
    313             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
    314             break;
    315         case 'W':
    316             consume();
    317             delegate.atomBuiltInCharacterClass(WordClassID, true);
    318             break;
    319 
    320         // DecimalEscape
    321         case '1':
    322         case '2':
    323         case '3':
    324         case '4':
    325         case '5':
    326         case '6':
    327         case '7':
    328         case '8':
    329         case '9': {
    330             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
    331             // First, try to parse this as backreference.
    332             if (!inCharacterClass) {
    333                 ParseState state = saveState();
    334 
    335                 unsigned backReference = consumeNumber();
    336                 if (backReference <= m_backReferenceLimit) {
    337                     delegate.atomBackReference(backReference);
    338                     break;
    339                 }
    340 
    341                 restoreState(state);
    342             }
    343 
    344             // Not a backreference, and not octal.
    345             if (peek() >= '8') {
    346                 delegate.atomPatternCharacter('\\');
    347                 break;
    348             }
    349 
    350             // Fall-through to handle this as an octal escape.
    351         }
    352 
    353         // Octal escape
    354         case '0':
    355             delegate.atomPatternCharacter(consumeOctal());
    356             break;
    357 
    358         // ControlEscape
    359         case 'f':
    360             consume();
    361             delegate.atomPatternCharacter('\f');
    362             break;
    363         case 'n':
    364             consume();
    365             delegate.atomPatternCharacter('\n');
    366             break;
    367         case 'r':
    368             consume();
    369             delegate.atomPatternCharacter('\r');
    370             break;
    371         case 't':
    372             consume();
    373             delegate.atomPatternCharacter('\t');
    374             break;
    375         case 'v':
    376             consume();
    377             delegate.atomPatternCharacter('\v');
    378             break;
    379 
    380         // ControlLetter
    381         case 'c': {
    382             ParseState state = saveState();
    383             consume();
    384             if (!atEndOfPattern()) {
    385                 int control = consume();
    386 
    387                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
    388                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
    389                     delegate.atomPatternCharacter(control & 0x1f);
    390                     break;
    391                 }
    392             }
    393             restoreState(state);
    394             delegate.atomPatternCharacter('\\');
    395             break;
    396         }
    397 
    398         // HexEscape
    399         case 'x': {
    400             consume();
    401             int x = tryConsumeHex(2);
    402             if (x == -1)
    403                 delegate.atomPatternCharacter('x');
    404             else
    405                 delegate.atomPatternCharacter(x);
    406             break;
    407         }
    408 
    409         // UnicodeEscape
    410         case 'u': {
    411             consume();
    412             int u = tryConsumeHex(4);
    413             if (u == -1)
    414                 delegate.atomPatternCharacter('u');
    415             else
    416                 delegate.atomPatternCharacter(u);
    417             break;
    418         }
    419 
    420         // IdentityEscape
    421         default:
    422             delegate.atomPatternCharacter(consume());
    423         }
    424 
    425         return true;
    426     }
    427 
    428     /*
    429      * parseAtomEscape(), parseCharacterClassEscape():
    430      *
    431      * These methods alias to parseEscape().
    432      */
    433     bool parseAtomEscape()
    434     {
    435         return parseEscape<false>(m_delegate);
    436     }
    437     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
    438     {
    439         parseEscape<true>(delegate);
    440     }
    441 
    442     /*
    443      * parseCharacterClass():
    444      *
    445      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
    446      * to an instance of CharacterClassParserDelegate, to describe the character class to the
    447      * delegate.
    448      */
    449     void parseCharacterClass()
    450     {
    451         ASSERT(!m_err);
    452         ASSERT(peek() == '[');
    453         consume();
    454 
    455         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
    456 
    457         characterClassConstructor.begin(tryConsume('^'));
    458 
    459         while (!atEndOfPattern()) {
    460             switch (peek()) {
    461             case ']':
    462                 consume();
    463                 characterClassConstructor.end();
    464                 return;
    465 
    466             case '\\':
    467                 parseCharacterClassEscape(characterClassConstructor);
    468                 break;
    469 
    470             default:
    471                 characterClassConstructor.atomPatternCharacter(consume(), true);
    472             }
    473 
    474             if (m_err)
    475                 return;
    476         }
    477 
    478         m_err = CharacterClassUnmatched;
    479     }
    480 
    481     /*
    482      * parseParenthesesBegin():
    483      *
    484      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
    485      */
    486     void parseParenthesesBegin()
    487     {
    488         ASSERT(!m_err);
    489         ASSERT(peek() == '(');
    490         consume();
    491 
    492         if (tryConsume('?')) {
    493             if (atEndOfPattern()) {
    494                 m_err = ParenthesesTypeInvalid;
    495                 return;
    496             }
    497 
    498             switch (consume()) {
    499             case ':':
    500                 m_delegate.atomParenthesesSubpatternBegin(false);
    501                 break;
    502 
    503             case '=':
    504                 m_delegate.atomParentheticalAssertionBegin();
    505                 break;
    506 
    507             case '!':
    508                 m_delegate.atomParentheticalAssertionBegin(true);
    509                 break;
    510 
    511             default:
    512                 m_err = ParenthesesTypeInvalid;
    513             }
    514         } else
    515             m_delegate.atomParenthesesSubpatternBegin();
    516 
    517         ++m_parenthesesNestingDepth;
    518     }
    519 
    520     /*
    521      * parseParenthesesEnd():
    522      *
    523      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
    524      */
    525     void parseParenthesesEnd()
    526     {
    527         ASSERT(!m_err);
    528         ASSERT(peek() == ')');
    529         consume();
    530 
    531         if (m_parenthesesNestingDepth > 0)
    532             m_delegate.atomParenthesesEnd();
    533         else
    534             m_err = ParenthesesUnmatched;
    535 
    536         --m_parenthesesNestingDepth;
    537     }
    538 
    539     /*
    540      * parseQuantifier():
    541      *
    542      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
    543      */
    544     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
    545     {
    546         ASSERT(!m_err);
    547         ASSERT(min <= max);
    548 
    549         if (lastTokenWasAnAtom)
    550             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
    551         else
    552             m_err = QuantifierWithoutAtom;
    553     }
    554 
    555     /*
    556      * parseTokens():
    557      *
    558      * This method loops over the input pattern reporting tokens to the delegate.
    559      * The method returns when a parse error is detected, or the end of the pattern
    560      * is reached.  One piece of state is tracked around the loop, which is whether
    561      * the last token passed to the delegate was an atom (this is necessary to detect
    562      * a parse error when a quantifier provided without an atom to quantify).
    563      */
    564     void parseTokens()
    565     {
    566         bool lastTokenWasAnAtom = false;
    567 
    568         while (!atEndOfPattern()) {
    569             switch (peek()) {
    570             case '|':
    571                 consume();
    572                 m_delegate.disjunction();
    573                 lastTokenWasAnAtom = false;
    574                 break;
    575 
    576             case '(':
    577                 parseParenthesesBegin();
    578                 lastTokenWasAnAtom = false;
    579                 break;
    580 
    581             case ')':
    582                 parseParenthesesEnd();
    583                 lastTokenWasAnAtom = true;
    584                 break;
    585 
    586             case '^':
    587                 consume();
    588                 m_delegate.assertionBOL();
    589                 lastTokenWasAnAtom = false;
    590                 break;
    591 
    592             case '$':
    593                 consume();
    594                 m_delegate.assertionEOL();
    595                 lastTokenWasAnAtom = false;
    596                 break;
    597 
    598             case '.':
    599                 consume();
    600                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
    601                 lastTokenWasAnAtom = true;
    602                 break;
    603 
    604             case '[':
    605                 parseCharacterClass();
    606                 lastTokenWasAnAtom = true;
    607                 break;
    608 
    609             case '\\':
    610                 lastTokenWasAnAtom = parseAtomEscape();
    611                 break;
    612 
    613             case '*':
    614                 consume();
    615                 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
    616                 lastTokenWasAnAtom = false;
    617                 break;
    618 
    619             case '+':
    620                 consume();
    621                 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
    622                 lastTokenWasAnAtom = false;
    623                 break;
    624 
    625             case '?':
    626                 consume();
    627                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
    628                 lastTokenWasAnAtom = false;
    629                 break;
    630 
    631             case '{': {
    632                 ParseState state = saveState();
    633 
    634                 consume();
    635                 if (peekIsDigit()) {
    636                     unsigned min = consumeNumber();
    637                     unsigned max = min;
    638 
    639                     if (tryConsume(','))
    640                         max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
    641 
    642                     if (tryConsume('}')) {
    643                         if (min <= max)
    644                             parseQuantifier(lastTokenWasAnAtom, min, max);
    645                         else
    646                             m_err = QuantifierOutOfOrder;
    647                         lastTokenWasAnAtom = false;
    648                         break;
    649                     }
    650                 }
    651 
    652                 restoreState(state);
    653             } // if we did not find a complete quantifer, fall through to the default case.
    654 
    655             default:
    656                 m_delegate.atomPatternCharacter(consume());
    657                 lastTokenWasAnAtom = true;
    658             }
    659 
    660             if (m_err)
    661                 return;
    662         }
    663 
    664         if (m_parenthesesNestingDepth > 0)
    665             m_err = MissingParentheses;
    666     }
    667 
    668     /*
    669      * parse():
    670      *
    671      * This method calls parseTokens() to parse over the input and converts any
    672      * error code to a const char* for a result.
    673      */
    674     const char* parse()
    675     {
    676         if (m_size > MAX_PATTERN_SIZE)
    677             m_err = PatternTooLarge;
    678         else
    679             parseTokens();
    680         ASSERT(atEndOfPattern() || m_err);
    681 
    682         // The order of this array must match the ErrorCode enum.
    683         static const char* errorMessages[NumberOfErrorCodes] = {
    684             0, // NoError
    685             REGEXP_ERROR_PREFIX "regular expression too large",
    686             REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
    687             REGEXP_ERROR_PREFIX "nothing to repeat",
    688             REGEXP_ERROR_PREFIX "missing )",
    689             REGEXP_ERROR_PREFIX "unmatched parentheses",
    690             REGEXP_ERROR_PREFIX "unrecognized character after (?",
    691             REGEXP_ERROR_PREFIX "missing terminating ] for character class",
    692             REGEXP_ERROR_PREFIX "range out of order in character class",
    693             REGEXP_ERROR_PREFIX "\\ at end of pattern"
    694         };
    695 
    696         return errorMessages[m_err];
    697     }
    698 
    699 
    700     // Misc helper functions:
    701 
    702     typedef unsigned ParseState;
    703 
    704     ParseState saveState()
    705     {
    706         return m_index;
    707     }
    708 
    709     void restoreState(ParseState state)
    710     {
    711         m_index = state;
    712     }
    713 
    714     bool atEndOfPattern()
    715     {
    716         ASSERT(m_index <= m_size);
    717         return m_index == m_size;
    718     }
    719 
    720     int peek()
    721     {
    722         ASSERT(m_index < m_size);
    723         return m_data[m_index];
    724     }
    725 
    726     bool peekIsDigit()
    727     {
    728         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
    729     }
    730 
    731     unsigned peekDigit()
    732     {
    733         ASSERT(peekIsDigit());
    734         return peek() - '0';
    735     }
    736 
    737     int consume()
    738     {
    739         ASSERT(m_index < m_size);
    740         return m_data[m_index++];
    741     }
    742 
    743     unsigned consumeDigit()
    744     {
    745         ASSERT(peekIsDigit());
    746         return consume() - '0';
    747     }
    748 
    749     unsigned consumeNumber()
    750     {
    751         unsigned n = consumeDigit();
    752         // check for overflow.
    753         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
    754             n = newValue;
    755             consume();
    756         }
    757         return n;
    758     }
    759 
    760     unsigned consumeOctal()
    761     {
    762         ASSERT(WTF::isASCIIOctalDigit(peek()));
    763 
    764         unsigned n = consumeDigit();
    765         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
    766             n = n * 8 + consumeDigit();
    767         return n;
    768     }
    769 
    770     bool tryConsume(UChar ch)
    771     {
    772         if (atEndOfPattern() || (m_data[m_index] != ch))
    773             return false;
    774         ++m_index;
    775         return true;
    776     }
    777 
    778     int tryConsumeHex(int count)
    779     {
    780         ParseState state = saveState();
    781 
    782         int n = 0;
    783         while (count--) {
    784             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
    785                 restoreState(state);
    786                 return -1;
    787             }
    788             n = (n << 4) | WTF::toASCIIHexValue(consume());
    789         }
    790         return n;
    791     }
    792 
    793     Delegate& m_delegate;
    794     unsigned m_backReferenceLimit;
    795     ErrorCode m_err;
    796     const UChar* m_data;
    797     unsigned m_size;
    798     unsigned m_index;
    799     unsigned m_parenthesesNestingDepth;
    800 
    801     // Derived by empirical testing of compile time in PCRE and WREC.
    802     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
    803 };
    804 
    805 /*
    806  * Yarr::parse():
    807  *
    808  * The parse method is passed a pattern to be parsed and a delegate upon which
    809  * callbacks will be made to record the parsed tokens forming the regex.
    810  * Yarr::parse() returns null on success, or a const C string providing an error
    811  * message where a parse error occurs.
    812  *
    813  * The Delegate must implement the following interface:
    814  *
    815  *    void assertionBOL();
    816  *    void assertionEOL();
    817  *    void assertionWordBoundary(bool invert);
    818  *
    819  *    void atomPatternCharacter(UChar ch);
    820  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
    821  *    void atomCharacterClassBegin(bool invert)
    822  *    void atomCharacterClassAtom(UChar ch)
    823  *    void atomCharacterClassRange(UChar begin, UChar end)
    824  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
    825  *    void atomCharacterClassEnd()
    826  *    void atomParenthesesSubpatternBegin(bool capture = true);
    827  *    void atomParentheticalAssertionBegin(bool invert = false);
    828  *    void atomParenthesesEnd();
    829  *    void atomBackReference(unsigned subpatternId);
    830  *
    831  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
    832  *
    833  *    void disjunction();
    834  *
    835  * The regular expression is described by a sequence of assertion*() and atom*()
    836  * callbacks to the delegate, describing the terms in the regular expression.
    837  * Following an atom a quantifyAtom() call may occur to indicate that the previous
    838  * atom should be quantified.  In the case of atoms described across multiple
    839  * calls (parentheses and character classes) the call to quantifyAtom() will come
    840  * after the call to the atom*End() method, never after atom*Begin().
    841  *
    842  * Character classes may either be described by a single call to
    843  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
    844  * In the latter case, ...Begin() will be called, followed by a sequence of
    845  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
    846  *
    847  * Sequences of atoms and assertions are broken into alternatives via calls to
    848  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
    849  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
    850  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
    851  * capturing subpattern, this will be the subpatternId associated with these
    852  * parentheses, and will also by definition be the lowest subpatternId of these
    853  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
    854  * is passed the subpatternId of the last capturing subexpression nested within
    855  * these paretheses.  In the case of a capturing subpattern with no nested
    856  * capturing subpatterns, the same subpatternId will be passed to the begin and
    857  * end functions.  In the case of non-capturing subpatterns the subpatternId
    858  * passed to the begin method is also the first possible subpatternId that might
    859  * be nested within these paretheses.  If a set of non-capturing parentheses does
    860  * not contain any capturing subpatterns, then the subpatternId passed to begin
    861  * will be greater than the subpatternId passed to end.
    862  */
    863 
    864 template<class Delegate>
    865 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
    866 {
    867     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
    868 }
    869 
    870 } } // namespace JSC::Yarr
    871 
    872 #endif // YarrParser_h
    873