Home | History | Annotate | Download | only in yarr
      1 /*
      2  * Copyright (C) 2009 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef RegexParser_h
     27 #define RegexParser_h
     28 
     29 #include <wtf/Platform.h>
     30 
     31 #if ENABLE(YARR)
     32 
     33 #include <UString.h>
     34 #include <wtf/ASCIICType.h>
     35 #include <wtf/unicode/Unicode.h>
     36 #include <limits.h>
     37 
     38 namespace JSC { namespace Yarr {
     39 
// Identifiers for the predefined character classes that the parser reports to
// the delegate through atomBuiltInCharacterClass() (outside a character class)
// or atomCharacterClassBuiltIn() (inside one).
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for \d, and inverted for \D
    SpaceClassID,   // reported for \s, and inverted for \S
    WordClassID,    // reported for \w, and inverted for \W
    NewlineClassID, // reported inverted for '.'
};
     46 
     47 // The Parser class should not be used directly - only via the Yarr::parse() method.
     48 template<class Delegate>
     49 class Parser {
     50 private:
     51     template<class FriendDelegate>
     52     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
     53 
    // Parse failure codes.  parse() uses the code as an index into its table
    // of error messages, so the order of the enumerators must match the order
    // of that table.
    enum ErrorCode {
        NoError,
        PatternTooLarge,          // pattern is longer than MAX_PATTERN_SIZE
        QuantifierOutOfOrder,     // {min,max} quantifier with min > max
        QuantifierWithoutAtom,    // quantifier that does not follow an atom
        MissingParentheses,       // end of pattern reached with parentheses still open
        ParenthesesUnmatched,     // ')' with no open parentheses
        ParenthesesTypeInvalid,   // "(?" not followed by ':', '=' or '!'
        CharacterClassUnmatched,  // '[' with no terminating ']'
        CharacterClassOutOfOrder, // range in a character class whose end is below its start
        EscapeUnterminated,       // '\' as the last character of the pattern
        NumberOfErrorCodes        // number of codes above; sizes the message table
    };
     67 
     68     /*
     69      * CharacterClassParserDelegate:
     70      *
     71      * The class CharacterClassParserDelegate is used in the parsing of character
     72      * classes.  This class handles detection of character ranges.  This class
     73      * implements enough of the delegate interface such that it can be passed to
     74      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
     75      * to perform the parsing of escape characters in character sets.
     76      */
     77     class CharacterClassParserDelegate {
     78     public:
     79         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
     80             : m_delegate(delegate)
     81             , m_err(err)
     82             , m_state(empty)
     83         {
     84         }
     85 
     86         /*
     87          * begin():
     88          *
     89          * Called at beginning of construction.
     90          */
     91         void begin(bool invert)
     92         {
     93             m_delegate.atomCharacterClassBegin(invert);
     94         }
     95 
     96         /*
     97          * atomPatternCharacterUnescaped():
     98          *
     99          * This method is called directly from parseCharacterClass(), to report a new
    100          * pattern character token.  This method differs from atomPatternCharacter(),
    101          * which will be called from parseEscape(), since a hypen provided via this
    102          * method may be indicating a character range, but a hyphen parsed by
    103          * parseEscape() cannot be interpreted as doing so.
    104          */
    105         void atomPatternCharacterUnescaped(UChar ch)
    106         {
    107             switch (m_state) {
    108             case empty:
    109                 m_character = ch;
    110                 m_state = cachedCharacter;
    111                 break;
    112 
    113             case cachedCharacter:
    114                 if (ch == '-')
    115                     m_state = cachedCharacterHyphen;
    116                 else {
    117                     m_delegate.atomCharacterClassAtom(m_character);
    118                     m_character = ch;
    119                 }
    120                 break;
    121 
    122             case cachedCharacterHyphen:
    123                 if (ch >= m_character)
    124                     m_delegate.atomCharacterClassRange(m_character, ch);
    125                 else
    126                     m_err = CharacterClassOutOfOrder;
    127                 m_state = empty;
    128             }
    129         }
    130 
    131         /*
    132          * atomPatternCharacter():
    133          *
    134          * Adds a pattern character, called by parseEscape(), as such will not
    135          * interpret a hyphen as indicating a character range.
    136          */
    137         void atomPatternCharacter(UChar ch)
    138         {
    139             // Flush if a character is already pending to prevent the
    140             // hyphen from begin interpreted as indicating a range.
    141             if((ch == '-') && (m_state == cachedCharacter))
    142                 flush();
    143 
    144             atomPatternCharacterUnescaped(ch);
    145         }
    146 
    147         /*
    148          * atomBuiltInCharacterClass():
    149          *
    150          * Adds a built-in character class, called by parseEscape().
    151          */
    152         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
    153         {
    154             flush();
    155             m_delegate.atomCharacterClassBuiltIn(classID, invert);
    156         }
    157 
    158         /*
    159          * end():
    160          *
    161          * Called at end of construction.
    162          */
    163         void end()
    164         {
    165             flush();
    166             m_delegate.atomCharacterClassEnd();
    167         }
    168 
    169         // parseEscape() should never call these delegate methods when
    170         // invoked with inCharacterClass set.
    171         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
    172         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
    173 
    174     private:
    175         void flush()
    176         {
    177             if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
    178                 m_delegate.atomCharacterClassAtom(m_character);
    179             if (m_state == cachedCharacterHyphen)
    180                 m_delegate.atomCharacterClassAtom('-');
    181             m_state = empty;
    182         }
    183 
    184         Delegate& m_delegate;
    185         ErrorCode& m_err;
    186         enum CharacterClassConstructionState {
    187             empty,
    188             cachedCharacter,
    189             cachedCharacterHyphen,
    190         } m_state;
    191         UChar m_character;
    192     };
    193 
    // Sets up a parser over |pattern| that reports to |delegate|.  The data
    // pointer and size of the pattern are cached; parsing starts at index 0
    // with no error recorded and no parentheses open.  |backReferenceLimit| is
    // the largest number that parseEscape() will report as a back reference.
    // NOTE(review): only the data pointer is kept, so presumably |pattern|
    // must outlive the parser - confirm against UString::data().
    Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
        : m_delegate(delegate)
        , m_backReferenceLimit(backReferenceLimit)
        , m_err(NoError)
        , m_data(pattern.data())
        , m_size(pattern.size())
        , m_index(0)
        , m_parenthesesNestingDepth(0)
    {
    }
    204 
    /*
     * parseEscape():
     *
     * Helper for parseTokens() AND parseCharacterClass().
     * Unlike the other parser methods, this function does not report tokens
     * directly to the member delegate (m_delegate), instead tokens are
     * emitted to the delegate provided as an argument.  In the case of atom
     * escapes, parseTokens() will call parseEscape() passing m_delegate as
     * an argument, and as such the escape will be reported to the delegate.
     *
     * However this method may also be used by parseCharacterClass(), in which
     * case a CharacterClassParserDelegate will be passed as the delegate that
     * tokens should be added to.  The boolean template argument indicates
     * whether an escape in a character class is being parsed (some parsing
     * rules change in this context).
     *
     * On entry the current character must be the backslash.  If the backslash
     * is the last character of the pattern, m_err is set to EscapeUnterminated
     * and false is returned.
     *
     * The boolean value returned by this method indicates whether the token
     * parsed was an atom (outside of a character class \b and \B will be
     * interpreted as assertions, for which false is returned).
     */
    template<bool inCharacterClass, class EscapeDelegate>
    bool parseEscape(EscapeDelegate& delegate)
    {
        ASSERT(!m_err);
        ASSERT(peek() == '\\');
        consume();

        if (atEndOfPattern()) {
            m_err = EscapeUnterminated;
            return false;
        }

        switch (peek()) {
        // Assertions
        // Inside a character class \b is the backspace character and \B is a
        // plain 'B'; outside they are word boundary assertions, not atoms.
        case 'b':
            consume();
            if (inCharacterClass)
                delegate.atomPatternCharacter('\b');
            else {
                delegate.assertionWordBoundary(false);
                return false;
            }
            break;
        case 'B':
            consume();
            if (inCharacterClass)
                delegate.atomPatternCharacter('B');
            else {
                delegate.assertionWordBoundary(true);
                return false;
            }
            break;

        // CharacterClassEscape
        // The upper case forms report the same class with invert set.
        case 'd':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, false);
            break;
        case 's':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
            break;
        case 'w':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, false);
            break;
        case 'D':
            consume();
            delegate.atomBuiltInCharacterClass(DigitClassID, true);
            break;
        case 'S':
            consume();
            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
            break;
        case 'W':
            consume();
            delegate.atomBuiltInCharacterClass(WordClassID, true);
            break;

        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
            // First, try to parse this as backreference.
            if (!inCharacterClass) {
                ParseState state = saveState();

                unsigned backReference = consumeNumber();
                if (backReference <= m_backReferenceLimit) {
                    delegate.atomBackReference(backReference);
                    break;
                }

                // Number exceeds the limit: rewind to the first digit.
                restoreState(state);
            }

            // Not a backreference, and not octal.  Report a literal backslash
            // and leave the digit unconsumed, so that the caller goes on to
            // parse it as an ordinary character.
            if (peek() >= '8') {
                delegate.atomPatternCharacter('\\');
                break;
            }

            // Fall-through to handle this as an octal escape.
        }

        // Octal escape
        case '0':
            delegate.atomPatternCharacter(consumeOctal());
            break;

        // ControlEscape
        case 'f':
            consume();
            delegate.atomPatternCharacter('\f');
            break;
        case 'n':
            consume();
            delegate.atomPatternCharacter('\n');
            break;
        case 'r':
            consume();
            delegate.atomPatternCharacter('\r');
            break;
        case 't':
            consume();
            delegate.atomPatternCharacter('\t');
            break;
        case 'v':
            consume();
            delegate.atomPatternCharacter('\v');
            break;

        // ControlLetter
        case 'c': {
            // The state is saved with the 'c' still unconsumed, so that a
            // failed match can rewind to it.
            ParseState state = saveState();
            consume();
            if (!atEndOfPattern()) {
                int control = consume();

                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
                    delegate.atomPatternCharacter(control & 0x1f);
                    break;
                }
            }
            // Not a valid control escape: report a literal backslash and
            // rewind so that the 'c' is parsed as an ordinary character.
            restoreState(state);
            delegate.atomPatternCharacter('\\');
            break;
        }

        // HexEscape
        // If two hex digits do not follow, tryConsumeHex() consumes nothing
        // and the escape is reported as a plain 'x'.
        case 'x': {
            consume();
            int x = tryConsumeHex(2);
            if (x == -1)
                delegate.atomPatternCharacter('x');
            else
                delegate.atomPatternCharacter(x);
            break;
        }

        // UnicodeEscape
        // If four hex digits do not follow, tryConsumeHex() consumes nothing
        // and the escape is reported as a plain 'u'.
        case 'u': {
            consume();
            int u = tryConsumeHex(4);
            if (u == -1)
                delegate.atomPatternCharacter('u');
            else
                delegate.atomPatternCharacter(u);
            break;
        }

        // IdentityEscape
        default:
            delegate.atomPatternCharacter(consume());
        }

        return true;
    }
    391 
    392     /*
    393      * parseAtomEscape(), parseCharacterClassEscape():
    394      *
    395      * These methods alias to parseEscape().
    396      */
    397     bool parseAtomEscape()
    398     {
    399         return parseEscape<false>(m_delegate);
    400     }
    401     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
    402     {
    403         parseEscape<true>(delegate);
    404     }
    405 
    406     /*
    407      * parseCharacterClass():
    408      *
    409      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
    410      * to an instance of CharacterClassParserDelegate, to describe the character class to the
    411      * delegate.
    412      */
    413     void parseCharacterClass()
    414     {
    415         ASSERT(!m_err);
    416         ASSERT(peek() == '[');
    417         consume();
    418 
    419         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
    420 
    421         characterClassConstructor.begin(tryConsume('^'));
    422 
    423         while (!atEndOfPattern()) {
    424             switch (peek()) {
    425             case ']':
    426                 consume();
    427                 characterClassConstructor.end();
    428                 return;
    429 
    430             case '\\':
    431                 parseCharacterClassEscape(characterClassConstructor);
    432                 break;
    433 
    434             default:
    435                 characterClassConstructor.atomPatternCharacterUnescaped(consume());
    436             }
    437 
    438             if (m_err)
    439                 return;
    440         }
    441 
    442         m_err = CharacterClassUnmatched;
    443     }
    444 
    445     /*
    446      * parseParenthesesBegin():
    447      *
    448      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
    449      */
    450     void parseParenthesesBegin()
    451     {
    452         ASSERT(!m_err);
    453         ASSERT(peek() == '(');
    454         consume();
    455 
    456         if (tryConsume('?')) {
    457             if (atEndOfPattern()) {
    458                 m_err = ParenthesesTypeInvalid;
    459                 return;
    460             }
    461 
    462             switch (consume()) {
    463             case ':':
    464                 m_delegate.atomParenthesesSubpatternBegin(false);
    465                 break;
    466 
    467             case '=':
    468                 m_delegate.atomParentheticalAssertionBegin();
    469                 break;
    470 
    471             case '!':
    472                 m_delegate.atomParentheticalAssertionBegin(true);
    473                 break;
    474 
    475             default:
    476                 m_err = ParenthesesTypeInvalid;
    477             }
    478         } else
    479             m_delegate.atomParenthesesSubpatternBegin();
    480 
    481         ++m_parenthesesNestingDepth;
    482     }
    483 
    484     /*
    485      * parseParenthesesEnd():
    486      *
    487      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
    488      */
    489     void parseParenthesesEnd()
    490     {
    491         ASSERT(!m_err);
    492         ASSERT(peek() == ')');
    493         consume();
    494 
    495         if (m_parenthesesNestingDepth > 0)
    496             m_delegate.atomParenthesesEnd();
    497         else
    498             m_err = ParenthesesUnmatched;
    499 
    500         --m_parenthesesNestingDepth;
    501     }
    502 
    503     /*
    504      * parseQuantifier():
    505      *
    506      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
    507      */
    508     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
    509     {
    510         ASSERT(!m_err);
    511         ASSERT(min <= max);
    512 
    513         if (lastTokenWasAnAtom)
    514             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
    515         else
    516             m_err = QuantifierWithoutAtom;
    517     }
    518 
    /*
     * parseTokens():
     *
     * This method loops over the input pattern reporting tokens to the delegate.
     * The method returns when a parse error is detected, or the end of the pattern
     * is reached.  One piece of state is tracked around the loop, which is whether
     * the last token passed to the delegate was an atom (this is necessary to detect
     * a parse error when a quantifier is provided without an atom to quantify).
     *
     * If the end of the pattern is reached while parentheses are still open,
     * m_err is set to MissingParentheses.
     */
    void parseTokens()
    {
        bool lastTokenWasAnAtom = false;

        while (!atEndOfPattern()) {
            switch (peek()) {
            case '|':
                consume();
                m_delegate.disjunction();
                lastTokenWasAnAtom = false;
                break;

            case '(':
                parseParenthesesBegin();
                lastTokenWasAnAtom = false;
                break;

            // A closed set of parentheses counts as an atom, so it may be quantified.
            case ')':
                parseParenthesesEnd();
                lastTokenWasAnAtom = true;
                break;

            case '^':
                consume();
                m_delegate.assertionBOL();
                lastTokenWasAnAtom = false;
                break;

            case '$':
                consume();
                m_delegate.assertionEOL();
                lastTokenWasAnAtom = false;
                break;

            // '.' is reported as the inverted newline class.
            case '.':
                consume();
                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
                lastTokenWasAnAtom = true;
                break;

            case '[':
                parseCharacterClass();
                lastTokenWasAnAtom = true;
                break;

            // parseAtomEscape() returns false for the \b and \B assertions.
            case '\\':
                lastTokenWasAnAtom = parseAtomEscape();
                break;

            // A quantifier is never itself an atom, so two quantifiers in a
            // row produce QuantifierWithoutAtom.
            case '*':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
                lastTokenWasAnAtom = false;
                break;

            case '+':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
                lastTokenWasAnAtom = false;
                break;

            case '?':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, 1);
                lastTokenWasAnAtom = false;
                break;

            case '{': {
                // Try to read {min}, {min,} or {min,max}; the state is saved
                // first so that anything else can be rewound.
                ParseState state = saveState();

                consume();
                if (peekIsDigit()) {
                    unsigned min = consumeNumber();
                    unsigned max = min;

                    // "{min,}" has no upper bound.
                    if (tryConsume(','))
                        max = peekIsDigit() ? consumeNumber() : UINT_MAX;

                    if (tryConsume('}')) {
                        if (min <= max)
                            parseQuantifier(lastTokenWasAnAtom, min, max);
                        else
                            m_err = QuantifierOutOfOrder;
                        lastTokenWasAnAtom = false;
                        break;
                    }
                }

                restoreState(state);
            } // if we did not find a complete quantifier, fall through to the default case.

            // Any other character (including a '{' that does not start a
            // quantifier) is a literal pattern character.
            default:
                m_delegate.atomPatternCharacter(consume());
                lastTokenWasAnAtom = true;
            }

            if (m_err)
                return;
        }

        if (m_parenthesesNestingDepth > 0)
            m_err = MissingParentheses;
    }
    631 
    632     /*
    633      * parse():
    634      *
    635      * This method calls regexBegin(), calls parseTokens() to parse over the input
    636      * patterns, calls regexEnd() or regexError() as appropriate, and converts any
    637      * error code to a const char* for a result.
    638      */
    639     const char* parse()
    640     {
    641         m_delegate.regexBegin();
    642 
    643         if (m_size > MAX_PATTERN_SIZE)
    644             m_err = PatternTooLarge;
    645         else
    646             parseTokens();
    647         ASSERT(atEndOfPattern() || m_err);
    648 
    649         if (m_err)
    650             m_delegate.regexError();
    651         else
    652             m_delegate.regexEnd();
    653 
    654         // The order of this array must match the ErrorCode enum.
    655         static const char* errorMessages[NumberOfErrorCodes] = {
    656             0, // NoError
    657             "regular expression too large",
    658             "numbers out of order in {} quantifier",
    659             "nothing to repeat",
    660             "missing )",
    661             "unmatched parentheses",
    662             "unrecognized character after (?",
    663             "missing terminating ] for character class",
    664             "range out of order in character class",
    665             "\\ at end of pattern"
    666         };
    667 
    668         return errorMessages[m_err];
    669     }
    670 
    671 
    672     // Misc helper functions:
    673 
    // A saved parser position; this is simply an index into the pattern.
    typedef unsigned ParseState;

    // Returns the current position, for a later call to restoreState().
    ParseState saveState()
    {
        return m_index;
    }

    // Rewinds (or advances) the parser to a position obtained from saveState().
    void restoreState(ParseState state)
    {
        m_index = state;
    }
    685 
    686     bool atEndOfPattern()
    687     {
    688         ASSERT(m_index <= m_size);
    689         return m_index == m_size;
    690     }
    691 
    692     int peek()
    693     {
    694         ASSERT(m_index < m_size);
    695         return m_data[m_index];
    696     }
    697 
    698     bool peekIsDigit()
    699     {
    700         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
    701     }
    702 
    703     unsigned peekDigit()
    704     {
    705         ASSERT(peekIsDigit());
    706         return peek() - '0';
    707     }
    708 
    709     int consume()
    710     {
    711         ASSERT(m_index < m_size);
    712         return m_data[m_index++];
    713     }
    714 
    715     unsigned consumeDigit()
    716     {
    717         ASSERT(peekIsDigit());
    718         return consume() - '0';
    719     }
    720 
    721     unsigned consumeNumber()
    722     {
    723         unsigned n = consumeDigit();
    724         // check for overflow.
    725         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
    726             n = newValue;
    727             consume();
    728         }
    729         return n;
    730     }
    731 
    732     unsigned consumeOctal()
    733     {
    734         ASSERT(WTF::isASCIIOctalDigit(peek()));
    735 
    736         unsigned n = consumeDigit();
    737         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
    738             n = n * 8 + consumeDigit();
    739         return n;
    740     }
    741 
    742     bool tryConsume(UChar ch)
    743     {
    744         if (atEndOfPattern() || (m_data[m_index] != ch))
    745             return false;
    746         ++m_index;
    747         return true;
    748     }
    749 
    750     int tryConsumeHex(int count)
    751     {
    752         ParseState state = saveState();
    753 
    754         int n = 0;
    755         while (count--) {
    756             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
    757                 restoreState(state);
    758                 return -1;
    759             }
    760             n = (n << 4) | WTF::toASCIIHexValue(consume());
    761         }
    762         return n;
    763     }
    764 
    Delegate& m_delegate;               // receiver of the parse callbacks
    unsigned m_backReferenceLimit;      // largest number parseEscape() reports as a back reference
    ErrorCode m_err;                    // error recorded by the parser, or NoError
    const UChar* m_data;                // the pattern's characters, from pattern.data()
    unsigned m_size;                    // length of the pattern, from pattern.size()
    unsigned m_index;                   // index of the next character to be read
    unsigned m_parenthesesNestingDepth; // number of parentheses currently open

    // Upper bound on m_size; parse() rejects longer patterns with PatternTooLarge.
    // Derived by empirical testing of compile time in PCRE and WREC.
    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
    775 };
    776 
    777 /*
    778  * Yarr::parse():
    779  *
    780  * The parse method is passed a pattern to be parsed and a delegate upon which
    781  * callbacks will be made to record the parsed tokens forming the regex.
    782  * Yarr::parse() returns null on success, or a const C string providing an error
    783  * message where a parse error occurs.
    784  *
    785  * The Delegate must implement the following interface:
    786  *
    787  *    void assertionBOL();
    788  *    void assertionEOL();
    789  *    void assertionWordBoundary(bool invert);
    790  *
    791  *    void atomPatternCharacter(UChar ch);
    792  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
    793  *    void atomCharacterClassBegin(bool invert)
    794  *    void atomCharacterClassAtom(UChar ch)
    795  *    void atomCharacterClassRange(UChar begin, UChar end)
    796  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
    797  *    void atomCharacterClassEnd()
    798  *    void atomParenthesesSubpatternBegin(bool capture = true);
    799  *    void atomParentheticalAssertionBegin(bool invert = false);
    800  *    void atomParenthesesEnd();
    801  *    void atomBackReference(unsigned subpatternId);
    802  *
    803  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
    804  *
    805  *    void disjunction();
    806  *
    807  *    void regexBegin();
    808  *    void regexEnd();
    809  *    void regexError();
    810  *
 * Before any calls recording tokens are made, regexBegin() will be called on the
 * delegate once.  Once parsing is complete either regexEnd() or regexError() will
 * be called, as appropriate.
    814  *
    815  * The regular expression is described by a sequence of assertion*() and atom*()
    816  * callbacks to the delegate, describing the terms in the regular expression.
    817  * Following an atom a quantifyAtom() call may occur to indicate that the previous
    818  * atom should be quantified.  In the case of atoms described across multiple
    819  * calls (parentheses and character classes) the call to quantifyAtom() will come
    820  * after the call to the atom*End() method, never after atom*Begin().
    821  *
    822  * Character classes may either be described by a single call to
    823  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
    824  * In the latter case, ...Begin() will be called, followed by a sequence of
    825  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
    826  *
 * Sequences of atoms and assertions are broken into alternatives via calls to
 * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 * capturing subpattern, this will be the subpatternId associated with these
 * parentheses, and will also by definition be the lowest subpatternId of these
 * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
 * is passed the subpatternId of the last capturing subexpression nested within
 * these parentheses.  In the case of a capturing subpattern with no nested
 * capturing subpatterns, the same subpatternId will be passed to the begin and
 * end functions.  In the case of non-capturing subpatterns the subpatternId
 * passed to the begin method is also the first possible subpatternId that might
 * be nested within these parentheses.  If a set of non-capturing parentheses does
 * not contain any capturing subpatterns, then the subpatternId passed to begin
 * will be greater than the subpatternId passed to end.
 *
 * NOTE: in the interface listed above, and in the calls the parser makes, the
 * begin callbacks are named atomParenthesesSubpatternBegin(bool capture) and
 * atomParentheticalAssertionBegin(bool invert), and no subpatternId is passed
 * to either the begin or the end callbacks.  The subpatternId description in
 * this paragraph appears to describe an earlier form of the interface.
    842  */
    843 
    844 template<class Delegate>
    845 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
    846 {
    847     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
    848 }
    849 
    850 } } // namespace JSC::Yarr
    851 
    852 #endif
    853 
    854 #endif // RegexParser_h
    855