Home | History | Annotate | Download | only in pcre
      1 /* This is JavaScriptCore's variant of the PCRE library. While this library
      2 started out as a copy of PCRE, many of the features of PCRE have been
      3 removed. This library now supports only the regular expression features
      4 required by the JavaScript language specification, and has only the functions
      5 needed by JavaScriptCore and the rest of WebKit.
      6 
      7                  Originally written by Philip Hazel
      8            Copyright (c) 1997-2006 University of Cambridge
      9     Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
     10     Copyright (C) 2007 Eric Seidel <eric (at) webkit.org>
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 /* This module contains the external function jsRegExpExecute(), along with
     42 supporting internal functions that are not used by other modules. */
     43 
     44 #include "config.h"
     45 
     46 #include "pcre_internal.h"
     47 
     48 #include <string.h>
     49 #include <wtf/ASCIICType.h>
     50 #include <wtf/FastMalloc.h>
     51 
     52 using namespace WTF;
     53 
     54 /* Negative values for the firstchar and reqchar variables */
     55 
     56 #define REQ_UNSET (-2)
     57 #define REQ_NONE  (-1)
     58 
     59 /*************************************************
     60 *      Code parameters and static tables         *
     61 *************************************************/
     62 
     63 /* Maximum number of items on the nested bracket stacks at compile time. This
     64 applies to the nesting of all kinds of parentheses. It does not limit
     65 un-nested, non-capturing parentheses. This number can be made bigger if
     66 necessary - it is used to dimension one int and one unsigned char vector at
     67 compile time. */
     68 
     69 #define BRASTACK_SIZE 200
     70 
     71 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
     72 are simple data values; negative values are for special things like \d and so
     73 on. Zero means further processing is needed (for things like \x), or the escape
     74 is invalid. */
     75 
     76 static const short escapes[] = {
     77      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
     78      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
     79    '@',      0, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
     80      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
     81      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
     82      0,      0,      0,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
     83    '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
     84      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
     85      0,      0,    '\r', -ESC_s,   '\t',      0,  '\v', -ESC_w,   /* p - w */
     86      0,      0,      0                                            /* x - z */
     87 };
     88 
     89 /* Error code numbers. They are given names so that they can more easily be
     90 tracked. */
     91 
     92 enum ErrorCode {
     93     ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
     94     ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
     95 };
     96 
     97 /* The texts of compile-time error messages. These are "char *" because they
     98 are passed to the outside world. */
     99 
    100 static const char* errorText(ErrorCode code)
    101 {
    102     static const char errorTexts[] =
    103       /* 1 */
    104       "\\ at end of pattern\0"
    105       "\\c at end of pattern\0"
    106       "character value in \\x{...} sequence is too large\0"
    107       "numbers out of order in {} quantifier\0"
    108       /* 5 */
    109       "number too big in {} quantifier\0"
    110       "missing terminating ] for character class\0"
    111       "internal error: code overflow\0"
    112       "range out of order in character class\0"
    113       "nothing to repeat\0"
    114       /* 10 */
    115       "unmatched parentheses\0"
    116       "internal error: unexpected repeat\0"
    117       "unrecognized character after (?\0"
    118       "failed to get memory\0"
    119       "missing )\0"
    120       /* 15 */
    121       "reference to non-existent subpattern\0"
    122       "regular expression too large\0"
    123       "parentheses nested too deeply"
    124     ;
    125 
    126     int i = code;
    127     const char* text = errorTexts;
    128     while (i > 1)
    129         i -= !*text++;
    130     return text;
    131 }
    132 
    133 /* Structure for passing "static" information around between the functions
    134 doing the compiling. */
    135 
    136 struct CompileData {
    137     CompileData() {
    138         topBackref = 0;
    139         backrefMap = 0;
    140         reqVaryOpt = 0;
    141         needOuterBracket = false;
    142         numCapturingBrackets = 0;
    143     }
    144     int topBackref;            /* Maximum back reference */
    145     unsigned backrefMap;       /* Bitmap of low back refs */
    146     int reqVaryOpt;            /* "After variable item" flag for reqByte */
    147     bool needOuterBracket;
    148     int numCapturingBrackets;
    149 };
    150 
    151 /* Definitions to allow mutual recursion */
    152 
    153 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
    154 static bool bracketIsAnchored(const unsigned char* code);
    155 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
    156 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
    157 
    158 /*************************************************
    159 *            Handle escapes                      *
    160 *************************************************/
    161 
    162 /* This function is called when a \ has been encountered. It either returns a
    163 positive value for a simple escape such as \n, or a negative value which
    164 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
    165 a positive value greater than 255 may be returned. On entry, ptr is pointing at
    166 the \. On exit, it is on the final character of the escape sequence.
    167 
    168 Arguments:
    169   ptrPtr         points to the pattern position pointer
    170   errorCodePtr   points to the errorcode variable
    171   bracount       number of previous extracting brackets
    172   options        the options bits
    173   isClass        true if inside a character class
    174 
    175 Returns:         zero or positive => a data character
    176                  negative => a special escape sequence
    177                  on error, errorPtr is set
    178 */
    179 
    180 static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
    181 {
    182     const UChar* ptr = *ptrPtr + 1;
    183 
    184     /* If backslash is at the end of the pattern, it's an error. */
    185     if (ptr == patternEnd) {
    186         *errorCodePtr = ERR1;
    187         *ptrPtr = ptr;
    188         return 0;
    189     }
    190 
    191     int c = *ptr;
    192 
    193     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
    194      a table. A non-zero result is something that can be returned immediately.
    195      Otherwise further processing may be required. */
    196 
    197     if (c < '0' || c > 'z') { /* Not alphameric */
    198     } else if (int escapeValue = escapes[c - '0']) {
    199         c = escapeValue;
    200         if (isClass) {
    201             if (-c == ESC_b)
    202                 c = '\b'; /* \b is backslash in a class */
    203             else if (-c == ESC_B)
    204                 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
    205         }
    206     /* Escapes that need further processing, or are illegal. */
    207 
    208     } else {
    209         switch (c) {
    210             case '1':
    211             case '2':
    212             case '3':
    213             case '4':
    214             case '5':
    215             case '6':
    216             case '7':
    217             case '8':
    218             case '9':
    219                 /* Escape sequences starting with a non-zero digit are backreferences,
    220                  unless there are insufficient brackets, in which case they are octal
    221                  escape sequences. Those sequences end on the first non-octal character
    222                  or when we overflow 0-255, whichever comes first. */
    223 
    224                 if (!isClass) {
    225                     const UChar* oldptr = ptr;
    226                     c -= '0';
    227                     while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
    228                         c = c * 10 + *(++ptr) - '0';
    229                     if (c <= bracount) {
    230                         c = -(ESC_REF + c);
    231                         break;
    232                     }
    233                     ptr = oldptr;      /* Put the pointer back and fall through */
    234                 }
    235 
    236                 /* Handle an octal number following \. If the first digit is 8 or 9,
    237                  this is not octal. */
    238 
    239                 if ((c = *ptr) >= '8') {
    240                     c = '\\';
    241                     ptr -= 1;
    242                     break;
    243                 }
    244 
    245             /* \0 always starts an octal number, but we may drop through to here with a
    246              larger first octal digit. */
    247 
    248             case '0': {
    249                 c -= '0';
    250                 int i;
    251                 for (i = 1; i <= 2; ++i) {
    252                     if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
    253                         break;
    254                     int cc = c * 8 + ptr[i] - '0';
    255                     if (cc > 255)
    256                         break;
    257                     c = cc;
    258                 }
    259                 ptr += i - 1;
    260                 break;
    261             }
    262 
    263             case 'x': {
    264                 c = 0;
    265                 int i;
    266                 for (i = 1; i <= 2; ++i) {
    267                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
    268                         c = 'x';
    269                         i = 1;
    270                         break;
    271                     }
    272                     int cc = ptr[i];
    273                     if (cc >= 'a')
    274                         cc -= 32;             /* Convert to upper case */
    275                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
    276                 }
    277                 ptr += i - 1;
    278                 break;
    279             }
    280 
    281             case 'u': {
    282                 c = 0;
    283                 int i;
    284                 for (i = 1; i <= 4; ++i) {
    285                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
    286                         c = 'u';
    287                         i = 1;
    288                         break;
    289                     }
    290                     int cc = ptr[i];
    291                     if (cc >= 'a')
    292                         cc -= 32;             /* Convert to upper case */
    293                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
    294                 }
    295                 ptr += i - 1;
    296                 break;
    297             }
    298 
    299             case 'c':
    300                 if (++ptr == patternEnd) {
    301                     *errorCodePtr = ERR2;
    302                     return 0;
    303                 }
    304 
    305                 c = *ptr;
    306 
    307                 /* To match Firefox, inside a character class, we also accept
    308                    numbers and '_' as control characters */
    309                 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) {
    310                     c = '\\';
    311                     ptr -= 2;
    312                     break;
    313                 }
    314 
    315                 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
    316                  is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
    317                 c = toASCIIUpper(c) ^ 0x40;
    318                 break;
    319             }
    320     }
    321 
    322     *ptrPtr = ptr;
    323     return c;
    324 }
    325 
    326 /*************************************************
    327 *            Check for counted repeat            *
    328 *************************************************/
    329 
    330 /* This function is called when a '{' is encountered in a place where it might
    331 start a quantifier. It looks ahead to see if it really is a quantifier or not.
    332 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
    333 where the ddds are digits.
    334 
    335 Arguments:
    336   p         pointer to the first char after '{'
    337 
    338 Returns:    true or false
    339 */
    340 
    341 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
    342 {
    343     if (p >= patternEnd || !isASCIIDigit(*p))
    344         return false;
    345     p++;
    346     while (p < patternEnd && isASCIIDigit(*p))
    347         p++;
    348     if (p < patternEnd && *p == '}')
    349         return true;
    350 
    351     if (p >= patternEnd || *p++ != ',')
    352         return false;
    353     if (p < patternEnd && *p == '}')
    354         return true;
    355 
    356     if (p >= patternEnd || !isASCIIDigit(*p))
    357         return false;
    358     p++;
    359     while (p < patternEnd && isASCIIDigit(*p))
    360         p++;
    361 
    362     return (p < patternEnd && *p == '}');
    363 }
    364 
    365 /*************************************************
    366 *         Read repeat counts                     *
    367 *************************************************/
    368 
    369 /* Read an item of the form {n,m} and return the values. This is called only
    370 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
    371 so the syntax is guaranteed to be correct, but we need to check the values.
    372 
    373 Arguments:
    374   p              pointer to first char after '{'
    375   minp           pointer to int for min
    376   maxp           pointer to int for max
    377                  returned as -1 if no max
    378   errorCodePtr   points to error code variable
    379 
    380 Returns:         pointer to '}' on success;
    381                  current ptr on error, with errorCodePtr set non-zero
    382 */
    383 
    384 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
    385 {
    386     int min = 0;
    387     int max = -1;
    388 
    389     /* Read the minimum value and do a paranoid check: a negative value indicates
    390      an integer overflow. */
    391 
    392     while (isASCIIDigit(*p))
    393         min = min * 10 + *p++ - '0';
    394     if (min < 0 || min > 65535) {
    395         *errorCodePtr = ERR5;
    396         return p;
    397     }
    398 
    399     /* Read the maximum value if there is one, and again do a paranoid on its size.
    400      Also, max must not be less than min. */
    401 
    402     if (*p == '}')
    403         max = min;
    404     else {
    405         if (*(++p) != '}') {
    406             max = 0;
    407             while (isASCIIDigit(*p))
    408                 max = max * 10 + *p++ - '0';
    409             if (max < 0 || max > 65535) {
    410                 *errorCodePtr = ERR5;
    411                 return p;
    412             }
    413             if (max < min) {
    414                 *errorCodePtr = ERR4;
    415                 return p;
    416             }
    417         }
    418     }
    419 
    420     /* Fill in the required variables, and pass back the pointer to the terminating
    421      '}'. */
    422 
    423     *minp = min;
    424     *maxp = max;
    425     return p;
    426 }
    427 
    428 /*************************************************
    429 *      Find first significant op code            *
    430 *************************************************/
    431 
    432 /* This is called by several functions that scan a compiled expression looking
    433 for a fixed first character, or an anchoring op code etc. It skips over things
    434 that do not influence this.
    435 
    436 Arguments:
    437   code         pointer to the start of the group
    438 Returns:       pointer to the first significant opcode
    439 */
    440 
    441 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
    442 {
    443     while (*code == OP_BRANUMBER)
    444         code += 3;
    445     return code;
    446 }
    447 
    448 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
    449 {
    450     while (true) {
    451         switch (*code) {
    452             case OP_ASSERT_NOT:
    453                 advanceToEndOfBracket(code);
    454                 code += 1 + LINK_SIZE;
    455                 break;
    456             case OP_WORD_BOUNDARY:
    457             case OP_NOT_WORD_BOUNDARY:
    458                 ++code;
    459                 break;
    460             case OP_BRANUMBER:
    461                 code += 3;
    462                 break;
    463             default:
    464                 return code;
    465         }
    466     }
    467 }
    468 
    469 /*************************************************
    470 *           Get othercase range                  *
    471 *************************************************/
    472 
    473 /* This function is passed the start and end of a class range, in UTF-8 mode
    474 with UCP support. It searches up the characters, looking for internal ranges of
    475 characters in the "other" case. Each call returns the next one, updating the
    476 start address.
    477 
    478 Arguments:
    479   cptr        points to starting character value; updated
    480   d           end value
    481   ocptr       where to put start of othercase range
    482   odptr       where to put end of othercase range
    483 
    484 Yield:        true when range returned; false when no more
    485 */
    486 
    487 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
    488 {
    489     int c, othercase = 0;
    490 
    491     for (c = *cptr; c <= d; c++) {
    492         if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0)
    493             break;
    494     }
    495 
    496     if (c > d)
    497         return false;
    498 
    499     *ocptr = othercase;
    500     int next = othercase + 1;
    501 
    502     for (++c; c <= d; c++) {
    503         if (jsc_pcre_ucp_othercase(c) != next)
    504             break;
    505         next++;
    506     }
    507 
    508     *odptr = next - 1;
    509     *cptr = c;
    510 
    511     return true;
    512 }
    513 
    514 /*************************************************
    515  *       Convert character value to UTF-8         *
    516  *************************************************/
    517 
    518 /* This function takes an integer value in the range 0 - 0x7fffffff
    519  and encodes it as a UTF-8 character in 0 to 6 bytes.
    520 
    521  Arguments:
    522  cvalue     the character value
    523  buffer     pointer to buffer for result - at least 6 bytes long
    524 
    525  Returns:     number of characters placed in the buffer
    526  */
    527 
    528 static int encodeUTF8(int cvalue, unsigned char *buffer)
    529 {
    530     int i;
    531     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
    532         if (cvalue <= jsc_pcre_utf8_table1[i])
    533             break;
    534     buffer += i;
    535     for (int j = i; j > 0; j--) {
    536         *buffer-- = 0x80 | (cvalue & 0x3f);
    537         cvalue >>= 6;
    538     }
    539     *buffer = jsc_pcre_utf8_table2[i] | cvalue;
    540     return i + 1;
    541 }
    542 
    543 /*************************************************
    544 *           Compile one branch                   *
    545 *************************************************/
    546 
    547 /* Scan the pattern, compiling it into the code vector.
    548 
    549 Arguments:
    550   options        the option bits
    551   brackets       points to number of extracting brackets used
    552   codePtr        points to the pointer to the current code point
    553   ptrPtr         points to the current pattern pointer
    554   errorCodePtr   points to error code variable
    555   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
    556   reqbyteptr     set to the last literal character required, else < 0
    557   cd             contains pointers to tables etc.
    558 
    559 Returns:         true on success
    560                  false, with *errorCodePtr set non-zero on error
    561 */
    562 
    563 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
    564 {
    565     return ((ptr + 1 < patternEnd) && ptr[1] == expected);
    566 }
    567 
    568 static bool
    569 compileBranch(int options, int* brackets, unsigned char** codePtr,
    570                const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
    571                int* reqbyteptr, CompileData& cd)
    572 {
    573     int repeatType, opType;
    574     int repeatMin = 0, repeat_max = 0;      /* To please picky compilers */
    575     int bravalue = 0;
    576     int reqvary, tempreqvary;
    577     int c;
    578     unsigned char* code = *codePtr;
    579     unsigned char* tempcode;
    580     bool didGroupSetFirstByte = false;
    581     const UChar* ptr = *ptrPtr;
    582     const UChar* tempptr;
    583     unsigned char* previous = NULL;
    584     unsigned char classbits[32];
    585 
    586     bool class_utf8;
    587     unsigned char* class_utf8data;
    588     unsigned char utf8_char[6];
    589 
    590     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
    591      matching encountered yet". It gets changed to REQ_NONE if we hit something that
    592      matches a non-fixed char first char; reqByte just remains unset if we never
    593      find one.
    594 
    595      When we hit a repeat whose minimum is zero, we may have to adjust these values
    596      to take the zero repeat into account. This is implemented by setting them to
    597      zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
    598      item types that can be repeated set these backoff variables appropriately. */
    599 
    600     int firstByte = REQ_UNSET;
    601     int reqByte = REQ_UNSET;
    602     int zeroReqByte = REQ_UNSET;
    603     int zeroFirstByte = REQ_UNSET;
    604 
    605     /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
    606      according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
    607      value > 255. It is added into the firstByte or reqByte variables to record the
    608      case status of the value. This is used only for ASCII characters. */
    609 
    610     int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
    611 
    612     /* Switch on next character until the end of the branch */
    613 
    614     for (;; ptr++) {
    615         bool negateClass;
    616         bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
    617         int classCharCount;
    618         int classLastChar;
    619         int skipBytes;
    620         int subReqByte;
    621         int subFirstByte;
    622         int mcLength;
    623         unsigned char mcbuffer[8];
    624 
    625         /* Next byte in the pattern */
    626 
    627         c = ptr < patternEnd ? *ptr : 0;
    628 
    629         /* Fill in length of a previous callout, except when the next thing is
    630          a quantifier. */
    631 
    632         bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
    633 
    634         switch (c) {
    635             /* The branch terminates at end of string, |, or ). */
    636 
    637             case 0:
    638                 if (ptr < patternEnd)
    639                     goto NORMAL_CHAR;
    640                 // End of string; fall through
    641             case '|':
    642             case ')':
    643                 *firstbyteptr = firstByte;
    644                 *reqbyteptr = reqByte;
    645                 *codePtr = code;
    646                 *ptrPtr = ptr;
    647                 return true;
    648 
    649             /* Handle single-character metacharacters. In multiline mode, ^ disables
    650              the setting of any following char as a first character. */
    651 
    652             case '^':
    653                 if (options & MatchAcrossMultipleLinesOption) {
    654                     if (firstByte == REQ_UNSET)
    655                         firstByte = REQ_NONE;
    656                     *code++ = OP_BOL;
    657                 } else
    658                     *code++ = OP_CIRC;
    659                 previous = NULL;
    660                 break;
    661 
    662             case '$':
    663                 previous = NULL;
    664                 if (options & MatchAcrossMultipleLinesOption)
    665                   *code++ = OP_EOL;
    666                 else
    667                   *code++ = OP_DOLL;
    668                 break;
    669 
    670             /* There can never be a first char if '.' is first, whatever happens about
    671              repeats. The value of reqByte doesn't change either. */
    672 
    673             case '.':
    674                 if (firstByte == REQ_UNSET)
    675                     firstByte = REQ_NONE;
    676                 zeroFirstByte = firstByte;
    677                 zeroReqByte = reqByte;
    678                 previous = code;
    679                 *code++ = OP_NOT_NEWLINE;
    680                 break;
    681 
    682             /* Character classes. If the included characters are all < 256, we build a
    683              32-byte bitmap of the permitted characters, except in the special case
    684              where there is only one such character. For negated classes, we build the
    685              map as usual, then invert it at the end. However, we use a different opcode
    686              so that data characters > 255 can be handled correctly.
    687 
    688              If the class contains characters outside the 0-255 range, a different
    689              opcode is compiled. It may optionally have a bit map for characters < 256,
    690              but those above are are explicitly listed afterwards. A flag byte tells
    691              whether the bitmap is present, and whether this is a negated class or not.
    692              */
    693 
    694             case '[': {
    695                 previous = code;
    696                 shouldFlipNegation = false;
    697 
    698                 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
    699                  they are encountered at the top level, so we'll do that too. */
    700 
    701                 /* If the first character is '^', set the negation flag and skip it. */
    702 
    703                 if (ptr + 1 >= patternEnd) {
    704                     *errorCodePtr = ERR6;
    705                     return false;
    706                 }
    707 
    708                 if (ptr[1] == '^') {
    709                     negateClass = true;
    710                     ++ptr;
    711                 } else
    712                     negateClass = false;
    713 
    714                 /* Keep a count of chars with values < 256 so that we can optimize the case
    715                  of just a single character (as long as it's < 256). For higher valued UTF-8
    716                  characters, we don't yet do any optimization. */
    717 
    718                 classCharCount = 0;
    719                 classLastChar = -1;
    720 
    721                 class_utf8 = false;                       /* No chars >= 256 */
    722                 class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
    723 
    724                 /* Initialize the 32-char bit map to all zeros. We have to build the
    725                  map in a temporary bit of store, in case the class contains only 1
    726                  character (< 256), because in that case the compiled code doesn't use the
    727                  bit map. */
    728 
    729                 memset(classbits, 0, 32 * sizeof(unsigned char));
    730 
    731                 /* Process characters until ] is reached. The first pass
    732                  through the regex checked the overall syntax, so we don't need to be very
    733                  strict here. At the start of the loop, c contains the first byte of the
    734                  character. */
    735 
    736                 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
    737                     /* Backslash may introduce a single character, or it may introduce one
    738                      of the specials, which just set a flag. Escaped items are checked for
    739                      validity in the pre-compiling pass. The sequence \b is a special case.
    740                      Inside a class (and only there) it is treated as backspace. Elsewhere
    741                      it marks a word boundary. Other escapes have preset maps ready to
    742                      or into the one we are building. We assume they have more than one
    743                      character in them, so set classCharCount bigger than one. */
    744 
    745                     if (c == '\\') {
    746                         c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
    747                         if (c < 0) {
    748                             classCharCount += 2;     /* Greater than 1 is what matters */
    749                             switch (-c) {
    750                                 case ESC_d:
    751                                     for (c = 0; c < 32; c++)
    752                                         classbits[c] |= classBitmapForChar(c + cbit_digit);
    753                                     continue;
    754 
    755                                 case ESC_D:
    756                                     shouldFlipNegation = true;
    757                                     for (c = 0; c < 32; c++)
    758                                         classbits[c] |= ~classBitmapForChar(c + cbit_digit);
    759                                     continue;
    760 
    761                                 case ESC_w:
    762                                     for (c = 0; c < 32; c++)
    763                                         classbits[c] |= classBitmapForChar(c + cbit_word);
    764                                     continue;
    765 
    766                                 case ESC_W:
    767                                     shouldFlipNegation = true;
    768                                     for (c = 0; c < 32; c++)
    769                                         classbits[c] |= ~classBitmapForChar(c + cbit_word);
    770                                     continue;
    771 
    772                                 case ESC_s:
    773                                     for (c = 0; c < 32; c++)
    774                                          classbits[c] |= classBitmapForChar(c + cbit_space);
    775                                     continue;
    776 
    777                                 case ESC_S:
    778                                     shouldFlipNegation = true;
    779                                     for (c = 0; c < 32; c++)
    780                                          classbits[c] |= ~classBitmapForChar(c + cbit_space);
    781                                     continue;
    782 
    783                                     /* Unrecognized escapes are faulted if PCRE is running in its
    784                                      strict mode. By default, for compatibility with Perl, they are
    785                                      treated as literals. */
    786 
    787                                 default:
    788                                     c = *ptr;              /* The final character */
    789                                     classCharCount -= 2;  /* Undo the default count from above */
    790                             }
    791                         }
    792 
    793                         /* Fall through if we have a single character (c >= 0). This may be
    794                          > 256 in UTF-8 mode. */
    795 
    796                     }   /* End of backslash handling */
    797 
    798                     /* A single character may be followed by '-' to form a range. However,
    799                      Perl does not permit ']' to be the end of the range. A '-' character
    800                      here is treated as a literal. */
    801 
    802                     if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
    803                         ptr += 2;
    804 
    805                         int d = *ptr;
    806 
    807                         /* The second part of a range can be a single-character escape, but
    808                          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
    809                          in such circumstances. */
    810 
    811                         if (d == '\\') {
    812                             const UChar* oldptr = ptr;
    813                             d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
    814 
    815                             /* \X is literal X; any other special means the '-' was literal */
    816                             if (d < 0) {
    817                                 ptr = oldptr - 2;
    818                                 goto LONE_SINGLE_CHARACTER;  /* A few lines below */
    819                             }
    820                         }
    821 
    822                         /* The check that the two values are in the correct order happens in
    823                          the pre-pass. Optimize one-character ranges */
    824 
    825                         if (d == c)
    826                             goto LONE_SINGLE_CHARACTER;  /* A few lines below */
    827 
    828                         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
    829                          matching, we have to use an XCLASS with extra data items. Caseless
    830                          matching for characters > 127 is available only if UCP support is
    831                          available. */
    832 
    833                         if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
    834                             class_utf8 = true;
    835 
    836                             /* With UCP support, we can find the other case equivalents of
    837                              the relevant characters. There may be several ranges. Optimize how
    838                              they fit with the basic range. */
    839 
    840                             if (options & IgnoreCaseOption) {
    841                                 int occ, ocd;
    842                                 int cc = c;
    843                                 int origd = d;
    844                                 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
    845                                     if (occ >= c && ocd <= d)
    846                                         continue;  /* Skip embedded ranges */
    847 
    848                                     if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
    849                                     {                                  /* if there is overlap,   */
    850                                         c = occ;                           /* noting that if occ < c */
    851                                         continue;                          /* we can't have ocd > d  */
    852                                     }                                  /* because a subrange is  */
    853                                     if (ocd > d && occ <= d + 1)         /* always shorter than    */
    854                                     {                                  /* the basic range.       */
    855                                         d = ocd;
    856                                         continue;
    857                                     }
    858 
    859                                     if (occ == ocd)
    860                                         *class_utf8data++ = XCL_SINGLE;
    861                                     else {
    862                                         *class_utf8data++ = XCL_RANGE;
    863                                         class_utf8data += encodeUTF8(occ, class_utf8data);
    864                                     }
    865                                     class_utf8data += encodeUTF8(ocd, class_utf8data);
    866                                 }
    867                             }
    868 
    869                             /* Now record the original range, possibly modified for UCP caseless
    870                              overlapping ranges. */
    871 
    872                             *class_utf8data++ = XCL_RANGE;
    873                             class_utf8data += encodeUTF8(c, class_utf8data);
    874                             class_utf8data += encodeUTF8(d, class_utf8data);
    875 
    876                             /* With UCP support, we are done. Without UCP support, there is no
    877                              caseless matching for UTF-8 characters > 127; we can use the bit map
    878                              for the smaller ones. */
    879 
    880                             continue;    /* With next character in the class */
    881                         }
    882 
    883                         /* We use the bit map for all cases when not in UTF-8 mode; else
    884                          ranges that lie entirely within 0-127 when there is UCP support; else
    885                          for partial ranges without UCP support. */
    886 
    887                         for (; c <= d; c++) {
    888                             classbits[c/8] |= (1 << (c&7));
    889                             if (options & IgnoreCaseOption) {
    890                                 int uc = flipCase(c);
    891                                 classbits[uc/8] |= (1 << (uc&7));
    892                             }
    893                             classCharCount++;                /* in case a one-char range */
    894                             classLastChar = c;
    895                         }
    896 
    897                         continue;   /* Go get the next char in the class */
    898                     }
    899 
    900                     /* Handle a lone single character - we can get here for a normal
    901                      non-escape char, or after \ that introduces a single character or for an
    902                      apparent range that isn't. */
    903 
    904                 LONE_SINGLE_CHARACTER:
    905 
    906                     /* Handle a character that cannot go in the bit map */
    907 
    908                     if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
    909                         class_utf8 = true;
    910                         *class_utf8data++ = XCL_SINGLE;
    911                         class_utf8data += encodeUTF8(c, class_utf8data);
    912 
    913                         if (options & IgnoreCaseOption) {
    914                             int othercase;
    915                             if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) {
    916                                 *class_utf8data++ = XCL_SINGLE;
    917                                 class_utf8data += encodeUTF8(othercase, class_utf8data);
    918                             }
    919                         }
    920                     } else {
    921                         /* Handle a single-byte character */
    922                         classbits[c/8] |= (1 << (c&7));
    923                         if (options & IgnoreCaseOption) {
    924                             c = flipCase(c);
    925                             classbits[c/8] |= (1 << (c&7));
    926                         }
    927                         classCharCount++;
    928                         classLastChar = c;
    929                     }
    930                 }
    931 
    932                 /* If classCharCount is 1, we saw precisely one character whose value is
    933                  less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
    934                  can optimize the negative case only if there were no characters >= 128
    935                  because OP_NOT and the related opcodes like OP_NOTSTAR operate on
    936                  single-bytes only. This is an historical hangover. Maybe one day we can
    937                  tidy these opcodes to handle multi-byte characters.
    938 
    939                  The optimization throws away the bit map. We turn the item into a
    940                  1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
    941                  that OP_NOT does not support multibyte characters. In the positive case, it
    942                  can cause firstByte to be set. Otherwise, there can be no first char if
    943                  this item is first, whatever repeat count may follow. In the case of
    944                  reqByte, save the previous value for reinstating. */
    945 
    946                 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
    947                     zeroReqByte = reqByte;
    948 
    949                     /* The OP_NOT opcode works on one-byte characters only. */
    950 
    951                     if (negateClass) {
    952                         if (firstByte == REQ_UNSET)
    953                             firstByte = REQ_NONE;
    954                         zeroFirstByte = firstByte;
    955                         *code++ = OP_NOT;
    956                         *code++ = classLastChar;
    957                         break;
    958                     }
    959 
    960                     /* For a single, positive character, get the value into c, and
    961                      then we can handle this with the normal one-character code. */
    962 
    963                     c = classLastChar;
    964                     goto NORMAL_CHAR;
    965                 }       /* End of 1-char optimization */
    966 
    967                 /* The general case - not the one-char optimization. If this is the first
    968                  thing in the branch, there can be no first char setting, whatever the
    969                  repeat count. Any reqByte setting must remain unchanged after any kind of
    970                  repeat. */
    971 
    972                 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
    973                 zeroFirstByte = firstByte;
    974                 zeroReqByte = reqByte;
    975 
    976                 /* If there are characters with values > 255, we have to compile an
    977                  extended class, with its own opcode. If there are no characters < 256,
    978                  we can omit the bitmap. */
    979 
    980                 if (class_utf8 && !shouldFlipNegation) {
    981                     *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
    982                     *code++ = OP_XCLASS;
    983                     code += LINK_SIZE;
    984                     *code = negateClass? XCL_NOT : 0;
    985 
    986                     /* If the map is required, install it, and move on to the end of
    987                      the extra data */
    988 
    989                     if (classCharCount > 0) {
    990                         *code++ |= XCL_MAP;
    991                         memcpy(code, classbits, 32);
    992                         code = class_utf8data;
    993                     }
    994 
    995                     /* If the map is not required, slide down the extra data. */
    996 
    997                     else {
    998                         int len = class_utf8data - (code + 33);
    999                         memmove(code + 1, code + 33, len);
   1000                         code += len + 1;
   1001                     }
   1002 
   1003                     /* Now fill in the complete length of the item */
   1004 
   1005                     putLinkValue(previous + 1, code - previous);
   1006                     break;   /* End of class handling */
   1007                 }
   1008 
   1009                 /* If there are no characters > 255, negate the 32-byte map if necessary,
   1010                  and copy it into the code vector. If this is the first thing in the branch,
   1011                  there can be no first char setting, whatever the repeat count. Any reqByte
   1012                  setting must remain unchanged after any kind of repeat. */
   1013 
   1014                 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
   1015                 if (negateClass)
   1016                     for (c = 0; c < 32; c++)
   1017                         code[c] = ~classbits[c];
   1018                 else
   1019                     memcpy(code, classbits, 32);
   1020                 code += 32;
   1021                 break;
   1022             }
   1023 
   1024             /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
   1025              has been tested above. */
   1026 
   1027             case '{':
   1028                 if (!isQuantifier)
   1029                     goto NORMAL_CHAR;
   1030                 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr);
   1031                 if (*errorCodePtr)
   1032                     goto FAILED;
   1033                 goto REPEAT;
   1034 
   1035             case '*':
   1036                 repeatMin = 0;
   1037                 repeat_max = -1;
   1038                 goto REPEAT;
   1039 
   1040             case '+':
   1041                 repeatMin = 1;
   1042                 repeat_max = -1;
   1043                 goto REPEAT;
   1044 
   1045             case '?':
   1046                 repeatMin = 0;
   1047                 repeat_max = 1;
   1048 
   1049             REPEAT:
   1050                 if (!previous) {
   1051                     *errorCodePtr = ERR9;
   1052                     goto FAILED;
   1053                 }
   1054 
   1055                 if (repeatMin == 0) {
   1056                     firstByte = zeroFirstByte;    /* Adjust for zero repeat */
   1057                     reqByte = zeroReqByte;        /* Ditto */
   1058                 }
   1059 
   1060                 /* Remember whether this is a variable length repeat */
   1061 
   1062                 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
   1063 
   1064                 opType = 0;                    /* Default single-char op codes */
   1065 
   1066                 /* Save start of previous item, in case we have to move it up to make space
   1067                  for an inserted OP_ONCE for the additional '+' extension. */
   1068                 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
   1069 
   1070                 tempcode = previous;
   1071 
   1072                 /* If the next character is '+', we have a possessive quantifier. This
   1073                  implies greediness, whatever the setting of the PCRE_UNGREEDY option.
   1074                  If the next character is '?' this is a minimizing repeat, by default,
   1075                  but if PCRE_UNGREEDY is set, it works the other way round. We change the
   1076                  repeat type to the non-default. */
   1077 
   1078                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
   1079                     repeatType = 1;
   1080                     ptr++;
   1081                 } else
   1082                     repeatType = 0;
   1083 
   1084                 /* If previous was a character match, abolish the item and generate a
   1085                  repeat item instead. If a char item has a minumum of more than one, ensure
   1086                  that it is set in reqByte - it might not be if a sequence such as x{3} is
   1087                  the first thing in a branch because the x will have gone into firstByte
   1088                  instead.  */
   1089 
   1090                 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
   1091                     /* Deal with UTF-8 characters that take up more than one byte. It's
   1092                      easier to write this out separately than try to macrify it. Use c to
   1093                      hold the length of the character in bytes, plus 0x80 to flag that it's a
   1094                      length rather than a small character. */
   1095 
   1096                     if (code[-1] & 0x80) {
   1097                         unsigned char *lastchar = code - 1;
   1098                         while((*lastchar & 0xc0) == 0x80)
   1099                             lastchar--;
   1100                         c = code - lastchar;            /* Length of UTF-8 character */
   1101                         memcpy(utf8_char, lastchar, c); /* Save the char */
   1102                         c |= 0x80;                      /* Flag c as a length */
   1103                     }
   1104                     else {
   1105                         c = code[-1];
   1106                         if (repeatMin > 1)
   1107                             reqByte = c | reqCaseOpt | cd.reqVaryOpt;
   1108                     }
   1109 
   1110                     goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
   1111                 }
   1112 
   1113                 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
   1114                     c = previous[1];
   1115                     if (repeatMin > 1)
   1116                         reqByte = c | reqCaseOpt | cd.reqVaryOpt;
   1117                     goto OUTPUT_SINGLE_REPEAT;
   1118                 }
   1119 
   1120                 /* If previous was a single negated character ([^a] or similar), we use
   1121                  one of the special opcodes, replacing it. The code is shared with single-
   1122                  character repeats by setting opt_type to add a suitable offset into
   1123                  repeatType. OP_NOT is currently used only for single-byte chars. */
   1124 
   1125                 else if (*previous == OP_NOT) {
   1126                     opType = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
   1127                     c = previous[1];
   1128                     goto OUTPUT_SINGLE_REPEAT;
   1129                 }
   1130 
   1131                 /* If previous was a character type match (\d or similar), abolish it and
   1132                  create a suitable repeat item. The code is shared with single-character
   1133                  repeats by setting opType to add a suitable offset into repeatType. */
   1134 
   1135                 else if (*previous <= OP_NOT_NEWLINE) {
   1136                     opType = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
   1137                     c = *previous;
   1138 
   1139                 OUTPUT_SINGLE_REPEAT:
   1140                     int prop_type = -1;
   1141                     int prop_value = -1;
   1142 
   1143                     unsigned char* oldcode = code;
   1144                     code = previous;                  /* Usually overwrite previous item */
   1145 
   1146                     /* If the maximum is zero then the minimum must also be zero; Perl allows
   1147                      this case, so we do too - by simply omitting the item altogether. */
   1148 
   1149                     if (repeat_max == 0)
   1150                         goto END_REPEAT;
   1151 
   1152                     /* Combine the opType with the repeatType */
   1153 
   1154                     repeatType += opType;
   1155 
   1156                     /* A minimum of zero is handled either as the special case * or ?, or as
   1157                      an UPTO, with the maximum given. */
   1158 
   1159                     if (repeatMin == 0) {
   1160                         if (repeat_max == -1)
   1161                             *code++ = OP_STAR + repeatType;
   1162                         else if (repeat_max == 1)
   1163                             *code++ = OP_QUERY + repeatType;
   1164                         else {
   1165                             *code++ = OP_UPTO + repeatType;
   1166                             put2ByteValueAndAdvance(code, repeat_max);
   1167                         }
   1168                     }
   1169 
   1170                     /* A repeat minimum of 1 is optimized into some special cases. If the
   1171                      maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
   1172                      left in place and, if the maximum is greater than 1, we use OP_UPTO with
   1173                      one less than the maximum. */
   1174 
   1175                     else if (repeatMin == 1) {
   1176                         if (repeat_max == -1)
   1177                             *code++ = OP_PLUS + repeatType;
   1178                         else {
   1179                             code = oldcode;                 /* leave previous item in place */
   1180                             if (repeat_max == 1)
   1181                                 goto END_REPEAT;
   1182                             *code++ = OP_UPTO + repeatType;
   1183                             put2ByteValueAndAdvance(code, repeat_max - 1);
   1184                         }
   1185                     }
   1186 
   1187                     /* The case {n,n} is just an EXACT, while the general case {n,m} is
   1188                      handled as an EXACT followed by an UPTO. */
   1189 
   1190                     else {
   1191                         *code++ = OP_EXACT + opType;  /* NB EXACT doesn't have repeatType */
   1192                         put2ByteValueAndAdvance(code, repeatMin);
   1193 
   1194                         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
   1195                          we have to insert the character for the previous code. For a repeated
   1196                          Unicode property match, there are two extra bytes that define the
   1197                          required property. In UTF-8 mode, long characters have their length in
   1198                          c, with the 0x80 bit as a flag. */
   1199 
   1200                         if (repeat_max < 0) {
   1201                             if (c >= 128) {
   1202                                 memcpy(code, utf8_char, c & 7);
   1203                                 code += c & 7;
   1204                             } else {
   1205                                 *code++ = c;
   1206                                 if (prop_type >= 0) {
   1207                                     *code++ = prop_type;
   1208                                     *code++ = prop_value;
   1209                                 }
   1210                             }
   1211                             *code++ = OP_STAR + repeatType;
   1212                         }
   1213 
   1214                         /* Else insert an UPTO if the max is greater than the min, again
   1215                          preceded by the character, for the previously inserted code. */
   1216 
   1217                         else if (repeat_max != repeatMin) {
   1218                             if (c >= 128) {
   1219                                 memcpy(code, utf8_char, c & 7);
   1220                                 code += c & 7;
   1221                             } else
   1222                                 *code++ = c;
   1223                             if (prop_type >= 0) {
   1224                                 *code++ = prop_type;
   1225                                 *code++ = prop_value;
   1226                             }
   1227                             repeat_max -= repeatMin;
   1228                             *code++ = OP_UPTO + repeatType;
   1229                             put2ByteValueAndAdvance(code, repeat_max);
   1230                         }
   1231                     }
   1232 
   1233                     /* The character or character type itself comes last in all cases. */
   1234 
   1235                     if (c >= 128) {
   1236                         memcpy(code, utf8_char, c & 7);
   1237                         code += c & 7;
   1238                     } else
   1239                         *code++ = c;
   1240 
   1241                     /* For a repeated Unicode property match, there are two extra bytes that
   1242                      define the required property. */
   1243 
   1244                     if (prop_type >= 0) {
   1245                         *code++ = prop_type;
   1246                         *code++ = prop_value;
   1247                     }
   1248                 }
   1249 
   1250                 /* If previous was a character class or a back reference, we put the repeat
   1251                  stuff after it, but just skip the item if the repeat was {0,0}. */
   1252 
   1253                 else if (*previous == OP_CLASS ||
   1254                          *previous == OP_NCLASS ||
   1255                          *previous == OP_XCLASS ||
   1256                          *previous == OP_REF)
   1257                 {
   1258                     if (repeat_max == 0) {
   1259                         code = previous;
   1260                         goto END_REPEAT;
   1261                     }
   1262 
   1263                     if (repeatMin == 0 && repeat_max == -1)
   1264                         *code++ = OP_CRSTAR + repeatType;
   1265                     else if (repeatMin == 1 && repeat_max == -1)
   1266                         *code++ = OP_CRPLUS + repeatType;
   1267                     else if (repeatMin == 0 && repeat_max == 1)
   1268                         *code++ = OP_CRQUERY + repeatType;
   1269                     else {
   1270                         *code++ = OP_CRRANGE + repeatType;
   1271                         put2ByteValueAndAdvance(code, repeatMin);
   1272                         if (repeat_max == -1)
   1273                             repeat_max = 0;  /* 2-byte encoding for max */
   1274                         put2ByteValueAndAdvance(code, repeat_max);
   1275                     }
   1276                 }
   1277 
   1278                 /* If previous was a bracket group, we may have to replicate it in certain
   1279                  cases. */
   1280 
   1281                 else if (*previous >= OP_BRA) {
   1282                     int ketoffset = 0;
   1283                     int len = code - previous;
   1284                     unsigned char* bralink = NULL;
   1285 
   1286                     /* If the maximum repeat count is unlimited, find the end of the bracket
   1287                      by scanning through from the start, and compute the offset back to it
   1288                      from the current code pointer. There may be an OP_OPT setting following
   1289                      the final KET, so we can't find the end just by going back from the code
   1290                      pointer. */
   1291 
   1292                     if (repeat_max == -1) {
   1293                         const unsigned char* ket = previous;
   1294                         advanceToEndOfBracket(ket);
   1295                         ketoffset = code - ket;
   1296                     }
   1297 
   1298                     /* The case of a zero minimum is special because of the need to stick
   1299                      OP_BRAZERO in front of it, and because the group appears once in the
   1300                      data, whereas in other cases it appears the minimum number of times. For
   1301                      this reason, it is simplest to treat this case separately, as otherwise
   1302                      the code gets far too messy. There are several special subcases when the
   1303                      minimum is zero. */
   1304 
   1305                     if (repeatMin == 0) {
   1306                         /* If the maximum is also zero, we just omit the group from the output
   1307                          altogether. */
   1308 
   1309                         if (repeat_max == 0) {
   1310                             code = previous;
   1311                             goto END_REPEAT;
   1312                         }
   1313 
   1314                         /* If the maximum is 1 or unlimited, we just have to stick in the
   1315                          BRAZERO and do no more at this point. However, we do need to adjust
   1316                          any OP_RECURSE calls inside the group that refer to the group itself or
   1317                          any internal group, because the offset is from the start of the whole
   1318                          regex. Temporarily terminate the pattern while doing this. */
   1319 
   1320                         if (repeat_max <= 1) {
   1321                             *code = OP_END;
   1322                             memmove(previous+1, previous, len);
   1323                             code++;
   1324                             *previous++ = OP_BRAZERO + repeatType;
   1325                         }
   1326 
   1327                         /* If the maximum is greater than 1 and limited, we have to replicate
   1328                          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
   1329                          The first one has to be handled carefully because it's the original
   1330                          copy, which has to be moved up. The remainder can be handled by code
   1331                          that is common with the non-zero minimum case below. We have to
   1332                          adjust the value of repeat_max, since one less copy is required. */
   1333 
   1334                         else {
   1335                             *code = OP_END;
   1336                             memmove(previous + 2 + LINK_SIZE, previous, len);
   1337                             code += 2 + LINK_SIZE;
   1338                             *previous++ = OP_BRAZERO + repeatType;
   1339                             *previous++ = OP_BRA;
   1340 
   1341                             /* We chain together the bracket offset fields that have to be
   1342                              filled in later when the ends of the brackets are reached. */
   1343 
   1344                             int offset = (!bralink) ? 0 : previous - bralink;
   1345                             bralink = previous;
   1346                             putLinkValueAllowZeroAndAdvance(previous, offset);
   1347                         }
   1348 
   1349                         repeat_max--;
   1350                     }
   1351 
   1352                     /* If the minimum is greater than zero, replicate the group as many
   1353                      times as necessary, and adjust the maximum to the number of subsequent
   1354                      copies that we need. If we set a first char from the group, and didn't
   1355                      set a required char, copy the latter from the former. */
   1356 
   1357                     else {
   1358                         if (repeatMin > 1) {
   1359                             if (didGroupSetFirstByte && reqByte < 0)
   1360                                 reqByte = firstByte;
   1361                             for (int i = 1; i < repeatMin; i++) {
   1362                                 memcpy(code, previous, len);
   1363                                 code += len;
   1364                             }
   1365                         }
   1366                         if (repeat_max > 0)
   1367                             repeat_max -= repeatMin;
   1368                     }
   1369 
   1370                     /* This code is common to both the zero and non-zero minimum cases. If
   1371                      the maximum is limited, it replicates the group in a nested fashion,
   1372                      remembering the bracket starts on a stack. In the case of a zero minimum,
   1373                      the first one was set up above. In all cases the repeat_max now specifies
   1374                      the number of additional copies needed. */
   1375 
   1376                     if (repeat_max >= 0) {
   1377                         for (int i = repeat_max - 1; i >= 0; i--) {
   1378                             *code++ = OP_BRAZERO + repeatType;
   1379 
   1380                             /* All but the final copy start a new nesting, maintaining the
   1381                              chain of brackets outstanding. */
   1382 
   1383                             if (i != 0) {
   1384                                 *code++ = OP_BRA;
   1385                                 int offset = (!bralink) ? 0 : code - bralink;
   1386                                 bralink = code;
   1387                                 putLinkValueAllowZeroAndAdvance(code, offset);
   1388                             }
   1389 
   1390                             memcpy(code, previous, len);
   1391                             code += len;
   1392                         }
   1393 
   1394                         /* Now chain through the pending brackets, and fill in their length
   1395                          fields (which are holding the chain links pro tem). */
   1396 
   1397                         while (bralink) {
   1398                             int offset = code - bralink + 1;
   1399                             unsigned char* bra = code - offset;
   1400                             int oldlinkoffset = getLinkValueAllowZero(bra + 1);
   1401                             bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
   1402                             *code++ = OP_KET;
   1403                             putLinkValueAndAdvance(code, offset);
   1404                             putLinkValue(bra + 1, offset);
   1405                         }
   1406                     }
   1407 
   1408                     /* If the maximum is unlimited, set a repeater in the final copy. We
   1409                      can't just offset backwards from the current code point, because we
   1410                      don't know if there's been an options resetting after the ket. The
   1411                      correct offset was computed above. */
   1412 
   1413                     else
   1414                         code[-ketoffset] = OP_KETRMAX + repeatType;
   1415                 }
   1416 
   1417                 // A quantifier after an assertion is mostly meaningless, but it
   1418                 // can nullify the assertion if it has a 0 minimum.
   1419                 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) {
   1420                     if (repeatMin == 0) {
   1421                         code = previous;
   1422                         goto END_REPEAT;
   1423                     }
   1424                 }
   1425 
   1426                 /* Else there's some kind of shambles */
   1427 
   1428                 else {
   1429                     *errorCodePtr = ERR11;
   1430                     goto FAILED;
   1431                 }
   1432 
   1433                 /* In all case we no longer have a previous item. We also set the
   1434                  "follows varying string" flag for subsequently encountered reqbytes if
   1435                  it isn't already set and we have just passed a varying length item. */
   1436 
   1437             END_REPEAT:
   1438                 previous = NULL;
   1439                 cd.reqVaryOpt |= reqvary;
   1440                 break;
   1441 
   1442             /* Start of nested bracket sub-expression, or comment or lookahead or
   1443              lookbehind or option setting or condition. First deal with special things
   1444              that can come after a bracket; all are introduced by ?, and the appearance
   1445              of any of them means that this is not a referencing group. They were
   1446              checked for validity in the first pass over the string, so we don't have to
   1447              check for syntax errors here.  */
   1448 
   1449             case '(':
   1450                 skipBytes = 0;
   1451 
   1452                 if (*(++ptr) == '?') {
   1453                     switch (*(++ptr)) {
   1454                         case ':':                 /* Non-extracting bracket */
   1455                             bravalue = OP_BRA;
   1456                             ptr++;
   1457                             break;
   1458 
   1459                         case '=':                 /* Positive lookahead */
   1460                             bravalue = OP_ASSERT;
   1461                             ptr++;
   1462                             break;
   1463 
   1464                         case '!':                 /* Negative lookahead */
   1465                             bravalue = OP_ASSERT_NOT;
   1466                             ptr++;
   1467                             break;
   1468 
   1469                         /* Character after (? not specially recognized */
   1470 
   1471                         default:
   1472                             *errorCodePtr = ERR12;
   1473                             goto FAILED;
   1474                         }
   1475                 }
   1476 
   1477                 /* Else we have a referencing group; adjust the opcode. If the bracket
   1478                  number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
   1479                  arrange for the true number to follow later, in an OP_BRANUMBER item. */
   1480 
   1481                 else {
   1482                     if (++(*brackets) > EXTRACT_BASIC_MAX) {
   1483                         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
   1484                         code[1 + LINK_SIZE] = OP_BRANUMBER;
   1485                         put2ByteValue(code + 2 + LINK_SIZE, *brackets);
   1486                         skipBytes = 3;
   1487                     }
   1488                     else
   1489                         bravalue = OP_BRA + *brackets;
   1490                 }
   1491 
   1492                 /* Process nested bracketed re. We copy code into a non-variable
   1493                  in order to be able to pass its address because some compilers
   1494                  complain otherwise. Pass in a new setting for the ims options
   1495                  if they have changed. */
   1496 
   1497                 previous = code;
   1498                 *code = bravalue;
   1499                 tempcode = code;
   1500                 tempreqvary = cd.reqVaryOpt;     /* Save value before bracket */
   1501 
   1502                 if (!compileBracket(
   1503                                    options,
   1504                                    brackets,                     /* Extracting bracket count */
   1505                                    &tempcode,                    /* Where to put code (updated) */
   1506                                    &ptr,                         /* Input pointer (updated) */
   1507                                    patternEnd,
   1508                                    errorCodePtr,                 /* Where to put an error message */
   1509                                    skipBytes,                    /* Skip over OP_BRANUMBER */
   1510                                    &subFirstByte,                /* For possible first char */
   1511                                    &subReqByte,                  /* For possible last char */
   1512                                    cd))                          /* Tables block */
   1513                     goto FAILED;
   1514 
   1515                 /* At the end of compiling, code is still pointing to the start of the
   1516                  group, while tempcode has been updated to point past the end of the group
   1517                  and any option resetting that may follow it. The pattern pointer (ptr)
   1518                  is on the bracket. */
   1519 
   1520                 /* Handle updating of the required and first characters. Update for normal
   1521                  brackets of all kinds, and conditions with two branches (see code above).
   1522                  If the bracket is followed by a quantifier with zero repeat, we have to
   1523                  back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
   1524                  main loop so that they can be accessed for the back off. */
   1525 
   1526                 zeroReqByte = reqByte;
   1527                 zeroFirstByte = firstByte;
   1528                 didGroupSetFirstByte = false;
   1529 
   1530                 if (bravalue >= OP_BRA) {
   1531                     /* If we have not yet set a firstByte in this branch, take it from the
   1532                      subpattern, remembering that it was set here so that a repeat of more
   1533                      than one can replicate it as reqByte if necessary. If the subpattern has
   1534                      no firstByte, set "none" for the whole branch. In both cases, a zero
   1535                      repeat forces firstByte to "none". */
   1536 
   1537                     if (firstByte == REQ_UNSET) {
   1538                         if (subFirstByte >= 0) {
   1539                             firstByte = subFirstByte;
   1540                             didGroupSetFirstByte = true;
   1541                         }
   1542                         else
   1543                             firstByte = REQ_NONE;
   1544                         zeroFirstByte = REQ_NONE;
   1545                     }
   1546 
   1547                     /* If firstByte was previously set, convert the subpattern's firstByte
   1548                      into reqByte if there wasn't one, using the vary flag that was in
   1549                      existence beforehand. */
   1550 
   1551                     else if (subFirstByte >= 0 && subReqByte < 0)
   1552                         subReqByte = subFirstByte | tempreqvary;
   1553 
   1554                     /* If the subpattern set a required byte (or set a first byte that isn't
   1555                      really the first byte - see above), set it. */
   1556 
   1557                     if (subReqByte >= 0)
   1558                         reqByte = subReqByte;
   1559                 }
   1560 
   1561                 /* For a forward assertion, we take the reqByte, if set. This can be
   1562                  helpful if the pattern that follows the assertion doesn't set a different
   1563                  char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
   1564                  for an assertion, however because it leads to incorrect effect for patterns
   1565                  such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
   1566                  of a firstByte. This is overcome by a scan at the end if there's no
   1567                  firstByte, looking for an asserted first char. */
   1568 
   1569                 else if (bravalue == OP_ASSERT && subReqByte >= 0)
   1570                     reqByte = subReqByte;
   1571 
   1572                 /* Now update the main code pointer to the end of the group. */
   1573 
   1574                 code = tempcode;
   1575 
   1576                 /* Error if hit end of pattern */
   1577 
   1578                 if (ptr >= patternEnd || *ptr != ')') {
   1579                     *errorCodePtr = ERR14;
   1580                     goto FAILED;
   1581                 }
   1582                 break;
   1583 
   1584             /* Check \ for being a real metacharacter; if not, fall through and handle
   1585              it as a data character at the start of a string. Escape items are checked
   1586              for validity in the pre-compiling pass. */
   1587 
   1588             case '\\':
   1589                 tempptr = ptr;
   1590                 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false);
   1591 
   1592                 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
   1593                  are arranged to be the negation of the corresponding OP_values. For the
   1594                  back references, the values are ESC_REF plus the reference number. Only
   1595                  back references and those types that consume a character may be repeated.
   1596                  We can test for values between ESC_b and ESC_w for the latter; this may
   1597                  have to change if any new ones are ever created. */
   1598 
   1599                 if (c < 0) {
   1600                     /* For metasequences that actually match a character, we disable the
   1601                      setting of a first character if it hasn't already been set. */
   1602 
   1603                     if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
   1604                         firstByte = REQ_NONE;
   1605 
   1606                     /* Set values to reset to if this is followed by a zero repeat. */
   1607 
   1608                     zeroFirstByte = firstByte;
   1609                     zeroReqByte = reqByte;
   1610 
   1611                     /* Back references are handled specially */
   1612 
   1613                     if (-c >= ESC_REF) {
   1614                         int number = -c - ESC_REF;
   1615                         previous = code;
   1616                         *code++ = OP_REF;
   1617                         put2ByteValueAndAdvance(code, number);
   1618                     }
   1619 
   1620                     /* For the rest, we can obtain the OP value by negating the escape
   1621                      value */
   1622 
   1623                     else {
   1624                         previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
   1625                         *code++ = -c;
   1626                     }
   1627                     continue;
   1628                 }
   1629 
   1630                 /* Fall through. */
   1631 
   1632                 /* Handle a literal character. It is guaranteed not to be whitespace or #
   1633                  when the extended flag is set. If we are in UTF-8 mode, it may be a
   1634                  multi-byte literal character. */
   1635 
   1636                 default:
   1637             NORMAL_CHAR:
   1638 
   1639                 previous = code;
   1640 
   1641                 if (c < 128) {
   1642                     mcLength = 1;
   1643                     mcbuffer[0] = c;
   1644 
   1645                     if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
   1646                         *code++ = OP_ASCII_LETTER_IGNORING_CASE;
   1647                         *code++ = c | 0x20;
   1648                     } else {
   1649                         *code++ = OP_ASCII_CHAR;
   1650                         *code++ = c;
   1651                     }
   1652                 } else {
   1653                     mcLength = encodeUTF8(c, mcbuffer);
   1654 
   1655                     *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
   1656                     for (c = 0; c < mcLength; c++)
   1657                         *code++ = mcbuffer[c];
   1658                 }
   1659 
   1660                 /* Set the first and required bytes appropriately. If no previous first
   1661                  byte, set it from this character, but revert to none on a zero repeat.
   1662                  Otherwise, leave the firstByte value alone, and don't change it on a zero
   1663                  repeat. */
   1664 
   1665                 if (firstByte == REQ_UNSET) {
   1666                     zeroFirstByte = REQ_NONE;
   1667                     zeroReqByte = reqByte;
   1668 
   1669                     /* If the character is more than one byte long, we can set firstByte
   1670                      only if it is not to be matched caselessly. */
   1671 
   1672                     if (mcLength == 1 || reqCaseOpt == 0) {
   1673                         firstByte = mcbuffer[0] | reqCaseOpt;
   1674                         if (mcLength != 1)
   1675                             reqByte = code[-1] | cd.reqVaryOpt;
   1676                     }
   1677                     else
   1678                         firstByte = reqByte = REQ_NONE;
   1679                 }
   1680 
   1681                 /* firstByte was previously set; we can set reqByte only the length is
   1682                  1 or the matching is caseful. */
   1683 
   1684                 else {
   1685                     zeroFirstByte = firstByte;
   1686                     zeroReqByte = reqByte;
   1687                     if (mcLength == 1 || reqCaseOpt == 0)
   1688                         reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
   1689                 }
   1690 
   1691                 break;            /* End of literal character handling */
   1692         }
   1693     }                   /* end of big loop */
   1694 
   1695     /* Control never reaches here by falling through, only by a goto for all the
   1696      error states. Pass back the position in the pattern so that it can be displayed
   1697      to the user for diagnosing the error. */
   1698 
   1699 FAILED:
   1700     *ptrPtr = ptr;
   1701     return false;
   1702 }
   1703 
   1704 /*************************************************
   1705 *     Compile sequence of alternatives           *
   1706 *************************************************/
   1707 
   1708 /* On entry, ptr is pointing past the bracket character, but on return
   1709 it points to the closing bracket, or vertical bar, or end of string.
   1710 The code variable is pointing at the byte into which the BRA operator has been
   1711 stored. If the ims options are changed at the start (for a (?ims: group) or
   1712 during any branch, we need to insert an OP_OPT item at the start of every
   1713 following branch to ensure they get set correctly at run time, and also pass
   1714 the new options into every subsequent branch compile.
   1715 
   1716 Argument:
   1717   options        option bits, including any changes for this subpattern
   1718   brackets       -> int containing the number of extracting brackets used
   1719   codePtr        -> the address of the current code pointer
   1720   ptrPtr         -> the address of the current pattern pointer
   1721   errorCodePtr   -> pointer to error code variable
   1722   skipBytes      skip this many bytes at start (for OP_BRANUMBER)
   1723   firstbyteptr   place to put the first required character, or a negative number
   1724   reqbyteptr     place to put the last required character, or a negative number
   1725   cd             points to the data block with tables pointers etc.
   1726 
   1727 Returns:      true on success
   1728 */
   1729 
   1730 static bool
   1731 compileBracket(int options, int* brackets, unsigned char** codePtr,
   1732     const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
   1733     int* firstbyteptr, int* reqbyteptr, CompileData& cd)
   1734 {
   1735     const UChar* ptr = *ptrPtr;
   1736     unsigned char* code = *codePtr;
   1737     unsigned char* lastBranch = code;
   1738     unsigned char* start_bracket = code;
   1739     int firstByte = REQ_UNSET;
   1740     int reqByte = REQ_UNSET;
   1741 
   1742     /* Offset is set zero to mark that this bracket is still open */
   1743 
   1744     putLinkValueAllowZero(code + 1, 0);
   1745     code += 1 + LINK_SIZE + skipBytes;
   1746 
   1747     /* Loop for each alternative branch */
   1748 
   1749     while (true) {
   1750         /* Now compile the branch */
   1751 
   1752         int branchFirstByte;
   1753         int branchReqByte;
   1754         if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr,
   1755                             &branchFirstByte, &branchReqByte, cd)) {
   1756             *ptrPtr = ptr;
   1757             return false;
   1758         }
   1759 
   1760         /* If this is the first branch, the firstByte and reqByte values for the
   1761          branch become the values for the regex. */
   1762 
   1763         if (*lastBranch != OP_ALT) {
   1764             firstByte = branchFirstByte;
   1765             reqByte = branchReqByte;
   1766         }
   1767 
   1768         /* If this is not the first branch, the first char and reqByte have to
   1769          match the values from all the previous branches, except that if the previous
   1770          value for reqByte didn't have REQ_VARY set, it can still match, and we set
   1771          REQ_VARY for the regex. */
   1772 
   1773         else {
   1774             /* If we previously had a firstByte, but it doesn't match the new branch,
   1775              we have to abandon the firstByte for the regex, but if there was previously
   1776              no reqByte, it takes on the value of the old firstByte. */
   1777 
   1778             if (firstByte >= 0 && firstByte != branchFirstByte) {
   1779                 if (reqByte < 0)
   1780                     reqByte = firstByte;
   1781                 firstByte = REQ_NONE;
   1782             }
   1783 
   1784             /* If we (now or from before) have no firstByte, a firstByte from the
   1785              branch becomes a reqByte if there isn't a branch reqByte. */
   1786 
   1787             if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
   1788                 branchReqByte = branchFirstByte;
   1789 
   1790             /* Now ensure that the reqbytes match */
   1791 
   1792             if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
   1793                 reqByte = REQ_NONE;
   1794             else
   1795                 reqByte |= branchReqByte;   /* To "or" REQ_VARY */
   1796         }
   1797 
   1798         /* Reached end of expression, either ')' or end of pattern. Go back through
   1799          the alternative branches and reverse the chain of offsets, with the field in
   1800          the BRA item now becoming an offset to the first alternative. If there are
   1801          no alternatives, it points to the end of the group. The length in the
   1802          terminating ket is always the length of the whole bracketed item. If any of
   1803          the ims options were changed inside the group, compile a resetting op-code
   1804          following, except at the very end of the pattern. Return leaving the pointer
   1805          at the terminating char. */
   1806 
   1807         if (ptr >= patternEnd || *ptr != '|') {
   1808             int length = code - lastBranch;
   1809             do {
   1810                 int prevLength = getLinkValueAllowZero(lastBranch + 1);
   1811                 putLinkValue(lastBranch + 1, length);
   1812                 length = prevLength;
   1813                 lastBranch -= length;
   1814             } while (length > 0);
   1815 
   1816             /* Fill in the ket */
   1817 
   1818             *code = OP_KET;
   1819             putLinkValue(code + 1, code - start_bracket);
   1820             code += 1 + LINK_SIZE;
   1821 
   1822             /* Set values to pass back */
   1823 
   1824             *codePtr = code;
   1825             *ptrPtr = ptr;
   1826             *firstbyteptr = firstByte;
   1827             *reqbyteptr = reqByte;
   1828             return true;
   1829         }
   1830 
   1831         /* Another branch follows; insert an "or" node. Its length field points back
   1832          to the previous branch while the bracket remains open. At the end the chain
   1833          is reversed. It's done like this so that the start of the bracket has a
   1834          zero offset until it is closed, making it possible to detect recursion. */
   1835 
   1836         *code = OP_ALT;
   1837         putLinkValue(code + 1, code - lastBranch);
   1838         lastBranch = code;
   1839         code += 1 + LINK_SIZE;
   1840         ptr++;
   1841     }
   1842     ASSERT_NOT_REACHED();
   1843 }
   1844 
   1845 /*************************************************
   1846 *          Check for anchored expression         *
   1847 *************************************************/
   1848 
   1849 /* Try to find out if this is an anchored regular expression. Consider each
   1850 alternative branch. If they all start OP_CIRC, or with a bracket
   1851 all of whose alternatives start OP_CIRC (recurse ad lib), then
   1852 it's anchored.
   1853 
   1854 Arguments:
   1855   code          points to start of expression (the bracket)
   1856   captureMap    a bitmap of which brackets we are inside while testing; this
   1857                  handles up to substring 31; all brackets after that share
   1858                  the zero bit
   1859   backrefMap    the back reference bitmap
   1860 */
   1861 
   1862 static bool branchIsAnchored(const unsigned char* code)
   1863 {
   1864     const unsigned char* scode = firstSignificantOpcode(code);
   1865     int op = *scode;
   1866 
   1867     /* Brackets */
   1868     if (op >= OP_BRA || op == OP_ASSERT)
   1869         return bracketIsAnchored(scode);
   1870 
   1871     /* Check for explicit anchoring */
   1872     return op == OP_CIRC;
   1873 }
   1874 
   1875 static bool bracketIsAnchored(const unsigned char* code)
   1876 {
   1877     do {
   1878         if (!branchIsAnchored(code + 1 + LINK_SIZE))
   1879             return false;
   1880         code += getLinkValue(code + 1);
   1881     } while (*code == OP_ALT);   /* Loop for each alternative */
   1882     return true;
   1883 }
   1884 
   1885 /*************************************************
   1886 *         Check for starting with ^ or .*        *
   1887 *************************************************/
   1888 
   1889 /* This is called to find out if every branch starts with ^ or .* so that
   1890 "first char" processing can be done to speed things up in multiline
   1891 matching and for non-DOTALL patterns that start with .* (which must start at
   1892 the beginning or after \n)
   1893 
   1894 Except when the .* appears inside capturing parentheses, and there is a
   1895 subsequent back reference to those parentheses. By keeping a bitmap of the
   1896 first 31 back references, we can catch some of the more common cases more
   1897 precisely; all the greater back references share a single bit.
   1898 
   1899 Arguments:
   1900   code          points to start of expression (the bracket)
   1901   captureMap    a bitmap of which brackets we are inside while testing; this
   1902                  handles up to substring 31; all brackets after that share
   1903                  the zero bit
   1904   backrefMap    the back reference bitmap
   1905 */
   1906 
   1907 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
   1908 {
   1909     const unsigned char* scode = firstSignificantOpcode(code);
   1910     int op = *scode;
   1911 
   1912     /* Capturing brackets */
   1913     if (op > OP_BRA) {
   1914         int captureNum = op - OP_BRA;
   1915         if (captureNum > EXTRACT_BASIC_MAX)
   1916             captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
   1917         int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
   1918         return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
   1919     }
   1920 
   1921     /* Other brackets */
   1922     if (op == OP_BRA || op == OP_ASSERT)
   1923         return bracketNeedsLineStart(scode, captureMap, backrefMap);
   1924 
   1925     /* .* means "start at start or after \n" if it isn't in brackets that
   1926      may be referenced. */
   1927 
   1928     if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
   1929         return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
   1930 
   1931     /* Explicit ^ */
   1932     return op == OP_CIRC || op == OP_BOL;
   1933 }
   1934 
   1935 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
   1936 {
   1937     do {
   1938         if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
   1939             return false;
   1940         code += getLinkValue(code + 1);
   1941     } while (*code == OP_ALT);  /* Loop for each alternative */
   1942     return true;
   1943 }
   1944 
   1945 /*************************************************
   1946 *       Check for asserted fixed first char      *
   1947 *************************************************/
   1948 
   1949 /* During compilation, the "first char" settings from forward assertions are
   1950 discarded, because they can cause conflicts with actual literals that follow.
   1951 However, if we end up without a first char setting for an unanchored pattern,
   1952 it is worth scanning the regex to see if there is an initial asserted first
   1953 char. If all branches start with the same asserted char, or with a bracket all
   1954 of whose alternatives start with the same asserted char (recurse ad lib), then
   1955 we return that char, otherwise -1.
   1956 
   1957 Arguments:
   1958   code       points to start of expression (the bracket)
   1959   options    pointer to the options (used to check casing changes)
   1960   inassert   true if in an assertion
   1961 
   1962 Returns:     -1 or the fixed first char
   1963 */
   1964 
   1965 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
   1966 {
   1967     const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
   1968     int op = *scode;
   1969 
   1970     if (op >= OP_BRA)
   1971         op = OP_BRA;
   1972 
   1973     switch (op) {
   1974         default:
   1975             return -1;
   1976 
   1977         case OP_BRA:
   1978         case OP_ASSERT:
   1979             return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
   1980 
   1981         case OP_EXACT:
   1982             scode += 2;
   1983             /* Fall through */
   1984 
   1985         case OP_CHAR:
   1986         case OP_CHAR_IGNORING_CASE:
   1987         case OP_ASCII_CHAR:
   1988         case OP_ASCII_LETTER_IGNORING_CASE:
   1989         case OP_PLUS:
   1990         case OP_MINPLUS:
   1991             if (!inassert)
   1992                 return -1;
   1993             return scode[1];
   1994     }
   1995 }
   1996 
   1997 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
   1998 {
   1999     int c = -1;
   2000     do {
   2001         int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
   2002         if (d < 0)
   2003             return -1;
   2004         if (c < 0)
   2005             c = d;
   2006         else if (c != d)
   2007             return -1;
   2008         code += getLinkValue(code + 1);
   2009     } while (*code == OP_ALT);
   2010     return c;
   2011 }
   2012 
   2013 static inline int multiplyWithOverflowCheck(int a, int b)
   2014 {
   2015     if (!a || !b)
   2016         return 0;
   2017     if (a > MAX_PATTERN_SIZE / b)
   2018         return -1;
   2019     return a * b;
   2020 }
   2021 
   2022 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
   2023     CompileData& cd, ErrorCode& errorcode)
   2024 {
   2025     /* Make a pass over the pattern to compute the
   2026      amount of store required to hold the compiled code. This does not have to be
   2027      perfect as long as errors are overestimates. */
   2028 
   2029     if (patternLength > MAX_PATTERN_SIZE) {
   2030         errorcode = ERR16;
   2031         return -1;
   2032     }
   2033 
   2034     int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
   2035     int branch_extra = 0;
   2036     int lastitemlength = 0;
   2037     unsigned brastackptr = 0;
   2038     int brastack[BRASTACK_SIZE];
   2039     unsigned char bralenstack[BRASTACK_SIZE];
   2040     int bracount = 0;
   2041 
   2042     const UChar* ptr = (const UChar*)(pattern - 1);
   2043     const UChar* patternEnd = (const UChar*)(pattern + patternLength);
   2044 
   2045     while (++ptr < patternEnd) {
   2046         int minRepeats = 0, maxRepeats = 0;
   2047         int c = *ptr;
   2048 
   2049         switch (c) {
   2050             /* A backslashed item may be an escaped data character or it may be a
   2051              character type. */
   2052 
   2053             case '\\':
   2054                 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
   2055                 if (errorcode != 0)
   2056                     return -1;
   2057 
   2058                 lastitemlength = 1;     /* Default length of last item for repeats */
   2059 
   2060                 if (c >= 0) {            /* Data character */
   2061                     length += 2;          /* For a one-byte character */
   2062 
   2063                     if (c > 127) {
   2064                         int i;
   2065                         for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
   2066                             if (c <= jsc_pcre_utf8_table1[i]) break;
   2067                         length += i;
   2068                         lastitemlength += i;
   2069                     }
   2070 
   2071                     continue;
   2072                 }
   2073 
   2074                 /* Other escapes need one byte */
   2075 
   2076                 length++;
   2077 
   2078                 /* A back reference needs an additional 2 bytes, plus either one or 5
   2079                  bytes for a repeat. We also need to keep the value of the highest
   2080                  back reference. */
   2081 
   2082                 if (c <= -ESC_REF) {
   2083                     int refnum = -c - ESC_REF;
   2084                     cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
   2085                     if (refnum > cd.topBackref)
   2086                         cd.topBackref = refnum;
   2087                     length += 2;   /* For single back reference */
   2088                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
   2089                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
   2090                         if (errorcode)
   2091                             return -1;
   2092                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
   2093                             (minRepeats == 1 && maxRepeats == -1))
   2094                             length++;
   2095                         else
   2096                             length += 5;
   2097                         if (safelyCheckNextChar(ptr, patternEnd, '?'))
   2098                             ptr++;
   2099                     }
   2100                 }
   2101                 continue;
   2102 
   2103             case '^':     /* Single-byte metacharacters */
   2104             case '.':
   2105             case '$':
   2106                 length++;
   2107                 lastitemlength = 1;
   2108                 continue;
   2109 
   2110             case '*':            /* These repeats won't be after brackets; */
   2111             case '+':            /* those are handled separately */
   2112             case '?':
   2113                 length++;
   2114                 goto POSSESSIVE;
   2115 
   2116             /* This covers the cases of braced repeats after a single char, metachar,
   2117              class, or back reference. */
   2118 
   2119             case '{':
   2120                 if (!isCountedRepeat(ptr + 1, patternEnd))
   2121                     goto NORMAL_CHAR;
   2122                 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
   2123                 if (errorcode != 0)
   2124                     return -1;
   2125 
   2126                 /* These special cases just insert one extra opcode */
   2127 
   2128                 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
   2129                     (minRepeats == 1 && maxRepeats == -1))
   2130                     length++;
   2131 
   2132                 /* These cases might insert additional copies of a preceding character. */
   2133 
   2134                 else {
   2135                     if (minRepeats != 1) {
   2136                         length -= lastitemlength;   /* Uncount the original char or metachar */
   2137                         if (minRepeats > 0)
   2138                             length += 3 + lastitemlength;
   2139                     }
   2140                     length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
   2141                 }
   2142 
   2143                 if (safelyCheckNextChar(ptr, patternEnd, '?'))
   2144                     ptr++;      /* Needs no extra length */
   2145 
   2146             POSSESSIVE:                     /* Test for possessive quantifier */
   2147                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
   2148                     ptr++;
   2149                     length += 2 + 2 * LINK_SIZE;   /* Allow for atomic brackets */
   2150                 }
   2151                 continue;
   2152 
   2153             /* An alternation contains an offset to the next branch or ket. If any ims
   2154              options changed in the previous branch(es), and/or if we are in a
   2155              lookbehind assertion, extra space will be needed at the start of the
   2156              branch. This is handled by branch_extra. */
   2157 
   2158             case '|':
   2159                 if (brastackptr == 0)
   2160                     cd.needOuterBracket = true;
   2161                 length += 1 + LINK_SIZE + branch_extra;
   2162                 continue;
   2163 
   2164             /* A character class uses 33 characters provided that all the character
   2165              values are less than 256. Otherwise, it uses a bit map for low valued
   2166              characters, and individual items for others. Don't worry about character
   2167              types that aren't allowed in classes - they'll get picked up during the
   2168              compile. A character class that contains only one single-byte character
   2169              uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
   2170              where we can. (In UTF-8 mode we can do this only for chars < 128.) */
   2171 
   2172             case '[': {
   2173                 int class_optcount;
   2174                 if (*(++ptr) == '^') {
   2175                     class_optcount = 10;  /* Greater than one */
   2176                     ptr++;
   2177                 }
   2178                 else
   2179                     class_optcount = 0;
   2180 
   2181                 bool class_utf8 = false;
   2182 
   2183                 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
   2184                     /* Check for escapes */
   2185 
   2186                     if (*ptr == '\\') {
   2187                         c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
   2188                         if (errorcode != 0)
   2189                             return -1;
   2190 
   2191                         /* Handle escapes that turn into characters */
   2192 
   2193                         if (c >= 0)
   2194                             goto NON_SPECIAL_CHARACTER;
   2195 
   2196                         /* Escapes that are meta-things. The normal ones just affect the
   2197                          bit map, but Unicode properties require an XCLASS extended item. */
   2198 
   2199                         else
   2200                             class_optcount = 10;         /* \d, \s etc; make sure > 1 */
   2201                     }
   2202 
   2203                     /* Anything else increments the possible optimization count. We have to
   2204                      detect ranges here so that we can compute the number of extra ranges for
   2205                      caseless wide characters when UCP support is available. If there are wide
   2206                      characters, we are going to have to use an XCLASS, even for single
   2207                      characters. */
   2208 
   2209                     else {
   2210                         c = *ptr;
   2211 
   2212                         /* Come here from handling \ above when it escapes to a char value */
   2213 
   2214                     NON_SPECIAL_CHARACTER:
   2215                         class_optcount++;
   2216 
   2217                         int d = -1;
   2218                         if (safelyCheckNextChar(ptr, patternEnd, '-')) {
   2219                             const UChar* hyptr = ptr++;
   2220                             if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
   2221                                 ptr++;
   2222                                 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
   2223                                 if (errorcode != 0)
   2224                                     return -1;
   2225                             }
   2226                             else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
   2227                                 d = *++ptr;
   2228                             if (d < 0)
   2229                                 ptr = hyptr;      /* go back to hyphen as data */
   2230                         }
   2231 
   2232                         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
   2233                          127 for caseless matching, we will need to use an XCLASS. */
   2234 
   2235                         if (d >= 0) {
   2236                             class_optcount = 10;     /* Ensure > 1 */
   2237                             if (d < c) {
   2238                                 errorcode = ERR8;
   2239                                 return -1;
   2240                             }
   2241 
   2242                             if ((d > 255 || (ignoreCase && d > 127))) {
   2243                                 unsigned char buffer[6];
   2244                                 if (!class_utf8)         /* Allow for XCLASS overhead */
   2245                                 {
   2246                                     class_utf8 = true;
   2247                                     length += LINK_SIZE + 2;
   2248                                 }
   2249 
   2250                                 /* If we have UCP support, find out how many extra ranges are
   2251                                  needed to map the other case of characters within this range. We
   2252                                  have to mimic the range optimization here, because extending the
   2253                                  range upwards might push d over a boundary that makes it use
   2254                                  another byte in the UTF-8 representation. */
   2255 
   2256                                 if (ignoreCase) {
   2257                                     int occ, ocd;
   2258                                     int cc = c;
   2259                                     int origd = d;
   2260                                     while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
   2261                                         if (occ >= c && ocd <= d)
   2262                                             continue;   /* Skip embedded */
   2263 
   2264                                         if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
   2265                                         {                            /* if there is overlap,   */
   2266                                             c = occ;                     /* noting that if occ < c */
   2267                                             continue;                    /* we can't have ocd > d  */
   2268                                         }                            /* because a subrange is  */
   2269                                         if (ocd > d && occ <= d + 1)   /* always shorter than    */
   2270                                         {                            /* the basic range.       */
   2271                                             d = ocd;
   2272                                             continue;
   2273                                         }
   2274 
   2275                                         /* An extra item is needed */
   2276 
   2277                                         length += 1 + encodeUTF8(occ, buffer) +
   2278                                         ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
   2279                                     }
   2280                                 }
   2281 
   2282                                 /* The length of the (possibly extended) range */
   2283 
   2284                                 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
   2285                             }
   2286 
   2287                         }
   2288 
   2289                         /* We have a single character. There is nothing to be done unless we
   2290                          are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
   2291                          allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
   2292                          support. */
   2293 
   2294                         else {
   2295                             if ((c > 255 || (ignoreCase && c > 127))) {
   2296                                 unsigned char buffer[6];
   2297                                 class_optcount = 10;     /* Ensure > 1 */
   2298                                 if (!class_utf8)         /* Allow for XCLASS overhead */
   2299                                 {
   2300                                     class_utf8 = true;
   2301                                     length += LINK_SIZE + 2;
   2302                                 }
   2303                                 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
   2304                             }
   2305                         }
   2306                     }
   2307                 }
   2308 
   2309                 if (ptr >= patternEnd) {   /* Missing terminating ']' */
   2310                     errorcode = ERR6;
   2311                     return -1;
   2312                 }
   2313 
   2314                 /* We can optimize when there was only one optimizable character.
   2315                  Note that this does not detect the case of a negated single character.
   2316                  In that case we do an incorrect length computation, but it's not a serious
   2317                  problem because the computed length is too large rather than too small. */
   2318 
   2319                 if (class_optcount == 1)
   2320                     goto NORMAL_CHAR;
   2321 
   2322                 /* Here, we handle repeats for the class opcodes. */
   2323                 {
   2324                     length += 33;
   2325 
   2326                     /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
   2327                      we also need extra for wrapping the whole thing in a sub-pattern. */
   2328 
   2329                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
   2330                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
   2331                         if (errorcode != 0)
   2332                             return -1;
   2333                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
   2334                             (minRepeats == 1 && maxRepeats == -1))
   2335                             length++;
   2336                         else
   2337                             length += 5;
   2338                         if (safelyCheckNextChar(ptr, patternEnd, '+')) {
   2339                             ptr++;
   2340                             length += 2 + 2 * LINK_SIZE;
   2341                         } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
   2342                             ptr++;
   2343                     }
   2344                 }
   2345                 continue;
   2346             }
   2347 
   2348             /* Brackets may be genuine groups or special things */
   2349 
   2350             case '(': {
   2351                 int branch_newextra = 0;
   2352                 int bracket_length = 1 + LINK_SIZE;
   2353                 bool capturing = false;
   2354 
   2355                 /* Handle special forms of bracket, which all start (? */
   2356 
   2357                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
   2358                     switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
   2359                         /* Non-referencing groups and lookaheads just move the pointer on, and
   2360                          then behave like a non-special bracket, except that they don't increment
   2361                          the count of extracting brackets. Ditto for the "once only" bracket,
   2362                          which is in Perl from version 5.005. */
   2363 
   2364                         case ':':
   2365                         case '=':
   2366                         case '!':
   2367                             ptr += 2;
   2368                             break;
   2369 
   2370                         /* Else loop checking valid options until ) is met. Anything else is an
   2371                          error. If we are without any brackets, i.e. at top level, the settings
   2372                          act as if specified in the options, so massage the options immediately.
   2373                          This is for backward compatibility with Perl 5.004. */
   2374 
   2375                         default:
   2376                             errorcode = ERR12;
   2377                             return -1;
   2378                     }
   2379                 } else
   2380                     capturing = 1;
   2381 
   2382                 /* Capturing brackets must be counted so we can process escapes in a
   2383                  Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
   2384                  an additional 3 bytes of memory per capturing bracket. */
   2385 
   2386                 if (capturing) {
   2387                     bracount++;
   2388                     if (bracount > EXTRACT_BASIC_MAX)
   2389                         bracket_length += 3;
   2390                 }
   2391 
   2392                 /* Save length for computing whole length at end if there's a repeat that
   2393                  requires duplication of the group. Also save the current value of
   2394                  branch_extra, and start the new group with the new value. If non-zero, this
   2395                  will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
   2396 
   2397                 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
   2398                     errorcode = ERR17;
   2399                     return -1;
   2400                 }
   2401 
   2402                 bralenstack[brastackptr] = branch_extra;
   2403                 branch_extra = branch_newextra;
   2404 
   2405                 brastack[brastackptr++] = length;
   2406                 length += bracket_length;
   2407                 continue;
   2408             }
   2409 
   2410             /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
   2411              have to replicate this bracket up to that many times. If brastackptr is
   2412              0 this is an unmatched bracket which will generate an error, but take care
   2413              not to try to access brastack[-1] when computing the length and restoring
   2414              the branch_extra value. */
   2415 
   2416             case ')': {
   2417                 int duplength;
   2418                 length += 1 + LINK_SIZE;
   2419                 if (brastackptr > 0) {
   2420                     duplength = length - brastack[--brastackptr];
   2421                     branch_extra = bralenstack[brastackptr];
   2422                 }
   2423                 else
   2424                     duplength = 0;
   2425 
   2426                 /* Leave ptr at the final char; for readRepeatCounts this happens
   2427                  automatically; for the others we need an increment. */
   2428 
   2429                 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
   2430                     ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
   2431                     if (errorcode)
   2432                         return -1;
   2433                 } else if (c == '*') {
   2434                     minRepeats = 0;
   2435                     maxRepeats = -1;
   2436                     ptr++;
   2437                 } else if (c == '+') {
   2438                     minRepeats = 1;
   2439                     maxRepeats = -1;
   2440                     ptr++;
   2441                 } else if (c == '?') {
   2442                     minRepeats = 0;
   2443                     maxRepeats = 1;
   2444                     ptr++;
   2445                 } else {
   2446                     minRepeats = 1;
   2447                     maxRepeats = 1;
   2448                 }
   2449 
   2450                 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
   2451                  group, and if the maximum is greater than zero, we have to replicate
   2452                  maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
   2453                  bracket set. */
   2454 
   2455                 int repeatsLength;
   2456                 if (minRepeats == 0) {
   2457                     length++;
   2458                     if (maxRepeats > 0) {
   2459                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE);
   2460                         if (repeatsLength < 0) {
   2461                             errorcode = ERR16;
   2462                             return -1;
   2463                         }
   2464                         length += repeatsLength;
   2465                         if (length > MAX_PATTERN_SIZE) {
   2466                             errorcode = ERR16;
   2467                             return -1;
   2468                         }
   2469                     }
   2470                 }
   2471 
   2472                 /* When the minimum is greater than zero, we have to replicate up to
   2473                  minval-1 times, with no additions required in the copies. Then, if there
   2474                  is a limited maximum we have to replicate up to maxval-1 times allowing
   2475                  for a BRAZERO item before each optional copy and nesting brackets for all
   2476                  but one of the optional copies. */
   2477 
   2478                 else {
   2479                     repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
   2480                     if (repeatsLength < 0) {
   2481                         errorcode = ERR16;
   2482                         return -1;
   2483                     }
   2484                     length += repeatsLength;
   2485                     if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
   2486                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE);
   2487                         if (repeatsLength < 0) {
   2488                             errorcode = ERR16;
   2489                             return -1;
   2490                         }
   2491                         length += repeatsLength - (2 + 2 * LINK_SIZE);
   2492                     }
   2493                     if (length > MAX_PATTERN_SIZE) {
   2494                         errorcode = ERR16;
   2495                         return -1;
   2496                     }
   2497                 }
   2498 
   2499                 /* Allow space for once brackets for "possessive quantifier" */
   2500 
   2501                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
   2502                     ptr++;
   2503                     length += 2 + 2 * LINK_SIZE;
   2504                 }
   2505                 continue;
   2506             }
   2507 
   2508             /* Non-special character. It won't be space or # in extended mode, so it is
   2509              always a genuine character. If we are in a \Q...\E sequence, check for the
   2510              end; if not, we have a literal. */
   2511 
   2512             default:
   2513             NORMAL_CHAR:
   2514                 length += 2;          /* For a one-byte character */
   2515                 lastitemlength = 1;   /* Default length of last item for repeats */
   2516 
   2517                 if (c > 127) {
   2518                     int i;
   2519                     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
   2520                         if (c <= jsc_pcre_utf8_table1[i])
   2521                             break;
   2522                     length += i;
   2523                     lastitemlength += i;
   2524                 }
   2525 
   2526                 continue;
   2527         }
   2528     }
   2529 
   2530     length += 2 + LINK_SIZE;    /* For final KET and END */
   2531 
   2532     cd.numCapturingBrackets = bracount;
   2533     return length;
   2534 }
   2535 
   2536 /*************************************************
   2537 *        Compile a Regular Expression            *
   2538 *************************************************/
   2539 
   2540 /* This function takes a string and returns a pointer to a block of store
   2541 holding a compiled version of the expression. The original API for this
   2542 function had no error code return variable; it is retained for backwards
   2543 compatibility. The new function is given a new name.
   2544 
   2545 Arguments:
   2546   pattern       the regular expression
   2547   options       various option bits
   2548   errorCodePtr  pointer to error code variable (pcre_compile2() only)
   2549                   can be NULL if you don't want a code value
   2550   errorPtr      pointer to pointer to error text
   2551   erroroffset   ptr offset in pattern where error was detected
   2552   tables        pointer to character tables or NULL
   2553 
   2554 Returns:        pointer to compiled data block, or NULL on error,
   2555                 with errorPtr and erroroffset set
   2556 */
   2557 
   2558 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr)
   2559 {
   2560     *errorPtr = errorText(errorcode);
   2561     return 0;
   2562 }
   2563 
   2564 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
   2565                 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
   2566                 unsigned* numSubpatterns, const char** errorPtr)
   2567 {
   2568     /* We can't pass back an error message if errorPtr is NULL; I guess the best we
   2569      can do is just return NULL, but we can set a code value if there is a code pointer. */
   2570     if (!errorPtr)
   2571         return 0;
   2572     *errorPtr = NULL;
   2573 
   2574     CompileData cd;
   2575 
   2576     ErrorCode errorcode = ERR0;
   2577     /* Call this once just to count the brackets. */
   2578     calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
   2579     /* Call it again to compute the length. */
   2580     int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
   2581     if (errorcode)
   2582         return returnError(errorcode, errorPtr);
   2583 
   2584     if (length > MAX_PATTERN_SIZE)
   2585         return returnError(ERR16, errorPtr);
   2586 
   2587     size_t size = length + sizeof(JSRegExp);
   2588 #if REGEXP_HISTOGRAM
   2589     size_t stringOffset = (size + sizeof(UChar) - 1) / sizeof(UChar) * sizeof(UChar);
   2590     size = stringOffset + patternLength * sizeof(UChar);
   2591 #endif
   2592     JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
   2593 
   2594     if (!re)
   2595         return returnError(ERR13, errorPtr);
   2596 
   2597     re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
   2598 
   2599     /* The starting points of the name/number translation table and of the code are
   2600      passed around in the compile data block. */
   2601 
   2602     const unsigned char* codeStart = (const unsigned char*)(re + 1);
   2603 
   2604     /* Set up a starting, non-extracting bracket, then compile the expression. On
   2605      error, errorcode will be set non-zero, so we don't need to look at the result
   2606      of the function here. */
   2607 
   2608     const UChar* ptr = (const UChar*)pattern;
   2609     const UChar* patternEnd = pattern + patternLength;
   2610     unsigned char* code = const_cast<unsigned char*>(codeStart);
   2611     int firstByte, reqByte;
   2612     int bracketCount = 0;
   2613     if (!cd.needOuterBracket)
   2614         compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd);
   2615     else {
   2616         *code = OP_BRA;
   2617         compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstByte, &reqByte, cd);
   2618     }
   2619     re->topBracket = bracketCount;
   2620     re->topBackref = cd.topBackref;
   2621 
   2622     /* If not reached end of pattern on success, there's an excess bracket. */
   2623 
   2624     if (errorcode == 0 && ptr < patternEnd)
   2625         errorcode = ERR10;
   2626 
   2627     /* Fill in the terminating state and check for disastrous overflow, but
   2628      if debugging, leave the test till after things are printed out. */
   2629 
   2630     *code++ = OP_END;
   2631 
   2632     ASSERT(code - codeStart <= length);
   2633     if (code - codeStart > length)
   2634         errorcode = ERR7;
   2635 
   2636     /* Give an error if there's back reference to a non-existent capturing
   2637      subpattern. */
   2638 
   2639     if (re->topBackref > re->topBracket)
   2640         errorcode = ERR15;
   2641 
   2642     /* Failed to compile, or error while post-processing */
   2643 
   2644     if (errorcode != ERR0) {
   2645         delete [] reinterpret_cast<char*>(re);
   2646         return returnError(errorcode, errorPtr);
   2647     }
   2648 
   2649     /* If the anchored option was not passed, set the flag if we can determine that
   2650      the pattern is anchored by virtue of ^ characters or \A or anything else (such
   2651      as starting with .* when DOTALL is set).
   2652 
   2653      Otherwise, if we know what the first character has to be, save it, because that
   2654      speeds up unanchored matches no end. If not, see if we can set the
   2655      UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
   2656      start with ^. and also when all branches start with .* for non-DOTALL matches.
   2657      */
   2658 
   2659     if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
   2660         re->options |= IsAnchoredOption;
   2661     else {
   2662         if (firstByte < 0) {
   2663             firstByte = (cd.needOuterBracket
   2664                     ? bracketFindFirstAssertedCharacter(codeStart, false)
   2665                     : branchFindFirstAssertedCharacter(codeStart, false))
   2666                 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
   2667         }
   2668         if (firstByte >= 0) {
   2669             int ch = firstByte & 255;
   2670             if (ch < 127) {
   2671                 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte;
   2672                 re->options |= UseFirstByteOptimizationOption;
   2673             }
   2674         } else {
   2675             if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
   2676                 re->options |= UseMultiLineFirstByteOptimizationOption;
   2677         }
   2678     }
   2679 
   2680     /* For an anchored pattern, we use the "required byte" only if it follows a
   2681      variable length item in the regex. Remove the caseless flag for non-caseable
   2682      bytes. */
   2683 
   2684     if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
   2685         int ch = reqByte & 255;
   2686         if (ch < 127) {
   2687             re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
   2688             re->options |= UseRequiredByteOptimizationOption;
   2689         }
   2690     }
   2691 
   2692 #if REGEXP_HISTOGRAM
   2693     re->stringOffset = stringOffset;
   2694     re->stringLength = patternLength;
   2695     memcpy(reinterpret_cast<char*>(re) + stringOffset, pattern, patternLength * 2);
   2696 #endif
   2697 
   2698     if (numSubpatterns)
   2699         *numSubpatterns = re->topBracket;
   2700     return re;
   2701 }
   2702 
   2703 void jsRegExpFree(JSRegExp* re)
   2704 {
   2705     delete [] reinterpret_cast<char*>(re);
   2706 }
   2707