Home | History | Annotate | Download | only in src
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9      Original API code Copyright (c) 1997-2012 University of Cambridge
     10          New API code Copyright (c) 2016 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 
     42 #ifdef HAVE_CONFIG_H
     43 #include "config.h"
     44 #endif
     45 
     46 #define NLBLOCK cb             /* Block containing newline information */
     47 #define PSSTART start_pattern  /* Field containing processed string start */
     48 #define PSEND   end_pattern    /* Field containing processed string end */
     49 
     50 #include "pcre2_internal.h"
     51 
     52 /* In rare error cases debugging might require calling pcre2_printint(). */
     53 
     54 #if 0
     55 #ifdef EBCDIC
     56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
     57 #else
     58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
     59 #endif
     60 #include "pcre2_printint.c"
     61 #define CALL_PRINTINT
     62 #endif
     63 
     64 /* There are a few things that vary with different code unit sizes. Handle them
     65 by defining macros in order to minimize #if usage. */
     66 
     67 #if PCRE2_CODE_UNIT_WIDTH == 8
     68 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
     69 #define XDIGIT(c)                xdigitab[c]
     70 
     71 #else  /* Either 16-bit or 32-bit */
     72 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
     73 
     74 #if PCRE2_CODE_UNIT_WIDTH == 16
     75 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
     76 
     77 #else  /* 32-bit */
     78 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
     79 #endif
     80 #endif
     81 
     82 /* Function declarations (prototypes) to allow mutual recursion */
     83 
     84 static unsigned int
     85   add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
     86     const uint32_t *, unsigned int);
     87 
     88 static BOOL
     89   compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL,
     90     uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *,
     91     branch_chain *, compile_block *, size_t *);
     92 
     93 
     94 
     95 /*************************************************
     96 *      Code parameters and static tables         *
     97 *************************************************/
     98 
     99 /* This value specifies the size of stack workspace, which is used in different
    100 ways in the different pattern scans. The group-identifying pre-scan uses it to
    101 handle nesting, and needs it to be 16-bit aligned.
    102 
    103 During the first compiling phase, when determining how much memory is required,
    104 the regex is partly compiled into this space, but the compiled parts are
    105 discarded as soon as they can be, so that hopefully there will never be an
    106 overrun. The code does, however, check for an overrun, which can occur for
    107 pathological patterns. The size of the workspace depends on LINK_SIZE because
    108 the length of compiled items varies with this.
    109 
    110 In the real compile phase, the workspace is used for remembering data about
    111 numbered groups, provided there are not too many of them (if there are, extra
    112 memory is acquired). For this phase the memory must be 32-bit aligned. Having
    113 defined the size in code units, we set up C32_WORK_SIZE as the number of
    114 elements in the 32-bit vector. */
    115 
    116 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)   /* Size in code units */
    117 
    118 #define C32_WORK_SIZE \
    119   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint32_t))
    120 
    121 /* The overrun tests check for a slightly smaller size so that they detect the
    122 overrun before it actually does run off the end of the data block. */
    123 
    124 #define WORK_SIZE_SAFETY_MARGIN (100)
    125 
    126 /* This value determines the size of the initial vector that is used for
    127 remembering named groups during the pre-compile. It is allocated on the stack,
    128 but if it is too small, it is expanded, in a similar way to the workspace. The
    129 value is the number of slots in the list. */
    130 
    131 #define NAMED_GROUP_LIST_SIZE  20
    132 
    133 /* The original PCRE required patterns to be zero-terminated, and it simplifies
    134 the compiling code if it is guaranteed that there is a zero code unit at the
    135 end of the pattern, because this means that tests for coding sequences such as
    136 (*SKIP) or even just (?<= can check a sequence of code units without having to
    137 keep checking for the end of the pattern. The new PCRE2 API allows zero code
    138 units within patterns if a positive length is given, but in order to keep most
    139 of the compiling code as it was, we copy such patterns and add a zero on the
    140 end. This value determines the size of space on the stack that is used if the
    141 pattern fits; if not, heap memory is used. */
    142 
    143 #define COPIED_PATTERN_SIZE 1024
    144 
    145 /* Maximum length value to check against when making sure that the variable
    146 that holds the compiled pattern length does not overflow. We make it a bit less
    147 than INT_MAX to allow for adding in group terminating bytes, so that we don't
    148 have to check them every time. */
    149 
    150 #define OFLOW_MAX (INT_MAX - 20)
    151 
    152 /* Macro for setting individual bits in class bitmaps. It took some
    153 experimenting to figure out how to stop gcc 5.3.0 from warning with
    154 -Wconversion. This version gets a warning:
    155 
    156   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))
    157 
    158 Let's hope the apparently less efficient version isn't actually so bad if the
    159 compiler is clever with identical subexpressions. */
    160 
    161 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1 << ((b)&7)))
    162 
    163 /* Private flags added to firstcu and reqcu. */
    164 
    165 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
    166 #define REQ_VARY        (1 << 1)        /* reqcu followed non-literal item */
    167 /* Negative values for the firstcu and reqcu flags */
    168 #define REQ_UNSET       (-2)            /* Not yet found anything */
    169 #define REQ_NONE        (-1)            /* Found not fixed char */
    170 
    171 /* These flags are used in the groupinfo vector. */
    172 
    173 #define GI_SET_COULD_BE_EMPTY  0x80000000u
    174 #define GI_COULD_BE_EMPTY      0x40000000u
    175 #define GI_NOT_FIXED_LENGTH    0x20000000u
    176 #define GI_SET_FIXED_LENGTH    0x10000000u
    177 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
    178 
    179 /* This bit (which is greater than any UTF value) is used to indicate that a
    180 variable contains a number of code units instead of an actual code point. */
    181 
    182 #define UTF_LENGTH     0x10000000l
    183 
    184 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
    185 and is fast (a good compiler can turn it into a subtraction and unsigned
    186 comparison). */
    187 
    188 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
    189 
    190 /* Table to identify hex digits. The tables in chartables are dependent on the
    191 locale, and may mark arbitrary characters as digits. We want to recognize only
    192 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
    193 costs 256 bytes, but it is a lot faster than doing character value tests (at
    194 least in some simple cases I timed), and in some applications one wants PCRE to
    195 compile efficiently as well as match efficiently. The value in the table is
    196 the binary hex digit value, or 0xff for non-hex digits. */
    197 
    198 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
    199 UTF-8 mode. */
    200 
    201 #ifndef EBCDIC
    202 static const uint8_t xdigitab[] =
    203   {
    204   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
    205   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
    206   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
    207   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
    208   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
    209   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
    210   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
    211   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
    212   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
    213   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
    214   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
    215   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
    216   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
    217   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
    218   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
    219   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
    220   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
    221   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
    222   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
    223   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
    224   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
    225   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
    226   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
    227   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
    228   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
    229   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
    230   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
    231   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
    232   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
    233   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
    234   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
    235   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
    236 
    237 #else
    238 
    239 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
    240 
    241 static const uint8_t xdigitab[] =
    242   {
    243   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
    244   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
    245   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
    246   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
    247   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
    248   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
    249   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
    250   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
    251   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
    252   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
    253   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
    254   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
    255   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
    256   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
    257   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
    258   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
    259   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
    260   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
    261   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
    262   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
    263   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
    264   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
    265   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
    266   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
    267   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
    268   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
    269   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
    270   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
    271   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
    272   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
    273   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
    274   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
    275 #endif  /* EBCDIC */
    276 
    277 
    278 /* Table for handling alphanumeric escaped characters. Positive returns are
    279 simple data values; negative values are for special things like \d and so on.
    280 Zero means further processing is needed (for things like \x), or the escape is
    281 invalid. */
    282 
    283 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
    284 in UTF-8 mode. It runs from '0' to 'z'. */
    285 
    286 #ifndef EBCDIC
    287 #define ESCAPES_FIRST       CHAR_0      /* First character covered by escapes[] */
    288 #define ESCAPES_LAST        CHAR_z      /* Last character covered by escapes[] */
    289 #define UPPER_CASE(c)       ((c)-32)    /* Argument parenthesized so any expression is safe */
    290 
    291 static const short int escapes[] = {
    292      0,                       0,
    293      0,                       0,
    294      0,                       0,
    295      0,                       0,
    296      0,                       0,
    297      CHAR_COLON,              CHAR_SEMICOLON,
    298      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
    299      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
    300      CHAR_COMMERCIAL_AT,      -ESC_A,
    301      -ESC_B,                  -ESC_C,
    302      -ESC_D,                  -ESC_E,
    303      0,                       -ESC_G,
    304      -ESC_H,                  0,
    305      0,                       -ESC_K,
    306      0,                       0,
    307      -ESC_N,                  0,
    308      -ESC_P,                  -ESC_Q,
    309      -ESC_R,                  -ESC_S,
    310      0,                       0,
    311      -ESC_V,                  -ESC_W,
    312      -ESC_X,                  0,
    313      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
    314      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
    315      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
    316      CHAR_GRAVE_ACCENT,       ESC_a,
    317      -ESC_b,                  0,
    318      -ESC_d,                  ESC_e,
    319      ESC_f,                   0,
    320      -ESC_h,                  0,
    321      0,                       -ESC_k,
    322      0,                       0,
    323      ESC_n,                   0,
    324      -ESC_p,                  0,
    325      ESC_r,                   -ESC_s,
    326      ESC_tee,                 0,
    327      -ESC_v,                  -ESC_w,
    328      0,                       0,
    329      -ESC_z
    330 };
    331 
    332 #else
    333 
    334 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
    335 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
    336 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
    337 because it is defined as 'a', which of course picks up the ASCII value. */
    338 
    339 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
    340 #define ESCAPES_FIRST       CHAR_a
    341 #define ESCAPES_LAST        CHAR_9
    342 #define UPPER_CASE(c)       ((c)+64)  /* Argument parenthesized so any expression is safe */
    343 #else                              /* Testing in an ASCII environment */
    344 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
    345 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
    346 #define UPPER_CASE(c)  ((c)-32)    /* Argument parenthesized so any expression is safe */
    347 #endif
    348 
    349 static const short int escapes[] = {
    350 /*  80 */        ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
    351 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
    352 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
    353 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
    354 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
    355 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
    356 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
    357 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
    358 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
    359 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
    360 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
    361 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
    362 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
    363 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
    364 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
    365 /*  F8 */     0,     0
    366 };
    367 
    368 /* We also need a table of characters that may follow \c in an EBCDIC
    369 environment for characters 0-31. */
    370 
    371 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
    372 
    373 #endif   /* EBCDIC */
    374 
    375 
    376 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
    377 searched linearly. Put all the names into a single string, in order to reduce
    378 the number of relocations when a shared library is dynamically linked. The
    379 string is built from string macros so that it works in UTF-8 mode on EBCDIC
    380 platforms. */
    381 
    382 typedef struct verbitem {
    383   int   len;                 /* Length of verb name */
    384   int   op;                  /* Op when no arg, or -1 if arg mandatory */
    385   int   op_arg;              /* Op when arg present, or -1 if not allowed */
    386 } verbitem;
    387 
    388 static const char verbnames[] =
    389   "\0"                       /* Empty name is a shorthand for MARK */
    390   STRING_MARK0
    391   STRING_ACCEPT0
    392   STRING_COMMIT0
    393   STRING_F0
    394   STRING_FAIL0
    395   STRING_PRUNE0
    396   STRING_SKIP0
    397   STRING_THEN;
    398 
    399 static const verbitem verbs[] = {       /* Entries are in verbnames order */
    400   { 0, -1,        OP_MARK },            /* (empty name): argument mandatory */
    401   { 4, -1,        OP_MARK },            /* MARK: argument mandatory */
    402   { 6, OP_ACCEPT, -1 },                 /* ACCEPT: argument not allowed */
    403   { 6, OP_COMMIT, -1 },                 /* COMMIT: argument not allowed */
    404   { 1, OP_FAIL,   -1 },                 /* F: argument not allowed */
    405   { 4, OP_FAIL,   -1 },                 /* FAIL: argument not allowed */
    406   { 5, OP_PRUNE,  OP_PRUNE_ARG },       /* PRUNE: argument optional */
    407   { 4, OP_SKIP,   OP_SKIP_ARG  },       /* SKIP: argument optional */
    408   { 4, OP_THEN,   OP_THEN_ARG  }        /* THEN: argument optional */
    409 };
    410 
    411 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
    412 
    413 
    414 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
    415 another regex library. */
    416 
    417 static const PCRE2_UCHAR sub_start_of_word[] = {
    418   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
    419   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
    420 
    421 static const PCRE2_UCHAR sub_end_of_word[] = {
    422   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
    423   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
    424   CHAR_RIGHT_PARENTHESIS, '\0' };
    425 
    426 
    427 /* Tables of names of POSIX character classes and their lengths. The names are
    428 now all in a single string, to reduce the number of relocations when a shared
    429 library is dynamically loaded. The list of lengths is terminated by a zero
    430 length entry. The first three must be alpha, lower, upper, as this is assumed
    431 for handling case independence. The indices for graph, print, and punct are
    432 needed, so identify them. */
    433 
    434 static const char posix_names[] =
    435   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
    436   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
    437   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
    438   STRING_word0  STRING_xdigit;
    439 
    440 static const uint8_t posix_name_lengths[] = {
    441   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
    442 
    443 #define PC_GRAPH  8
    444 #define PC_PRINT  9
    445 #define PC_PUNCT 10
    446 
    447 
    448 /* Table of class bit maps for each POSIX class. Each class is formed from a
    449 base map, with an optional addition or removal of another map. Then, for some
    450 classes, there is some additional tweaking: for [:blank:] the vertical space
    451 characters are removed, and for [:alpha:] and [:alnum:] the underscore
    452 character is removed. The triples in the table consist of the base map offset,
    453 second map offset or -1 if no second map, and a non-negative value for map
    454 addition or a negative value for map subtraction (if there are two maps). The
    455 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
    456 remove vertical space characters, 2 => remove underscore. */
    457 
    458 static const int posix_class_maps[] = {
    459   cbit_word,  cbit_digit, -2,             /* alpha */
    460   cbit_lower, -1,          0,             /* lower */
    461   cbit_upper, -1,          0,             /* upper */
    462   cbit_word,  -1,          2,             /* alnum - word without underscore */
    463   cbit_print, cbit_cntrl,  0,             /* ascii */
    464   cbit_space, -1,          1,             /* blank - a GNU extension */
    465   cbit_cntrl, -1,          0,             /* cntrl */
    466   cbit_digit, -1,          0,             /* digit */
    467   cbit_graph, -1,          0,             /* graph */
    468   cbit_print, -1,          0,             /* print */
    469   cbit_punct, -1,          0,             /* punct */
    470   cbit_space, -1,          0,             /* space */
    471   cbit_word,  -1,          0,             /* word - a Perl extension */
    472   cbit_xdigit,-1,          0              /* xdigit */
    473 };
    474 
    475 /* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by
    476 Unicode property escapes. */
    477 
    478 #ifdef SUPPORT_UNICODE
    479 static const PCRE2_UCHAR string_PNd[]  = {
    480   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    481   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    482 static const PCRE2_UCHAR string_pNd[]  = {
    483   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    484   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    485 static const PCRE2_UCHAR string_PXsp[] = {
    486   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    487   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    488 static const PCRE2_UCHAR string_pXsp[] = {
    489   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    490   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    491 static const PCRE2_UCHAR string_PXwd[] = {
    492   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    493   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    494 static const PCRE2_UCHAR string_pXwd[] = {
    495   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    496   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    497 
    498 static PCRE2_SPTR substitutes[] = {
    499   string_PNd,           /* \D */
    500   string_pNd,           /* \d */
    501   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
    502   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
    503   string_PXwd,          /* \W */
    504   string_pXwd           /* \w */
    505 };
    506 
    507 /* The POSIX class substitutes must be in the order of the POSIX class names,
    508 defined above, and there are both positive and negative cases. NULL means no
    509 general substitute of a Unicode property escape (\p or \P). However, for some
    510 POSIX classes (e.g. graph, print, punct) a special property code is compiled
    511 directly. */
    512 
    513 static const PCRE2_UCHAR string_pCc[] =  {
    514   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    515   CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    516 static const PCRE2_UCHAR string_pL[] =   {
    517   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    518   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    519 static const PCRE2_UCHAR string_pLl[] =  {
    520   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    521   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    522 static const PCRE2_UCHAR string_pLu[] =  {
    523   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    524   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    525 static const PCRE2_UCHAR string_pXan[] = {
    526   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    527   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    528 static const PCRE2_UCHAR string_h[] =    {
    529   CHAR_BACKSLASH, CHAR_h, '\0' };
    530 static const PCRE2_UCHAR string_pXps[] = {
    531   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    532   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    533 static const PCRE2_UCHAR string_PCc[] =  {
    534   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    535   CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    536 static const PCRE2_UCHAR string_PL[] =   {
    537   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    538   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    539 static const PCRE2_UCHAR string_PLl[] =  {
    540   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    541   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    542 static const PCRE2_UCHAR string_PLu[] =  {
    543   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    544   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    545 static const PCRE2_UCHAR string_PXan[] = {
    546   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    547   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    548 static const PCRE2_UCHAR string_H[] =    {
    549   CHAR_BACKSLASH, CHAR_H, '\0' };
    550 static const PCRE2_UCHAR string_PXps[] = {
    551   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    552   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    553 
    554 static PCRE2_SPTR posix_substitutes[] = {
    555   string_pL,            /* alpha */
    556   string_pLl,           /* lower */
    557   string_pLu,           /* upper */
    558   string_pXan,          /* alnum */
    559   NULL,                 /* ascii */
    560   string_h,             /* blank */
    561   string_pCc,           /* cntrl */
    562   string_pNd,           /* digit */
    563   NULL,                 /* graph */
    564   NULL,                 /* print */
    565   NULL,                 /* punct */
    566   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
    567   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
    568   NULL,                 /* xdigit */
    569   /* Negated cases */
    570   string_PL,            /* ^alpha */
    571   string_PLl,           /* ^lower */
    572   string_PLu,           /* ^upper */
    573   string_PXan,          /* ^alnum */
    574   NULL,                 /* ^ascii */
    575   string_H,             /* ^blank */
    576   string_PCc,           /* ^cntrl */
    577   string_PNd,           /* ^digit */
    578   NULL,                 /* ^graph */
    579   NULL,                 /* ^print */
    580   NULL,                 /* ^punct */
    581   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
    582   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
    583   NULL                  /* ^xdigit */
    584 };
    585 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *))
    586 #endif  /* SUPPORT_UNICODE */
    587 
    588 /* Masks for checking option settings. */
    589 
    590 #define PUBLIC_COMPILE_OPTIONS \
    591   (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
    592    PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
    593    PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
    594    PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
    595    PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
    596    PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
    597    PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \
    598    PCRE2_UTF)
    599 
    600 /* Compile time error code numbers. They are given names so that they can more
    601 easily be tracked. When a new number is added, the tables called eint1 and
    602 eint2 in pcre2posix.c may need to be updated, and a new error text must be
    603 added to compile_error_texts in pcre2_error.c. */
    604 
    605 enum { ERR0 = COMPILE_ERROR_BASE,
    606        ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
    607        ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
    608        ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
    609        ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
    610        ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
    611        ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
    612        ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
    613        ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
    614        ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88 };
    615 
    616 /* Error codes that correspond to negative error codes returned by
    617 find_fixedlength(). */
    618 
    619 static int fixed_length_errors[] =
    620   {
    621   ERR0,    /* Not an error */
    622   ERR0,    /* Not an error; -1 is used for "process later" */
    623   ERR25,   /* Lookbehind is not fixed length */
    624   ERR36,   /* \C in lookbehind is not allowed */
    625   ERR87,   /* Lookbehind is too long */
    626   ERR86,   /* Pattern too complicated */
    627   ERR70    /* Internal error: unknown opcode encountered */
    628   };
    629 
    630 /* This is a table of start-of-pattern options such as (*UTF) and settings such
    631 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
    632 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
    633 generic and always supported. */
    634 
    635 enum { PSO_OPT,     /* Value is an option bit */
    636        PSO_FLG,     /* Value is a flag bit */
    637        PSO_NL,      /* Value is a newline type */
    638        PSO_BSR,     /* Value is a \R type */
    639        PSO_LIMM,    /* Read integer value for match limit */
    640        PSO_LIMR };  /* Read integer value for recursion limit */
    641 
    642 typedef struct pso {
    643   const uint8_t *name;
    644   uint16_t length;
    645   uint16_t type;
    646   uint32_t value;
    647 } pso;
    648 
    649 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
    650 
    651 static pso pso_list[] = {
    652   { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
    653   { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
    654   { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
    655   { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
    656   { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
    657   { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
    658   { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
    659   { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
    660   { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
    661   { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
    662   { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMR, 0 },
    663   { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
    664   { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
    665   { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
    666   { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
    667   { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
    668   { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
    669   { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
    670 };
    671 
    672 /* This table is used when converting repeating opcodes into possessified
    673 versions as a result of an explicit possessive quantifier such as ++. A zero
    674 value means there is no possessified version - in those cases the item in
    675 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
    676 because all relevant opcodes are less than that. */
    677 
    678 static const uint8_t opcode_possessify[] = {   /* One entry per opcode, in opcode-number order */
    679   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15: no possessive form */
    680   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31: no possessive form */
    681 
    682   0,                       /* NOTI */
    683   OP_POSSTAR, 0,           /* STAR, MINSTAR */
    684   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
    685   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
    686   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
    687   0,                       /* EXACT */
    688   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO}: already possessive */
    689 
    690   OP_POSSTARI, 0,          /* STARI, MINSTARI */
    691   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
    692   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
    693   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
    694   0,                       /* EXACTI */
    695   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI}: already possessive */
    696 
    697   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
    698   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
    699   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
    700   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
    701   0,                       /* NOTEXACT */
    702   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO}: already possessive */
    703 
    704   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
    705   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
    706   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
    707   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
    708   0,                       /* NOTEXACTI */
    709   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI}: already possessive */
    710 
    711   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
    712   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
    713   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
    714   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
    715   0,                       /* TYPEEXACT */
    716   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO}: already possessive */
    717 
    718   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
    719   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
    720   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
    721   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
    722   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE}: already possessive */
    723 
    724   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
    725   0, 0,                    /* REF, REFI */
    726   0, 0,                    /* DNREF, DNREFI */
    727   0, 0                     /* RECURSE, CALLOUT (the table stops here) */
    728 };
    729 
    730 
    731 
    732 /*************************************************
    733 *               Copy compiled code               *
    734 *************************************************/
    735 
    736 /* Compiled JIT code cannot be copied, so the new compiled block has no
    737 associated JIT data. */
    738 
    739 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
    740 pcre2_code_copy(const pcre2_code *code)
    741 {
    742 PCRE2_SIZE* ref_count;
    743 pcre2_code *newcode;
    744 
    745 if (code == NULL) return NULL;
    746 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
    747 if (newcode == NULL) return NULL;
    748 memcpy(newcode, code, code->blocksize);
    749 newcode->executable_jit = NULL;
    750 
    751 /* If the code is one that has been deserialized, increment the reference count
    752 in the decoded tables. */
    753 
    754 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
    755   {
    756   ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
    757   (*ref_count)++;
    758   }
    759 
    760 return newcode;
    761 }
    762 
    763 
    764 
    765 /*************************************************
    766 *               Free compiled code               *
    767 *************************************************/
    768 
    769 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
    770 pcre2_code_free(pcre2_code *code)
    771 {
    772 PCRE2_SIZE* ref_count;
    773 
    774 if (code != NULL)
    775   {
    776   if (code->executable_jit != NULL)
    777     PRIV(jit_free)(code->executable_jit, &code->memctl);
    778 
    779   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
    780     {
    781     /* Decoded tables belong to the codes after deserialization, and they must
    782     be freed when there are no more reference to them. The *ref_count should
    783     always be > 0. */
    784 
    785     ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
    786     if (*ref_count > 0)
    787       {
    788       (*ref_count)--;
    789       if (*ref_count == 0)
    790         code->memctl.free((void *)code->tables, code->memctl.memory_data);
    791       }
    792     }
    793 
    794   code->memctl.free(code, code->memctl.memory_data);
    795   }
    796 }
    797 
    798 
    799 
    800 /*************************************************
    801 *        Insert an automatic callout point       *
    802 *************************************************/
    803 
    804 /* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert
    805 callout points before each pattern item.
    806 
    807 Arguments:
    808   code           current code pointer
    809   ptr            current pattern pointer
    810   cb             general compile-time data
    811 
    812 Returns:         new code pointer
    813 */
    814 
    815 static PCRE2_UCHAR *
    816 auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb)
    817 {
    818 code[0] = OP_CALLOUT;
    819 PUT(code, 1, ptr - cb->start_pattern);  /* Pattern offset */
    820 PUT(code, 1 + LINK_SIZE, 0);            /* Default length */
    821 code[1 + 2*LINK_SIZE] = 255;
    822 return code + PRIV(OP_lengths)[OP_CALLOUT];
    823 }
    824 
    825 
    826 
    827 /*************************************************
    828 *         Complete a callout item                *
    829 *************************************************/
    830 
    831 /* A callout item contains the length of the next item in the pattern, which
    832 we can't fill in till after we have reached the relevant point. This is used
    833 for both automatic and manual callouts.
    834 
    835 Arguments:
    836   previous_callout   points to previous callout item
    837   ptr                current pattern pointer
    838   cb                 general compile-time data
    839 
    840 Returns:             nothing
    841 */
    842 
    843 static void
    844 complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
    845   compile_block *cb)
    846 {
    847 size_t length = (size_t)(ptr - cb->start_pattern - GET(previous_callout, 1));
    848 PUT(previous_callout, 1 + LINK_SIZE, length);
    849 }
    850 
    851 
    852 
    853 /*************************************************
    854 *        Find the fixed length of a branch       *
    855 *************************************************/
    856 
    857 /* Scan a branch and compute the fixed length of subject that will match it, if
    858 the length is fixed. This is needed for dealing with lookbehind assertions. In
    859 UTF mode, the result is in code units rather than bytes. The branch is
    860 temporarily terminated with OP_END when this function is called.
    861 
    862 This function is called when a lookbehind assertion is encountered, so that if
    863 it fails, the error message can point to the correct place in the pattern.
    864 However, we cannot do this when the assertion contains subroutine calls,
    865 because they can be forward references. We solve this by remembering this case
    866 and doing the check at the end; a flag specifies which mode we are running in.
    867 
    868 Lookbehind lengths are held in 16-bit fields and the maximum value is defined
    869 as LOOKBEHIND_MAX.
    870 
    871 Arguments:
    872   code        points to the start of the pattern (the bracket)
    873   utf         TRUE in UTF mode
    874   atend       TRUE if called when the pattern is complete
    875   cb          the "compile data" structure
    876   recurses    chain of recurse_check to catch mutual recursion
    877   countptr    pointer to counter, to catch over-complexity
    878 
    879 Returns:   if non-negative, the fixed length,
    880              or -1 if an OP_RECURSE item was encountered and atend is FALSE
    881              or -2 if there is no fixed length,
    882              or -3 if \C was encountered (in UTF mode only)
    883              or -4 if length is too long
    884              or -5 if regex is too complicated
    885              or -6 if an unknown opcode was encountered (internal error)
    886 */
    887 
    888 #define FFL_LATER           (-1)   /* OP_RECURSE seen while atend is FALSE */
    889 #define FFL_NOTFIXED        (-2)   /* The branch has no fixed length */
    890 #define FFL_BACKSLASHC      (-3)   /* OP_ANYBYTE (\C) was encountered */
    891 #define FFL_TOOLONG         (-4)   /* Length exceeds LOOKBEHIND_MAX */
    892 #define FFL_TOOCOMPLICATED  (-5)   /* The call counter went over its cap */
    893 #define FFL_UNKNOWNOP       (-6)   /* Opcode not handled by the switch below */
    894 
    895 static int
    896 find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
    897   recurse_check *recurses, int *countptr)
    898 {
    899 uint32_t length = 0xffffffffu;   /* Length shared by all branches; all-ones means unset */
    900 uint32_t group = 0;              /* Capture group number; stays 0 for other brackets */
    901 uint32_t groupinfo = 0;          /* Copy of cb->groupinfo[group], used as a result cache */
    902 recurse_check this_recurse;      /* Chain link used while scanning a called group */
    903 register uint32_t branchlength = 0;               /* Length of the branch being scanned */
    904 register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;  /* Scan pointer, past opcode and link */
    905 
    906 /* If this is a capturing group, we may have the answer cached, but we can only
    907 use this information if there are no (?| groups in the pattern, because
    908 otherwise group numbers are not unique. */
    909 
    910 if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
    911     *code == OP_SCBRAPOS)
    912   {
    913   group = GET2(cc, 0);
    914   cc += IMM2_SIZE;               /* Step over the group number */
    915   groupinfo = cb->groupinfo[group];
    916   if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0)
    917     {
    918     if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
    919     if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
    920       return groupinfo & GI_FIXED_LENGTH_MASK;
    921     }
    922   }
    923 
    924 /* A large and/or complex regex can take too long to process. This can happen
    925 more often when (?| groups are present in the pattern. The counter is shared
    926 by all nested calls through countptr. */
    927 if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED;
    928 
    929 /* Scan along the opcodes for this branch. If we get to the end of the
    930 branch, check the length against that of the other branches. */
    931 
    932 for (;;)
    933   {
    934   int d;                         /* Result of a nested call to this function */
    935   PCRE2_UCHAR *ce, *cs;          /* End and start of a group called by OP_RECURSE */
    936   register PCRE2_UCHAR op = *cc;
    937 
    938   if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG;
    939 
    940   switch (op)
    941     {
    942     /* We only need to continue for OP_CBRA (normal capturing bracket) and
    943     OP_BRA (normal non-capturing bracket) because the other variants of these
    944     opcodes are all concerned with unlimited repeated groups, which of course
    945     are not of fixed length. */
    946 
    947     case OP_CBRA:
    948     case OP_BRA:
    949     case OP_ONCE:
    950     case OP_ONCE_NC:
    951     case OP_COND:
    952     d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
    953     if (d < 0) return d;                          /* Pass any FFL_xxx code upwards */
    954     branchlength += (uint32_t)d;
    955     do cc += GET(cc, 1); while (*cc == OP_ALT);   /* Move past every alternative */
    956     cc += 1 + LINK_SIZE;                          /* And past the closing item */
    957     break;
    958 
    959     /* Reached end of a branch; if it's a ket it is the end of a nested call.
    960     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
    961     an ALT. If it is END it's the end of the outer call. All can be handled by
    962     the same code. Note that we must not include the OP_KETRxxx opcodes here,
    963     because they all imply an unlimited repeat. */
    964 
    965     case OP_ALT:
    966     case OP_KET:
    967     case OP_END:
    968     case OP_ACCEPT:
    969     case OP_ASSERT_ACCEPT:
    970     if (length == 0xffffffffu) length = branchlength;   /* First branch sets the length */
    971       else if (length != branchlength) goto ISNOTFIXED; /* Branches differ in length */
    972     if (*cc != OP_ALT)
    973       {
    974       if (group > 0)
    975         {
    976         groupinfo |= (uint32_t)(GI_SET_FIXED_LENGTH | length);  /* Cache the answer */
    977         cb->groupinfo[group] = groupinfo;
    978         }
    979       return (int)length;
    980       }
    981     cc += 1 + LINK_SIZE;
    982     branchlength = 0;            /* Start measuring the next alternative */
    983     break;
    984 
    985     /* A true recursion implies not fixed length, but a subroutine call may
    986     be OK. If the subroutine is a forward reference, we can't deal with
    987     it until the end of the pattern, so return FFL_LATER. */
    988 
    989     case OP_RECURSE:
    990     if (!atend) return FFL_LATER;
    991     cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
    992     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
    993     if (cc > cs && cc < ce) goto ISNOTFIXED;          /* Recursion */
    994     else   /* Check for mutual recursion */
    995       {
    996       recurse_check *r = recurses;
    997       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    998       if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
    999       }
   1000     this_recurse.prev = recurses;       /* Add the called group to the chain */
   1001     this_recurse.group = cs;
   1002     d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
   1003     if (d < 0) return d;
   1004     branchlength += (uint32_t)d;
   1005     cc += 1 + LINK_SIZE;
   1006     break;
   1007 
   1008     /* Skip over assertive subpatterns. Note that we must increment cc by
   1009     1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive
   1010     situation this assertion may be the one that is ultimately being checked
   1011     for having a fixed length, in which case its terminating OP_KET will have
   1012     been temporarily replaced by OP_END. */
   1013 
   1014     case OP_ASSERT:
   1015     case OP_ASSERT_NOT:
   1016     case OP_ASSERTBACK:
   1017     case OP_ASSERTBACK_NOT:
   1018     do cc += GET(cc, 1); while (*cc == OP_ALT);
   1019     cc += 1 + LINK_SIZE;
   1020     break;
   1021 
   1022     /* Skip over things that don't match chars. These four have an argument
   1023     whose size, held in cc[1], is added to the basic opcode length. */
   1024     case OP_MARK:
   1025     case OP_PRUNE_ARG:
   1026     case OP_SKIP_ARG:
   1027     case OP_THEN_ARG:
   1028     cc += cc[1] + PRIV(OP_lengths)[*cc];
   1029     break;
   1030 
   1031     case OP_CALLOUT:
   1032     case OP_CIRC:
   1033     case OP_CIRCM:
   1034     case OP_CLOSE:
   1035     case OP_COMMIT:
   1036     case OP_CREF:
   1037     case OP_FALSE:
   1038     case OP_TRUE:
   1039     case OP_DNCREF:
   1040     case OP_DNRREF:
   1041     case OP_DOLL:
   1042     case OP_DOLLM:
   1043     case OP_EOD:
   1044     case OP_EODN:
   1045     case OP_FAIL:
   1046     case OP_NOT_WORD_BOUNDARY:
   1047     case OP_PRUNE:
   1048     case OP_REVERSE:
   1049     case OP_RREF:
   1050     case OP_SET_SOM:
   1051     case OP_SKIP:
   1052     case OP_SOD:
   1053     case OP_SOM:
   1054     case OP_THEN:
   1055     case OP_WORD_BOUNDARY:
   1056     cc += PRIV(OP_lengths)[*cc];   /* These contribute nothing to the length */
   1057     break;
   1058 
   1059     case OP_CALLOUT_STR:
   1060     cc += GET(cc, 1 + 2*LINK_SIZE);   /* The item's size is read from the item itself */
   1061     break;
   1062 
   1063     /* Handle literal characters */
   1064 
   1065     case OP_CHAR:
   1066     case OP_CHARI:
   1067     case OP_NOT:
   1068     case OP_NOTI:
   1069     branchlength++;
   1070     cc += 2;                       /* The opcode and one code unit */
   1071 #ifdef SUPPORT_UNICODE
   1072     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
   1073 #endif
   1074     break;
   1075 
   1076     /* Handle exact repetitions. The count is already in characters, but we
   1077     need to skip over a multibyte character in UTF8 mode.  */
   1078 
   1079     case OP_EXACT:
   1080     case OP_EXACTI:
   1081     case OP_NOTEXACT:
   1082     case OP_NOTEXACTI:
   1083     branchlength += GET2(cc,1);
   1084     cc += 2 + IMM2_SIZE;           /* The opcode, the count, and one code unit */
   1085 #ifdef SUPPORT_UNICODE
   1086     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
   1087 #endif
   1088     break;
   1089 
   1090     case OP_TYPEEXACT:
   1091     branchlength += GET2(cc,1);
   1092     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
   1093       cc += 2;                     /* A property type has two extra code units */
   1094     cc += 1 + IMM2_SIZE + 1;       /* The opcode, the count, and the type */
   1095     break;
   1096 
   1097     /* Handle single-char matchers */
   1098 
   1099     case OP_PROP:
   1100     case OP_NOTPROP:
   1101     cc += 2;                       /* Two extra code units follow the opcode */
   1102     /* Fall through */
   1103 
   1104     case OP_HSPACE:
   1105     case OP_VSPACE:
   1106     case OP_NOT_HSPACE:
   1107     case OP_NOT_VSPACE:
   1108     case OP_NOT_DIGIT:
   1109     case OP_DIGIT:
   1110     case OP_NOT_WHITESPACE:
   1111     case OP_WHITESPACE:
   1112     case OP_NOT_WORDCHAR:
   1113     case OP_WORDCHAR:
   1114     case OP_ANY:
   1115     case OP_ALLANY:
   1116     branchlength++;
   1117     cc++;
   1118     break;
   1119 
   1120     /* The single-byte matcher isn't allowed. This only happens in UTF-8 or
   1121     UTF-16 mode; otherwise \C is coded as OP_ALLANY. */
   1122 
   1123     case OP_ANYBYTE:
   1124     return FFL_BACKSLASHC;
   1125 
   1126     /* Check a class for variable quantification */
   1127 
   1128     case OP_CLASS:
   1129     case OP_NCLASS:
   1130 #ifdef SUPPORT_WIDE_CHARS
   1131     case OP_XCLASS:
   1132     /* The original code caused an unsigned overflow in 64 bit systems,
   1133     so now we use a conditional statement. */
   1134     if (op == OP_XCLASS)
   1135       cc += GET(cc, 1);            /* An XCLASS item stores its own size */
   1136     else
   1137       cc += PRIV(OP_lengths)[OP_CLASS];
   1138 #else
   1139     cc += PRIV(OP_lengths)[OP_CLASS];
   1140 #endif
   1141 
   1142     switch (*cc)                   /* Look at the item that follows the class */
   1143       {
   1144       case OP_CRSTAR:
   1145       case OP_CRMINSTAR:
   1146       case OP_CRPLUS:
   1147       case OP_CRMINPLUS:
   1148       case OP_CRQUERY:
   1149       case OP_CRMINQUERY:
   1150       case OP_CRPOSSTAR:
   1151       case OP_CRPOSPLUS:
   1152       case OP_CRPOSQUERY:
   1153       goto ISNOTFIXED;
   1154 
   1155       case OP_CRRANGE:
   1156       case OP_CRMINRANGE:
   1157       case OP_CRPOSRANGE:
   1158       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED;  /* The two limits differ */
   1159       branchlength += GET2(cc,1);
   1160       cc += 1 + 2 * IMM2_SIZE;
   1161       break;
   1162 
   1163       default:
   1164       branchlength++;              /* No repeat follows: count the class as one character */
   1165       }
   1166     break;
   1167 
   1168     /* Anything else is variable length */
   1169 
   1170     case OP_ANYNL:
   1171     case OP_BRAMINZERO:
   1172     case OP_BRAPOS:
   1173     case OP_BRAPOSZERO:
   1174     case OP_BRAZERO:
   1175     case OP_CBRAPOS:
   1176     case OP_EXTUNI:
   1177     case OP_KETRMAX:
   1178     case OP_KETRMIN:
   1179     case OP_KETRPOS:
   1180     case OP_MINPLUS:
   1181     case OP_MINPLUSI:
   1182     case OP_MINQUERY:
   1183     case OP_MINQUERYI:
   1184     case OP_MINSTAR:
   1185     case OP_MINSTARI:
   1186     case OP_MINUPTO:
   1187     case OP_MINUPTOI:
   1188     case OP_NOTMINPLUS:
   1189     case OP_NOTMINPLUSI:
   1190     case OP_NOTMINQUERY:
   1191     case OP_NOTMINQUERYI:
   1192     case OP_NOTMINSTAR:
   1193     case OP_NOTMINSTARI:
   1194     case OP_NOTMINUPTO:
   1195     case OP_NOTMINUPTOI:
   1196     case OP_NOTPLUS:
   1197     case OP_NOTPLUSI:
   1198     case OP_NOTPOSPLUS:
   1199     case OP_NOTPOSPLUSI:
   1200     case OP_NOTPOSQUERY:
   1201     case OP_NOTPOSQUERYI:
   1202     case OP_NOTPOSSTAR:
   1203     case OP_NOTPOSSTARI:
   1204     case OP_NOTPOSUPTO:
   1205     case OP_NOTPOSUPTOI:
   1206     case OP_NOTQUERY:
   1207     case OP_NOTQUERYI:
   1208     case OP_NOTSTAR:
   1209     case OP_NOTSTARI:
   1210     case OP_NOTUPTO:
   1211     case OP_NOTUPTOI:
   1212     case OP_PLUS:
   1213     case OP_PLUSI:
   1214     case OP_POSPLUS:
   1215     case OP_POSPLUSI:
   1216     case OP_POSQUERY:
   1217     case OP_POSQUERYI:
   1218     case OP_POSSTAR:
   1219     case OP_POSSTARI:
   1220     case OP_POSUPTO:
   1221     case OP_POSUPTOI:
   1222     case OP_QUERY:
   1223     case OP_QUERYI:
   1224     case OP_REF:
   1225     case OP_REFI:
   1226     case OP_DNREF:
   1227     case OP_DNREFI:
   1228     case OP_SBRA:
   1229     case OP_SBRAPOS:
   1230     case OP_SCBRA:
   1231     case OP_SCBRAPOS:
   1232     case OP_SCOND:
   1233     case OP_SKIPZERO:
   1234     case OP_STAR:
   1235     case OP_STARI:
   1236     case OP_TYPEMINPLUS:
   1237     case OP_TYPEMINQUERY:
   1238     case OP_TYPEMINSTAR:
   1239     case OP_TYPEMINUPTO:
   1240     case OP_TYPEPLUS:
   1241     case OP_TYPEPOSPLUS:
   1242     case OP_TYPEPOSQUERY:
   1243     case OP_TYPEPOSSTAR:
   1244     case OP_TYPEPOSUPTO:
   1245     case OP_TYPEQUERY:
   1246     case OP_TYPESTAR:
   1247     case OP_TYPEUPTO:
   1248     case OP_UPTO:
   1249     case OP_UPTOI:
   1250     goto ISNOTFIXED;
   1251 
   1252     /* Catch unrecognized opcodes so that when new ones are added they
   1253     are not forgotten, as has happened in the past. */
   1254 
   1255     default:
   1256     return FFL_UNKNOWNOP;
   1257     }
   1258   }
   1259 /* Control never gets here except by goto. For a capturing group the negative result is cached before returning. */
   1260 
   1261 ISNOTFIXED:
   1262 if (group > 0)
   1263   {
   1264   groupinfo |= GI_NOT_FIXED_LENGTH;
   1265   cb->groupinfo[group] = groupinfo;
   1266   }
   1267 return FFL_NOTFIXED;
   1268 }
   1269 
   1270 
   1271 
   1272 /*************************************************
   1273 *      Find first significant op code            *
   1274 *************************************************/
   1275 
   1276 /* This is called by several functions that scan a compiled expression looking
   1277 for a fixed first character, or an anchoring op code etc. It skips over things
   1278 that do not influence this. For some calls, it makes sense to skip negative
   1279 forward and all backward assertions, and also the \b assertion; for others it
   1280 does not.
   1281 
   1282 Arguments:
   1283   code         pointer to the start of the group
   1284   skipassert   TRUE if certain assertions are to be skipped
   1285 
   1286 Returns:       pointer to the first significant opcode
   1287 */
   1288 
   1289 static const PCRE2_UCHAR*
   1290 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
   1291 {
   1292 for (;;)
   1293   {
   1294   switch ((int)*code)
   1295     {
   1296     case OP_ASSERT_NOT:
   1297     case OP_ASSERTBACK:
   1298     case OP_ASSERTBACK_NOT:
   1299     if (!skipassert) return code;
   1300     do code += GET(code, 1); while (*code == OP_ALT);
   1301     code += PRIV(OP_lengths)[*code];
   1302     break;
   1303 
   1304     case OP_WORD_BOUNDARY:
   1305     case OP_NOT_WORD_BOUNDARY:
   1306     if (!skipassert) return code;
   1307     /* Fall through */
   1308 
   1309     case OP_CALLOUT:
   1310     case OP_CREF:
   1311     case OP_DNCREF:
   1312     case OP_RREF:
   1313     case OP_DNRREF:
   1314     case OP_FALSE:
   1315     case OP_TRUE:
   1316     code += PRIV(OP_lengths)[*code];
   1317     break;
   1318 
   1319     case OP_CALLOUT_STR:
   1320     code += GET(code, 1 + 2*LINK_SIZE);
   1321     break;
   1322 
   1323     default:
   1324     return code;
   1325     }
   1326   }
   1327 /* Control never reaches here */
   1328 }
   1329 
   1330 
   1331 
   1332 /*************************************************
   1333 *    Scan compiled branch for non-emptiness      *
   1334 *************************************************/
   1335 
   1336 /* This function scans through a branch of a compiled pattern to see whether it
   1337 can match the empty string. It is called at the end of compiling to check the
   1338 entire pattern, and from compile_branch() when checking for an unlimited repeat
   1339 of a group that can match nothing. In the latter case it is called only when
   1340 doing the real compile, not during the pre-compile that measures the size of
   1341 the compiled pattern.
   1342 
   1343 Note that first_significant_code() skips over backward and negative forward
   1344 assertions when its final argument is TRUE. If we hit an unclosed bracket, we
   1345 return "empty" - this means we've struck an inner bracket whose current branch
   1346 will already have been scanned.
   1347 
   1348 Arguments:
   1349   code        points to start of search
   1350   endcode     points to where to stop
   1351   utf         TRUE if in UTF mode
   1352   cb          compile data
   1353   atend       TRUE if being called to check an entire pattern
   1354   recurses    chain of recurse_check to catch mutual recursion
   1355   countptr    pointer to count to catch over-complicated pattern
   1356 
   1357 Returns:      0 if what is matched cannot be empty
   1358               1 if what is matched could be empty
   1359              -1 if the pattern is too complicated
   1360 */
   1361 
   1362 #define CBE_NOTEMPTY          0    /* What is matched cannot be empty */
   1363 #define CBE_EMPTY             1    /* What is matched could be empty */
   1364 #define CBE_TOOCOMPLICATED  (-1)   /* The pattern is too complicated to analyse */
   1365 
   1366 
   1367 static int
   1368 could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
   1369   compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
   1370 {
   1371 uint32_t group = 0;
   1372 uint32_t groupinfo = 0;
   1373 register PCRE2_UCHAR c;
   1374 recurse_check this_recurse;
   1375 
   1376 /* If what we are checking has already been set as "could be empty", we know
   1377 the answer. */
   1378 
   1379 if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
   1380 
   1381 /* If this is a capturing group, we may have the answer cached, but we can only
   1382 use this information if there are no (?| groups in the pattern, because
   1383 otherwise group numbers are not unique. */
   1384 
   1385 if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
   1386     (*code == OP_CBRA || *code == OP_CBRAPOS))
   1387   {
   1388   group = GET2(code, 1 + LINK_SIZE);
   1389   groupinfo = cb->groupinfo[group];
   1390   if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
   1391     return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
   1392   }
   1393 
   1394 /* A large and/or complex regex can take too long to process. We have to assume
   1395 it can match an empty string. This can happen more often when (?| groups are
   1396 present in the pattern and the caching is disabled. Setting the cap at 1100
   1397 allows the test for more than 1023 capturing patterns to work. */
   1398 
   1399 if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
   1400 
   1401 /* Scan the opcodes for this branch. */
   1402 
   1403 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
   1404      code < endcode;
   1405      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
   1406   {
   1407   PCRE2_SPTR ccode;
   1408 
   1409   c = *code;
   1410 
   1411   /* Skip over forward assertions; the other assertions are skipped by
   1412   first_significant_code() with a TRUE final argument. */
   1413 
   1414   if (c == OP_ASSERT)
   1415     {
   1416     do code += GET(code, 1); while (*code == OP_ALT);
   1417     c = *code;
   1418     continue;
   1419     }
   1420 
   1421   /* For a recursion/subroutine call we can scan the recursion when this
   1422   function is called at the end, to check a complete pattern. Before then,
   1423   recursions just have the group number as their argument and in any case may
   1424   be forward references. In that situation, we return CBE_EMPTY, just in case.
   1425   It means that unlimited repeats of groups that contain recursions are always
   1426   treated as "could be empty" - which just adds a bit more processing time
   1427   because of the runtime check. */
   1428 
   1429   if (c == OP_RECURSE)
   1430     {
   1431     PCRE2_SPTR scode, endgroup;
   1432     BOOL empty_branch;
   1433 
   1434     if (!atend) goto ISTRUE;
   1435     scode = cb->start_code + GET(code, 1);
   1436     endgroup = scode;
   1437 
   1438     /* We need to detect whether this is a recursive call, as otherwise there
   1439     will be an infinite loop. If it is a recursion, just skip over it. Simple
   1440     recursions are easily detected. For mutual recursions we keep a chain on
   1441     the stack. */
   1442 
   1443     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
   1444     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
   1445     else
   1446       {
   1447       recurse_check *r = recurses;
   1448       for (r = recurses; r != NULL; r = r->prev)
   1449         if (r->group == scode) break;
   1450       if (r != NULL) continue;   /* Mutual recursion */
   1451       }
   1452 
   1453     /* Scan the referenced group, remembering it on the stack chain to detect
   1454     mutual recursions. */
   1455 
   1456     empty_branch = FALSE;
   1457     this_recurse.prev = recurses;
   1458     this_recurse.group = scode;
   1459 
   1460     do
   1461       {
   1462       int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
   1463         &this_recurse, countptr);
   1464       if (rc < 0) return rc;
   1465       if (rc > 0)
   1466         {
   1467         empty_branch = TRUE;
   1468         break;
   1469         }
   1470       scode += GET(scode, 1);
   1471       }
   1472     while (*scode == OP_ALT);
   1473 
   1474     if (!empty_branch) goto ISFALSE;  /* All branches are non-empty */
   1475     continue;
   1476     }
   1477 
   1478   /* Groups with zero repeats can of course be empty; skip them. */
   1479 
   1480   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
   1481       c == OP_BRAPOSZERO)
   1482     {
   1483     code += PRIV(OP_lengths)[c];
   1484     do code += GET(code, 1); while (*code == OP_ALT);
   1485     c = *code;
   1486     continue;
   1487     }
   1488 
   1489   /* A nested group that is already marked as "could be empty" can just be
   1490   skipped. */
   1491 
   1492   if (c == OP_SBRA  || c == OP_SBRAPOS ||
   1493       c == OP_SCBRA || c == OP_SCBRAPOS)
   1494     {
   1495     do code += GET(code, 1); while (*code == OP_ALT);
   1496     c = *code;
   1497     continue;
   1498     }
   1499 
   1500   /* For other groups, scan the branches. */
   1501 
   1502   if (c == OP_BRA  || c == OP_BRAPOS ||
   1503       c == OP_CBRA || c == OP_CBRAPOS ||
   1504       c == OP_ONCE || c == OP_ONCE_NC ||
   1505       c == OP_COND || c == OP_SCOND)
   1506     {
   1507     BOOL empty_branch;
   1508     if (GET(code, 1) == 0) goto ISTRUE;    /* Hit unclosed bracket */
   1509 
   1510     /* If a conditional group has only one branch, there is a second, implied,
   1511     empty branch, so just skip over the conditional, because it could be empty.
   1512     Otherwise, scan the individual branches of the group. */
   1513 
   1514     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
   1515       code += GET(code, 1);
   1516     else
   1517       {
   1518       empty_branch = FALSE;
   1519       do
   1520         {
   1521         if (!empty_branch)
   1522           {
   1523           int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
   1524             recurses, countptr);
   1525           if (rc < 0) return rc;
   1526           if (rc > 0) empty_branch = TRUE;
   1527           }
   1528         code += GET(code, 1);
   1529         }
   1530       while (*code == OP_ALT);
   1531       if (!empty_branch) goto ISFALSE;   /* All branches are non-empty */
   1532       }
   1533 
   1534     c = *code;
   1535     continue;
   1536     }
   1537 
   1538   /* Handle the other opcodes */
   1539 
   1540   switch (c)
   1541     {
   1542     /* Check for quantifiers after a class. XCLASS is used for classes that
   1543     cannot be represented just by a bit map. This includes negated single
   1544     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
   1545     actual length is stored in the compiled code, so we must update "code"
   1546     here. */
   1547 
   1548 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
   1549     case OP_XCLASS:
   1550     ccode = code += GET(code, 1);
   1551     goto CHECK_CLASS_REPEAT;
   1552 #endif
   1553 
   1554     case OP_CLASS:
   1555     case OP_NCLASS:
   1556     ccode = code + PRIV(OP_lengths)[OP_CLASS];
   1557 
   1558 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
   1559     CHECK_CLASS_REPEAT:
   1560 #endif
   1561 
   1562     switch (*ccode)
   1563       {
   1564       case OP_CRSTAR:            /* These could be empty; continue */
   1565       case OP_CRMINSTAR:
   1566       case OP_CRQUERY:
   1567       case OP_CRMINQUERY:
   1568       case OP_CRPOSSTAR:
   1569       case OP_CRPOSQUERY:
   1570       break;
   1571 
   1572       default:                   /* Non-repeat => class must match */
   1573       case OP_CRPLUS:            /* These repeats aren't empty */
   1574       case OP_CRMINPLUS:
   1575       case OP_CRPOSPLUS:
   1576       goto ISFALSE;
   1577 
   1578       case OP_CRRANGE:
   1579       case OP_CRMINRANGE:
   1580       case OP_CRPOSRANGE:
   1581       if (GET2(ccode, 1) > 0) goto ISFALSE;  /* Minimum > 0 */
   1582       break;
   1583       }
   1584     break;
   1585 
   1586     /* Opcodes that must match a character */
   1587 
   1588     case OP_ANY:
   1589     case OP_ALLANY:
   1590     case OP_ANYBYTE:
   1591 
   1592     case OP_PROP:
   1593     case OP_NOTPROP:
   1594     case OP_ANYNL:
   1595 
   1596     case OP_NOT_HSPACE:
   1597     case OP_HSPACE:
   1598     case OP_NOT_VSPACE:
   1599     case OP_VSPACE:
   1600     case OP_EXTUNI:
   1601 
   1602     case OP_NOT_DIGIT:
   1603     case OP_DIGIT:
   1604     case OP_NOT_WHITESPACE:
   1605     case OP_WHITESPACE:
   1606     case OP_NOT_WORDCHAR:
   1607     case OP_WORDCHAR:
   1608 
   1609     case OP_CHAR:
   1610     case OP_CHARI:
   1611     case OP_NOT:
   1612     case OP_NOTI:
   1613 
   1614     case OP_PLUS:
   1615     case OP_PLUSI:
   1616     case OP_MINPLUS:
   1617     case OP_MINPLUSI:
   1618 
   1619     case OP_NOTPLUS:
   1620     case OP_NOTPLUSI:
   1621     case OP_NOTMINPLUS:
   1622     case OP_NOTMINPLUSI:
   1623 
   1624     case OP_POSPLUS:
   1625     case OP_POSPLUSI:
   1626     case OP_NOTPOSPLUS:
   1627     case OP_NOTPOSPLUSI:
   1628 
   1629     case OP_EXACT:
   1630     case OP_EXACTI:
   1631     case OP_NOTEXACT:
   1632     case OP_NOTEXACTI:
   1633 
   1634     case OP_TYPEPLUS:
   1635     case OP_TYPEMINPLUS:
   1636     case OP_TYPEPOSPLUS:
   1637     case OP_TYPEEXACT:
   1638     goto ISFALSE;
   1639 
   1640     /* These are going to continue, as they may be empty, but we have to
   1641     fudge the length for the \p and \P cases. */
   1642 
   1643     case OP_TYPESTAR:
   1644     case OP_TYPEMINSTAR:
   1645     case OP_TYPEPOSSTAR:
   1646     case OP_TYPEQUERY:
   1647     case OP_TYPEMINQUERY:
   1648     case OP_TYPEPOSQUERY:
   1649     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   1650     break;
   1651 
   1652     /* Same for these */
   1653 
   1654     case OP_TYPEUPTO:
   1655     case OP_TYPEMINUPTO:
   1656     case OP_TYPEPOSUPTO:
   1657     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   1658       code += 2;
   1659     break;
   1660 
   1661     /* End of branch */
   1662 
   1663     case OP_KET:
   1664     case OP_KETRMAX:
   1665     case OP_KETRMIN:
   1666     case OP_KETRPOS:
   1667     case OP_ALT:
   1668     goto ISTRUE;
   1669 
   1670     /* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY,
   1671     POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative
   1672     versions may be followed by a multibyte character. */
   1673 
   1674 #ifdef MAYBE_UTF_MULTI
   1675     case OP_STAR:
   1676     case OP_STARI:
   1677     case OP_NOTSTAR:
   1678     case OP_NOTSTARI:
   1679 
   1680     case OP_MINSTAR:
   1681     case OP_MINSTARI:
   1682     case OP_NOTMINSTAR:
   1683     case OP_NOTMINSTARI:
   1684 
   1685     case OP_POSSTAR:
   1686     case OP_POSSTARI:
   1687     case OP_NOTPOSSTAR:
   1688     case OP_NOTPOSSTARI:
   1689 
   1690     case OP_QUERY:
   1691     case OP_QUERYI:
   1692     case OP_NOTQUERY:
   1693     case OP_NOTQUERYI:
   1694 
   1695     case OP_MINQUERY:
   1696     case OP_MINQUERYI:
   1697     case OP_NOTMINQUERY:
   1698     case OP_NOTMINQUERYI:
   1699 
   1700     case OP_POSQUERY:
   1701     case OP_POSQUERYI:
   1702     case OP_NOTPOSQUERY:
   1703     case OP_NOTPOSQUERYI:
   1704     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
   1705     break;
   1706 
   1707     case OP_UPTO:
   1708     case OP_UPTOI:
   1709     case OP_NOTUPTO:
   1710     case OP_NOTUPTOI:
   1711 
   1712     case OP_MINUPTO:
   1713     case OP_MINUPTOI:
   1714     case OP_NOTMINUPTO:
   1715     case OP_NOTMINUPTOI:
   1716 
   1717     case OP_POSUPTO:
   1718     case OP_POSUPTOI:
   1719     case OP_NOTPOSUPTO:
   1720     case OP_NOTPOSUPTOI:
   1721     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
   1722     break;
   1723 #endif  /* MAYBE_UTF_MULTI */
   1724 
   1725     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
   1726     string. */
   1727 
   1728     case OP_MARK:
   1729     case OP_PRUNE_ARG:
   1730     case OP_SKIP_ARG:
   1731     case OP_THEN_ARG:
   1732     code += code[1];
   1733     break;
   1734 
   1735     /* None of the remaining opcodes are required to match a character. */
   1736 
   1737     default:
   1738     break;
   1739     }
   1740   }
   1741 
   1742 ISTRUE:
   1743 groupinfo |= GI_COULD_BE_EMPTY;
   1744 
   1745 ISFALSE:
   1746 if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
   1747 
   1748 return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
   1749 }
   1750 
   1751 
   1752 
   1753 /*************************************************
   1754 *            Check for counted repeat            *
   1755 *************************************************/
   1756 
   1757 /* This function is called when a '{' is encountered in a place where it might
   1758 start a quantifier. It looks ahead to see if it really is a quantifier, that
   1759 is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits.
   1760 
   1761 Argument:   pointer to the first char after '{'
   1762 Returns:    TRUE or FALSE
   1763 */
   1764 
   1765 static BOOL
   1766 is_counted_repeat(PCRE2_SPTR p)
   1767 {
   1768 if (!IS_DIGIT(*p)) return FALSE;
   1769 p++;
   1770 while (IS_DIGIT(*p)) p++;
   1771 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
   1772 
   1773 if (*p++ != CHAR_COMMA) return FALSE;
   1774 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
   1775 
   1776 if (!IS_DIGIT(*p)) return FALSE;
   1777 p++;
   1778 while (IS_DIGIT(*p)) p++;
   1779 
   1780 return (*p == CHAR_RIGHT_CURLY_BRACKET);
   1781 }
   1782 
   1783 
   1784 
   1785 /*************************************************
   1786 *            Handle escapes                      *
   1787 *************************************************/
   1788 
   1789 /* This function is called when a \ has been encountered. It either returns a
   1790 positive value for a simple escape such as \d, or 0 for a data character, which
   1791 is placed in chptr. A backreference to group n is returned as negative n. On
   1792 entry, ptr is pointing at the \. On exit, it points the final code unit of the
   1793 escape sequence.
   1794 
   1795 This function is also called from pcre2_substitute() to handle escape sequences
   1796 in replacement strings. In this case, the cb argument is NULL, and only
   1797 sequences that define a data character are recognised. The isclass argument is
   1798 not relevant, but the options argument is the final value of the compiled
   1799 pattern's options.
   1800 
   1801 There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
   1802 processed, it is replaced by a nested alternative sequence. If this contains a
   1803 backslash (which is usually does), ptrend does not point to its end - it still
   1804 points to the end of the whole pattern. However, we can detect this case
   1805 because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
   1806 terminated and there are only ever two levels of nesting.
   1807 
   1808 Arguments:
   1809   ptrptr         points to the input position pointer
   1810   ptrend         points to the end of the input
   1811   chptr          points to a returned data character
   1812   errorcodeptr   points to the errorcode variable (containing zero)
   1813   options        the current options bits
   1814   isclass        TRUE if inside a character class
   1815   cb             compile data block
   1816 
   1817 Returns:         zero => a data character
   1818                  positive => a special escape sequence
   1819                  negative => a back reference
   1820                  on error, errorcodeptr is set non-zero
   1821 */
   1822 
   1823 int
   1824 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
   1825   int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
   1826 {
   1827 BOOL utf = (options & PCRE2_UTF) != 0;
   1828 PCRE2_SPTR ptr = *ptrptr + 1;
   1829 register uint32_t c, cc;
   1830 int escape = 0;
   1831 int i;
   1832 
   1833 /* Find the end of a nested insert. */
   1834 
   1835 if (cb != NULL && cb->nestptr[0] != NULL)
   1836   ptrend = ptr + PRIV(strlen)(ptr);
   1837 
   1838 /* If backslash is at the end of the string, it's an error. */
   1839 
   1840 if (ptr >= ptrend)
   1841   {
   1842   *errorcodeptr = ERR1;
   1843   return 0;
   1844   }
   1845 
   1846 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
   1847 ptr--;                          /* Set pointer back to the last code unit */
   1848 
   1849 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
   1850 value test saves a memory lookup for code points outside the alphanumeric
   1851 range. Otherwise, do a table lookup. A non-zero result is something that can be
   1852 returned immediately. Otherwise further processing is required. */
   1853 
   1854 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
   1855 
   1856 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
   1857   {
   1858   if (i > 0) c = (uint32_t)i; else  /* Positive is a data character */
   1859     {
   1860     escape = -i;                    /* Else return a special escape */
   1861     if (escape == ESC_P || escape == ESC_p || escape == ESC_X)
   1862       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
   1863     }
   1864   }
   1865 
   1866 /* Escapes that need further processing, including those that are unknown.
   1867 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
   1868 when BSUX is set). */
   1869 
   1870 else
   1871   {
   1872   PCRE2_SPTR oldptr;
   1873   BOOL braced, negated, overflow;
   1874   unsigned int s;
   1875 
   1876   /* Filter calls from pcre2_substitute(). */
   1877 
   1878   if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
   1879       (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
   1880     {
   1881     *errorcodeptr = ERR3;
   1882     return 0;
   1883     }
   1884 
   1885   switch (c)
   1886     {
   1887     /* A number of Perl escapes are not handled by PCRE. We give an explicit
   1888     error. */
   1889 
   1890     case CHAR_l:
   1891     case CHAR_L:
   1892     *errorcodeptr = ERR37;
   1893     break;
   1894 
   1895     /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
   1896     specially, \u must be followed by four hex digits. Otherwise it is a
   1897     lowercase u letter. */
   1898 
   1899     case CHAR_u:
   1900     if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
   1901       {
   1902       uint32_t xc;
   1903       if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
   1904       if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
   1905       cc = (cc << 4) | xc;
   1906       if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
   1907       cc = (cc << 4) | xc;
   1908       if ((xc = XDIGIT(ptr[4])) == 0xff) break;  /* Not a hex digit */
   1909       c = (cc << 4) | xc;
   1910       ptr += 4;
   1911       if (utf)
   1912         {
   1913         if (c > 0x10ffffU) *errorcodeptr = ERR77;
   1914           else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   1915         }
   1916       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
   1917       }
   1918     break;
   1919 
   1920     case CHAR_U:
   1921     /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
   1922     upper case letter. */
   1923     if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
   1924     break;
   1925 
   1926     /* In a character class, \g is just a literal "g". Outside a character
   1927     class, \g must be followed by one of a number of specific things:
   1928 
   1929     (1) A number, either plain or braced. If positive, it is an absolute
   1930     backreference. If negative, it is a relative backreference. This is a Perl
   1931     5.10 feature.
   1932 
   1933     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
   1934     is part of Perl's movement towards a unified syntax for back references. As
   1935     this is synonymous with \k{name}, we fudge it up by pretending it really
   1936     was \k.
   1937 
   1938     (3) For Oniguruma compatibility we also support \g followed by a name or a
   1939     number either in angle brackets or in single quotes. However, these are
   1940     (possibly recursive) subroutine calls, _not_ backreferences. Just return
   1941     the ESC_g code (cf \k). */
   1942 
   1943     case CHAR_g:
   1944     if (isclass) break;
   1945     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
   1946       {
   1947       escape = ESC_g;
   1948       break;
   1949       }
   1950 
   1951     /* Handle the Perl-compatible cases */
   1952 
   1953     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
   1954       {
   1955       PCRE2_SPTR p;
   1956       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
   1957         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
   1958       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
   1959         {
   1960         escape = ESC_k;
   1961         break;
   1962         }
   1963       braced = TRUE;
   1964       ptr++;
   1965       }
   1966     else braced = FALSE;
   1967 
   1968     if (ptr[1] == CHAR_MINUS)
   1969       {
   1970       negated = TRUE;
   1971       ptr++;
   1972       }
   1973     else negated = FALSE;
   1974 
   1975     /* The integer range is limited by the machine's int representation. */
   1976     s = 0;
   1977     overflow = FALSE;
   1978     while (IS_DIGIT(ptr[1]))
   1979       {
   1980       if (s > INT_MAX / 10 - 1) /* Integer overflow */
   1981         {
   1982         overflow = TRUE;
   1983         break;
   1984         }
   1985       s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
   1986       }
   1987     if (overflow) /* Integer overflow */
   1988       {
   1989       while (IS_DIGIT(ptr[1])) ptr++;
   1990       *errorcodeptr = ERR61;
   1991       break;
   1992       }
   1993 
   1994     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
   1995       {
   1996       *errorcodeptr = ERR57;
   1997       break;
   1998       }
   1999 
   2000     if (s == 0)
   2001       {
   2002       *errorcodeptr = ERR58;
   2003       break;
   2004       }
   2005 
   2006     if (negated)
   2007       {
   2008       if (s > cb->bracount)
   2009         {
   2010         *errorcodeptr = ERR15;
   2011         break;
   2012         }
   2013       s = cb->bracount - (s - 1);
   2014       }
   2015 
   2016     escape = -(int)s;
   2017     break;
   2018 
   2019     /* The handling of escape sequences consisting of a string of digits
   2020     starting with one that is not zero is not straightforward. Perl has changed
   2021     over the years. Nowadays \g{} for backreferences and \o{} for octal are
   2022     recommended to avoid the ambiguities in the old syntax.
   2023 
   2024     Outside a character class, the digits are read as a decimal number. If the
   2025     number is less than 10, or if there are that many previous extracting left
   2026     brackets, it is a back reference. Otherwise, up to three octal digits are
   2027     read to form an escaped character code. Thus \123 is likely to be octal 123
   2028     (cf \0123, which is octal 012 followed by the literal 3).
   2029 
   2030     Inside a character class, \ followed by a digit is always either a literal
   2031     8 or 9 or an octal number. */
   2032 
   2033     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
   2034     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
   2035 
   2036     if (!isclass)
   2037       {
   2038       oldptr = ptr;
   2039       /* The integer range is limited by the machine's int representation. */
   2040       s = c - CHAR_0;
   2041       overflow = FALSE;
   2042       while (IS_DIGIT(ptr[1]))
   2043         {
   2044         if (s > INT_MAX / 10 - 1) /* Integer overflow */
   2045           {
   2046           overflow = TRUE;
   2047           break;
   2048           }
   2049         s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
   2050         }
   2051       if (overflow) /* Integer overflow */
   2052         {
   2053         while (IS_DIGIT(ptr[1])) ptr++;
   2054         *errorcodeptr = ERR61;
   2055         break;
   2056         }
   2057 
   2058       /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
   2059       are octal escapes if there are not that many previous captures. */
   2060 
   2061       if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount)
   2062         {
   2063         escape = -(int)s;     /* Indicates a back reference */
   2064         break;
   2065         }
   2066       ptr = oldptr;      /* Put the pointer back and fall through */
   2067       }
   2068 
   2069     /* Handle a digit following \ when the number is not a back reference, or
   2070     we are within a character class. If the first digit is 8 or 9, Perl used to
   2071     generate a binary zero byte and then treat the digit as a following
   2072     literal. At least by Perl 5.18 this changed so as not to insert the binary
   2073     zero. */
   2074 
   2075     if ((c = *ptr) >= CHAR_8) break;
   2076 
   2077     /* Fall through with a digit less than 8 */
   2078 
   2079     /* \0 always starts an octal number, but we may drop through to here with a
   2080     larger first octal digit. The original code used just to take the least
   2081     significant 8 bits of octal numbers (I think this is what early Perls used
   2082     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
   2083     but no more than 3 octal digits. */
   2084 
   2085     case CHAR_0:
   2086     c -= CHAR_0;
   2087     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
   2088         c = c * 8 + *(++ptr) - CHAR_0;
   2089 #if PCRE2_CODE_UNIT_WIDTH == 8
   2090     if (!utf && c > 0xff) *errorcodeptr = ERR51;
   2091 #endif
   2092     break;
   2093 
   2094     /* \o is a relatively new Perl feature, supporting a more general way of
   2095     specifying character codes in octal. The only supported form is \o{ddd}. */
   2096 
   2097     case CHAR_o:
   2098     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
   2099     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
   2100       {
   2101       ptr += 2;
   2102       c = 0;
   2103       overflow = FALSE;
   2104       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
   2105         {
   2106         cc = *ptr++;
   2107         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
   2108 #if PCRE2_CODE_UNIT_WIDTH == 32
   2109         if (c >= 0x20000000l) { overflow = TRUE; break; }
   2110 #endif
   2111         c = (c << 3) + (cc - CHAR_0);
   2112 #if PCRE2_CODE_UNIT_WIDTH == 8
   2113         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
   2114 #elif PCRE2_CODE_UNIT_WIDTH == 16
   2115         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
   2116 #elif PCRE2_CODE_UNIT_WIDTH == 32
   2117         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
   2118 #endif
   2119         }
   2120       if (overflow)
   2121         {
   2122         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
   2123         *errorcodeptr = ERR34;
   2124         }
   2125       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   2126         {
   2127         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   2128         }
   2129       else *errorcodeptr = ERR64;
   2130       }
   2131     break;
   2132 
   2133     /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
   2134     two hexadecimal digits. Otherwise it is a lowercase x letter. */
   2135 
   2136     case CHAR_x:
   2137     if ((options & PCRE2_ALT_BSUX) != 0)
   2138       {
   2139       uint32_t xc;
   2140       if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
   2141       if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
   2142       c = (cc << 4) | xc;
   2143       ptr += 2;
   2144       }    /* End PCRE2_ALT_BSUX handling */
   2145 
   2146     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
   2147     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
   2148     digits. If not, { used to be treated as a data character. However, Perl
   2149     seems to read hex digits up to the first non-such, and ignore the rest, so
   2150     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
   2151     now gives an error. */
   2152 
   2153     else
   2154       {
   2155       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
   2156         {
   2157         ptr += 2;
   2158         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   2159           {
   2160           *errorcodeptr = ERR78;
   2161           break;
   2162           }
   2163         c = 0;
   2164         overflow = FALSE;
   2165 
   2166         while ((cc = XDIGIT(*ptr)) != 0xff)
   2167           {
   2168           ptr++;
   2169           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
   2170 #if PCRE2_CODE_UNIT_WIDTH == 32
   2171           if (c >= 0x10000000l) { overflow = TRUE; break; }
   2172 #endif
   2173           c = (c << 4) | cc;
   2174           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
   2175             {
   2176             overflow = TRUE;
   2177             break;
   2178             }
   2179           }
   2180 
   2181         if (overflow)
   2182           {
   2183           while (XDIGIT(*ptr) != 0xff) ptr++;
   2184           *errorcodeptr = ERR34;
   2185           }
   2186         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   2187           {
   2188           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   2189           }
   2190 
   2191         /* If the sequence of hex digits does not end with '}', give an error.
   2192         We used just to recognize this construct and fall through to the normal
   2193         \x handling, but nowadays Perl gives an error, which seems much more
   2194         sensible, so we do too. */
   2195 
   2196         else *errorcodeptr = ERR67;
   2197         }   /* End of \x{} processing */
   2198 
   2199       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
   2200 
   2201       else
   2202         {
   2203         c = 0;
   2204         if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
   2205         ptr++;
   2206         c = cc;
   2207         if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
   2208         ptr++;
   2209         c = (c << 4) | cc;
   2210         }     /* End of \xdd handling */
   2211       }       /* End of Perl-style \x handling */
   2212     break;
   2213 
   2214     /* The handling of \c is different in ASCII and EBCDIC environments. In an
   2215     ASCII (or Unicode) environment, an error is given if the character
   2216     following \c is not a printable ASCII character. Otherwise, the following
   2217     character is upper-cased if it is a letter, and after that the 0x40 bit is
   2218     flipped. The result is the value of the escape.
   2219 
   2220     In an EBCDIC environment the handling of \c is compatible with the
   2221     specification in the perlebcdic document. The following character must be
   2222     a letter or one of small number of special characters. These provide a
   2223     means of defining the character values 0-31.
   2224 
   2225     For testing the EBCDIC handling of \c in an ASCII environment, recognize
   2226     the EBCDIC value of 'c' explicitly. */
   2227 
   2228 #if defined EBCDIC && 'a' != 0x81
   2229     case 0x83:
   2230 #else
   2231     case CHAR_c:
   2232 #endif
   2233 
   2234     c = *(++ptr);
   2235     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
   2236     if (c == CHAR_NULL && ptr >= ptrend)
   2237       {
   2238       *errorcodeptr = ERR2;
   2239       break;
   2240       }
   2241 
   2242     /* Handle \c in an ASCII/Unicode environment. */
   2243 
   2244 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
   2245     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
   2246       {
   2247       *errorcodeptr = ERR68;
   2248       break;
   2249       }
   2250     c ^= 0x40;
   2251 
   2252     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
   2253     255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC
   2254     encoding. (This is the way Perl indicates that it handles \c?.) The other
   2255     valid sequences correspond to a list of specific characters. */
   2256 
   2257 #else
   2258     if (c == CHAR_QUESTION_MARK)
   2259       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
   2260     else
   2261       {
   2262       for (i = 0; i < 32; i++)
   2263         {
   2264         if (c == ebcdic_escape_c[i]) break;
   2265         }
   2266       if (i < 32) c = i; else *errorcodeptr = ERR68;
   2267       }
   2268 #endif  /* EBCDIC */
   2269 
   2270     break;
   2271 
   2272     /* Any other alphanumeric following \ is an error. Perl gives an error only
   2273     if in warning mode, but PCRE doesn't have a warning mode. */
   2274 
   2275     default:
   2276     *errorcodeptr = ERR3;
   2277     break;
   2278     }
   2279   }
   2280 
   2281 /* Perl supports \N{name} for character names, as well as plain \N for "not
   2282 newline". PCRE does not support \N{name}. However, it does support
   2283 quantification such as \N{2,3}. */
   2284 
   2285 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
   2286      !is_counted_repeat(ptr+2))
   2287   *errorcodeptr = ERR37;
   2288 
   2289 /* If PCRE2_UCP is set, we change the values for \d etc. */
   2290 
   2291 if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
   2292   escape += (ESC_DU - ESC_D);
   2293 
   2294 /* Set the pointer to the final character before returning. */
   2295 
   2296 *ptrptr = ptr;
   2297 *chptr = c;
   2298 return escape;
   2299 }
   2300 
   2301 
   2302 
   2303 #ifdef SUPPORT_UNICODE
   2304 /*************************************************
   2305 *               Handle \P and \p                 *
   2306 *************************************************/
   2307 
   2308 /* This function is called after \P or \p has been encountered, provided that
   2309 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
   2310 contents of ptrptr are pointing at the P or p. On exit, it is left pointing at
   2311 the final code unit of the escape sequence.
   2312 
   2313 Arguments:
   2314   ptrptr         the pattern position pointer
   2315   negptr         a boolean that is set TRUE for negation else FALSE
   2316   ptypeptr       an unsigned int that is set to the type value
   2317   pdataptr       an unsigned int that is set to the detailed property value
   2318   errorcodeptr   the error code variable
   2319   cb             the compile data
   2320 
   2321 Returns:         TRUE if the type value was found, or FALSE for an invalid type
   2322 */
   2323 
   2324 static BOOL
   2325 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr,
   2326   unsigned int *pdataptr, int *errorcodeptr, compile_block *cb)
   2327 {
   2328 register PCRE2_UCHAR c;
   2329 size_t i, bot, top;
   2330 PCRE2_SPTR ptr = *ptrptr;
   2331 PCRE2_UCHAR name[32];
   2332 
   2333 *negptr = FALSE;
   2334 c = *(++ptr);
   2335 
   2336 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
   2337 negation. */
   2338 
   2339 if (c == CHAR_LEFT_CURLY_BRACKET)
   2340   {
   2341   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
   2342     {
   2343     *negptr = TRUE;
   2344     ptr++;
   2345     }
   2346   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
   2347     {
   2348     c = *(++ptr);
   2349     if (c == CHAR_NULL) goto ERROR_RETURN;
   2350     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
   2351     name[i] = c;
   2352     }
   2353   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
   2354   name[i] = 0;
   2355   }
   2356 
   2357 /* Otherwise there is just one following character, which must be an ASCII
   2358 letter. */
   2359 
   2360 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
   2361   {
   2362   name[0] = c;
   2363   name[1] = 0;
   2364   }
   2365 else goto ERROR_RETURN;
   2366 
   2367 *ptrptr = ptr;
   2368 
   2369 /* Search for a recognized property name using binary chop. */
   2370 
   2371 bot = 0;
   2372 top = PRIV(utt_size);
   2373 
   2374 while (bot < top)
   2375   {
   2376   int r;
   2377   i = (bot + top) >> 1;
   2378   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
   2379   if (r == 0)
   2380     {
   2381     *ptypeptr = PRIV(utt)[i].type;
   2382     *pdataptr = PRIV(utt)[i].value;
   2383     return TRUE;
   2384     }
   2385   if (r > 0) bot = i + 1; else top = i;
   2386   }
   2387 *errorcodeptr = ERR47;   /* Unrecognized name */
   2388 return FALSE;
   2389 
   2390 ERROR_RETURN:            /* Malformed \P or \p */
   2391 *errorcodeptr = ERR46;
   2392 *ptrptr = ptr;
   2393 return FALSE;
   2394 }
   2395 #endif
   2396 
   2397 
   2398 
   2399 /*************************************************
   2400 *         Read repeat counts                     *
   2401 *************************************************/
   2402 
   2403 /* Read an item of the form {n,m} and return the values. This is called only
   2404 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
   2405 so the syntax is guaranteed to be correct, but we need to check the values.
   2406 
   2407 Arguments:
   2408   p              pointer to first char after '{'
   2409   minp           pointer to int for min
   2410   maxp           pointer to int for max
   2411                  returned as -1 if no max
   2412   errorcodeptr   points to error code variable
   2413 
   2414 Returns:         pointer to '}' on success;
   2415                  current ptr on error, with errorcodeptr set non-zero
   2416 */
   2417 
   2418 static PCRE2_SPTR
   2419 read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr)
   2420 {
   2421 int min = 0;
   2422 int max = -1;
   2423 
   2424 while (IS_DIGIT(*p))
   2425   {
   2426   min = min * 10 + (int)(*p++ - CHAR_0);
   2427   if (min > 65535)
   2428     {
   2429     *errorcodeptr = ERR5;
   2430     return p;
   2431     }
   2432   }
   2433 
   2434 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
   2435   {
   2436   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
   2437     {
   2438     max = 0;
   2439     while(IS_DIGIT(*p))
   2440       {
   2441       max = max * 10 + (int)(*p++ - CHAR_0);
   2442       if (max > 65535)
   2443         {
   2444         *errorcodeptr = ERR5;
   2445         return p;
   2446         }
   2447       }
   2448     if (max < min)
   2449       {
   2450       *errorcodeptr = ERR4;
   2451       return p;
   2452       }
   2453     }
   2454   }
   2455 
   2456 *minp = min;
   2457 *maxp = max;
   2458 return p;
   2459 }
   2460 
   2461 
   2462 
   2463 /*************************************************
   2464 *   Scan compiled regex for recursion reference  *
   2465 *************************************************/
   2466 
   2467 /* This function scans through a compiled pattern until it finds an instance of
   2468 OP_RECURSE.
   2469 
   2470 Arguments:
   2471   code        points to start of expression
   2472   utf         TRUE in UTF mode
   2473 
   2474 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
   2475 */
   2476 
   2477 static PCRE2_SPTR
   2478 find_recurse(PCRE2_SPTR code, BOOL utf)
   2479 {
   2480 for (;;)
   2481   {
   2482   register PCRE2_UCHAR c = *code;
   2483   if (c == OP_END) return NULL;
   2484   if (c == OP_RECURSE) return code;
   2485 
   2486   /* XCLASS is used for classes that cannot be represented just by a bit map.
   2487   This includes negated single high-valued characters. CALLOUT_STR is used for
   2488   callouts with string arguments. In both cases the length in the table is
   2489   zero; the actual length is stored in the compiled code. */
   2490 
   2491   if (c == OP_XCLASS) code += GET(code, 1);
   2492     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
   2493 
   2494   /* Otherwise, we can get the item's length from the table, except that for
   2495   repeated character types, we have to test for \p and \P, which have an extra
   2496   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
   2497   must add in its length. */
   2498 
   2499   else
   2500     {
   2501     switch(c)
   2502       {
   2503       case OP_TYPESTAR:
   2504       case OP_TYPEMINSTAR:
   2505       case OP_TYPEPLUS:
   2506       case OP_TYPEMINPLUS:
   2507       case OP_TYPEQUERY:
   2508       case OP_TYPEMINQUERY:
   2509       case OP_TYPEPOSSTAR:
   2510       case OP_TYPEPOSPLUS:
   2511       case OP_TYPEPOSQUERY:
   2512       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2513       break;
   2514 
   2515       case OP_TYPEPOSUPTO:
   2516       case OP_TYPEUPTO:
   2517       case OP_TYPEMINUPTO:
   2518       case OP_TYPEEXACT:
   2519       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   2520         code += 2;
   2521       break;
   2522 
   2523       case OP_MARK:
   2524       case OP_PRUNE_ARG:
   2525       case OP_SKIP_ARG:
   2526       case OP_THEN_ARG:
   2527       code += code[1];
   2528       break;
   2529       }
   2530 
   2531     /* Add in the fixed length from the table */
   2532 
   2533     code += PRIV(OP_lengths)[c];
   2534 
   2535     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
   2536     be followed by a multi-unit character. The length in the table is a
   2537     minimum, so we have to arrange to skip the extra units. */
   2538 
   2539 #ifdef MAYBE_UTF_MULTI
   2540     if (utf) switch(c)
   2541       {
   2542       case OP_CHAR:
   2543       case OP_CHARI:
   2544       case OP_NOT:
   2545       case OP_NOTI:
   2546       case OP_EXACT:
   2547       case OP_EXACTI:
   2548       case OP_NOTEXACT:
   2549       case OP_NOTEXACTI:
   2550       case OP_UPTO:
   2551       case OP_UPTOI:
   2552       case OP_NOTUPTO:
   2553       case OP_NOTUPTOI:
   2554       case OP_MINUPTO:
   2555       case OP_MINUPTOI:
   2556       case OP_NOTMINUPTO:
   2557       case OP_NOTMINUPTOI:
   2558       case OP_POSUPTO:
   2559       case OP_POSUPTOI:
   2560       case OP_NOTPOSUPTO:
   2561       case OP_NOTPOSUPTOI:
   2562       case OP_STAR:
   2563       case OP_STARI:
   2564       case OP_NOTSTAR:
   2565       case OP_NOTSTARI:
   2566       case OP_MINSTAR:
   2567       case OP_MINSTARI:
   2568       case OP_NOTMINSTAR:
   2569       case OP_NOTMINSTARI:
   2570       case OP_POSSTAR:
   2571       case OP_POSSTARI:
   2572       case OP_NOTPOSSTAR:
   2573       case OP_NOTPOSSTARI:
   2574       case OP_PLUS:
   2575       case OP_PLUSI:
   2576       case OP_NOTPLUS:
   2577       case OP_NOTPLUSI:
   2578       case OP_MINPLUS:
   2579       case OP_MINPLUSI:
   2580       case OP_NOTMINPLUS:
   2581       case OP_NOTMINPLUSI:
   2582       case OP_POSPLUS:
   2583       case OP_POSPLUSI:
   2584       case OP_NOTPOSPLUS:
   2585       case OP_NOTPOSPLUSI:
   2586       case OP_QUERY:
   2587       case OP_QUERYI:
   2588       case OP_NOTQUERY:
   2589       case OP_NOTQUERYI:
   2590       case OP_MINQUERY:
   2591       case OP_MINQUERYI:
   2592       case OP_NOTMINQUERY:
   2593       case OP_NOTMINQUERYI:
   2594       case OP_POSQUERY:
   2595       case OP_POSQUERYI:
   2596       case OP_NOTPOSQUERY:
   2597       case OP_NOTPOSQUERYI:
   2598       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   2599       break;
   2600       }
   2601 #else
   2602     (void)(utf);  /* Keep compiler happy by referencing function argument */
   2603 #endif  /* MAYBE_UTF_MULTI */
   2604     }
   2605   }
   2606 }
   2607 
   2608 
   2609 
   2610 /*************************************************
   2611 *           Check for POSIX class syntax         *
   2612 *************************************************/
   2613 
   2614 /* This function is called when the sequence "[:" or "[." or "[=" is
   2615 encountered in a character class. It checks whether this is followed by a
   2616 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
   2617 reach an unescaped ']' without the special preceding character, return FALSE.
   2618 
   2619 Originally, this function only recognized a sequence of letters between the
   2620 terminators, but it seems that Perl recognizes any sequence of characters,
   2621 though of course unknown POSIX names are subsequently rejected. Perl gives an
   2622 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
   2623 didn't consider this to be a POSIX class. Likewise for [:1234:].
   2624 
   2625 The problem in trying to be exactly like Perl is in the handling of escapes. We
   2626 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
   2627 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
   2628 below handles the special cases \\ and \], but does not try to do any other
   2629 escape processing. This makes it different from Perl for cases such as
   2630 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
   2631 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
   2632 when Perl does, I think.
   2633 
   2634 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
   2635 It seems that the appearance of a nested POSIX class supersedes an apparent
   2636 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
   2637 a digit. This is handled by returning FALSE if the start of a new group with
   2638 the same terminator is encountered, since the next closing sequence must close
   2639 the nested group, not the outer one.
   2640 
   2641 In Perl, unescaped square brackets may also appear as part of class names. For
   2642 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
   2643 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
   2644 seem right at all. PCRE does not allow closing square brackets in POSIX class
   2645 names.
   2646 
   2647 Arguments:
   2648   ptr      pointer to the initial [
   2649   endptr   where to return a pointer to the terminating ':', '.', or '='
   2650 
   2651 Returns:   TRUE or FALSE
   2652 */
   2653 
   2654 static BOOL
   2655 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr)
   2656 {
   2657 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
   2658 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
   2659 
   2660 for (++ptr; *ptr != CHAR_NULL; ptr++)
   2661   {
   2662   if (*ptr == CHAR_BACKSLASH &&
   2663       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
   2664     ptr++;
   2665   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
   2666             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
   2667   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   2668     {
   2669     *endptr = ptr;
   2670     return TRUE;
   2671     }
   2672   }
   2673 
   2674 return FALSE;
   2675 }
   2676 
   2677 
   2678 
   2679 /*************************************************
   2680 *          Check POSIX class name                *
   2681 *************************************************/
   2682 
   2683 /* This function is called to check the name given in a POSIX-style class entry
   2684 such as [:alnum:].
   2685 
   2686 Arguments:
   2687   ptr        points to the first letter
   2688   len        the length of the name
   2689 
   2690 Returns:     a value representing the name, or -1 if unknown
   2691 */
   2692 
   2693 static int
   2694 check_posix_name(PCRE2_SPTR ptr, int len)
   2695 {
   2696 const char *pn = posix_names;
   2697 register int yield = 0;
   2698 while (posix_name_lengths[yield] != 0)
   2699   {
   2700   if (len == posix_name_lengths[yield] &&
   2701     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
   2702   pn += posix_name_lengths[yield] + 1;
   2703   yield++;
   2704   }
   2705 return -1;
   2706 }
   2707 
   2708 
   2709 
   2710 #ifdef SUPPORT_UNICODE
   2711 /*************************************************
   2712 *           Get othercase range                  *
   2713 *************************************************/
   2714 
   2715 /* This function is passed the start and end of a class range in UCT mode. It
   2716 searches up the characters, looking for ranges of characters in the "other"
   2717 case. Each call returns the next one, updating the start address. A character
   2718 with multiple other cases is returned on its own with a special return value.
   2719 
   2720 Arguments:
   2721   cptr        points to starting character value; updated
   2722   d           end value
   2723   ocptr       where to put start of othercase range
   2724   odptr       where to put end of othercase range
   2725 
   2726 Yield:        -1 when no more
   2727                0 when a range is returned
   2728               >0 the CASESET offset for char with multiple other cases
   2729                 in this case, ocptr contains the original
   2730 */
   2731 
   2732 static int
   2733 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
   2734   uint32_t *odptr)
   2735 {
   2736 uint32_t c, othercase, next;
   2737 unsigned int co;
   2738 
   2739 /* Find the first character that has an other case. If it has multiple other
   2740 cases, return its case offset value. */
   2741 
   2742 for (c = *cptr; c <= d; c++)
   2743   {
   2744   if ((co = UCD_CASESET(c)) != 0)
   2745     {
   2746     *ocptr = c++;   /* Character that has the set */
   2747     *cptr = c;      /* Rest of input range */
   2748     return (int)co;
   2749     }
   2750   if ((othercase = UCD_OTHERCASE(c)) != c) break;
   2751   }
   2752 
   2753 if (c > d) return -1;  /* Reached end of range */
   2754 
   2755 /* Found a character that has a single other case. Search for the end of the
   2756 range, which is either the end of the input range, or a character that has zero
   2757 or more than one other cases. */
   2758 
   2759 *ocptr = othercase;
   2760 next = othercase + 1;
   2761 
   2762 for (++c; c <= d; c++)
   2763   {
   2764   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
   2765   next++;
   2766   }
   2767 
   2768 *odptr = next - 1;     /* End of othercase range */
   2769 *cptr = c;             /* Rest of input range */
   2770 return 0;
   2771 }
   2772 #endif  /* SUPPORT_UNICODE */
   2773 
   2774 
   2775 
   2776 /*************************************************
   2777 *        Add a character or range to a class     *
   2778 *************************************************/
   2779 
   2780 /* This function packages up the logic of adding a character or range of
   2781 characters to a class. The character values in the arguments will be within the
   2782 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
   2783 mutually recursive with the function immediately below.
   2784 
   2785 Arguments:
   2786   classbits     the bit map for characters < 256
   2787   uchardptr     points to the pointer for extra data
   2788   options       the options word
   2789   cb            compile data
   2790   start         start of range character
   2791   end           end of range character
   2792 
   2793 Returns:        the number of < 256 characters added
   2794                 the pointer to extra data is updated
   2795 */
   2796 
   2797 static unsigned int
   2798 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
   2799   compile_block *cb, uint32_t start, uint32_t end)
   2800 {
   2801 uint32_t c;
   2802 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
   2803 unsigned int n8 = 0;
   2804 
   2805 /* If caseless matching is required, scan the range and process alternate
   2806 cases. In Unicode, there are 8-bit characters that have alternate cases that
   2807 are greater than 255 and vice-versa. Sometimes we can just extend the original
   2808 range. */
   2809 
   2810 if ((options & PCRE2_CASELESS) != 0)
   2811   {
   2812 #ifdef SUPPORT_UNICODE
   2813   if ((options & PCRE2_UTF) != 0)
   2814     {
   2815     int rc;
   2816     uint32_t oc, od;
   2817 
   2818     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
   2819     c = start;
   2820 
   2821     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
   2822       {
   2823       /* Handle a single character that has more than one other case. */
   2824 
   2825       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
   2826         PRIV(ucd_caseless_sets) + rc, oc);
   2827 
   2828       /* Do nothing if the other case range is within the original range. */
   2829 
   2830       else if (oc >= start && od <= end) continue;
   2831 
   2832       /* Extend the original range if there is overlap, noting that if oc < c, we
   2833       can't have od > end because a subrange is always shorter than the basic
   2834       range. Otherwise, use a recursive call to add the additional range. */
   2835 
   2836       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
   2837       else if (od > end && oc <= end + 1)
   2838         {
   2839         end = od;       /* Extend upwards */
   2840         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
   2841         }
   2842       else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
   2843       }
   2844     }
   2845   else
   2846 #endif  /* SUPPORT_UNICODE */
   2847 
   2848   /* Not UTF mode */
   2849 
   2850   for (c = start; c <= classbits_end; c++)
   2851     {
   2852     SETBIT(classbits, cb->fcc[c]);
   2853     n8++;
   2854     }
   2855   }
   2856 
   2857 /* Now handle the original range. Adjust the final value according to the bit
   2858 length - this means that the same lists of (e.g.) horizontal spaces can be used
   2859 in all cases. */
   2860 
   2861 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
   2862   end = MAX_NON_UTF_CHAR;
   2863 
   2864 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
   2865 
   2866 for (c = start; c <= classbits_end; c++)
   2867   {
   2868   /* Regardless of start, c will always be <= 255. */
   2869   SETBIT(classbits, c);
   2870   n8++;
   2871   }
   2872 
   2873 #ifdef SUPPORT_WIDE_CHARS
   2874 if (start <= 0xff) start = 0xff + 1;
   2875 
   2876 if (end >= start)
   2877   {
   2878   PCRE2_UCHAR *uchardata = *uchardptr;
   2879 
   2880 #ifdef SUPPORT_UNICODE
   2881   if ((options & PCRE2_UTF) != 0)
   2882     {
   2883     if (start < end)
   2884       {
   2885       *uchardata++ = XCL_RANGE;
   2886       uchardata += PRIV(ord2utf)(start, uchardata);
   2887       uchardata += PRIV(ord2utf)(end, uchardata);
   2888       }
   2889     else if (start == end)
   2890       {
   2891       *uchardata++ = XCL_SINGLE;
   2892       uchardata += PRIV(ord2utf)(start, uchardata);
   2893       }
   2894     }
   2895   else
   2896 #endif  /* SUPPORT_UNICODE */
   2897 
   2898   /* Without UTF support, character values are constrained by the bit length,
   2899   and can only be > 256 for 16-bit and 32-bit libraries. */
   2900 
   2901 #if PCRE2_CODE_UNIT_WIDTH == 8
   2902     {}
   2903 #else
   2904   if (start < end)
   2905     {
   2906     *uchardata++ = XCL_RANGE;
   2907     *uchardata++ = start;
   2908     *uchardata++ = end;
   2909     }
   2910   else if (start == end)
   2911     {
   2912     *uchardata++ = XCL_SINGLE;
   2913     *uchardata++ = start;
   2914     }
   2915 #endif
   2916   *uchardptr = uchardata;   /* Updata extra data pointer */
   2917   }
   2918 #else
   2919   (void)uchardptr;          /* Avoid compiler warning */
   2920 #endif /* SUPPORT_WIDE_CHARS */
   2921 
   2922 return n8;    /* Number of 8-bit characters */
   2923 }
   2924 
   2925 
   2926 
   2927 /*************************************************
   2928 *        Add a list of characters to a class     *
   2929 *************************************************/
   2930 
   2931 /* This function is used for adding a list of case-equivalent characters to a
   2932 class, and also for adding a list of horizontal or vertical whitespace. If the
   2933 list is in order (which it should be), ranges of characters are detected and
   2934 handled appropriately. This function is mutually recursive with the function
   2935 above.
   2936 
   2937 Arguments:
   2938   classbits     the bit map for characters < 256
   2939   uchardptr     points to the pointer for extra data
   2940   options       the options word
   2941   cb            contains pointers to tables etc.
   2942   p             points to row of 32-bit values, terminated by NOTACHAR
   2943   except        character to omit; this is used when adding lists of
   2944                   case-equivalent characters to avoid including the one we
   2945                   already know about
   2946 
   2947 Returns:        the number of < 256 characters added
   2948                 the pointer to extra data is updated
   2949 */
   2950 
   2951 static unsigned int
   2952 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
   2953   compile_block *cb, const uint32_t *p, unsigned int except)
   2954 {
   2955 unsigned int n8 = 0;
   2956 while (p[0] < NOTACHAR)
   2957   {
   2958   unsigned int n = 0;
   2959   if (p[0] != except)
   2960     {
   2961     while(p[n+1] == p[0] + n + 1) n++;
   2962     n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]);
   2963     }
   2964   p += n + 1;
   2965   }
   2966 return n8;
   2967 }
   2968 
   2969 
   2970 
   2971 /*************************************************
   2972 *    Add characters not in a list to a class     *
   2973 *************************************************/
   2974 
   2975 /* This function is used for adding the complement of a list of horizontal or
   2976 vertical whitespace to a class. The list must be in order.
   2977 
   2978 Arguments:
   2979   classbits     the bit map for characters < 256
   2980   uchardptr     points to the pointer for extra data
   2981   options       the options word
   2982   cb            contains pointers to tables etc.
   2983   p             points to row of 32-bit values, terminated by NOTACHAR
   2984 
   2985 Returns:        the number of < 256 characters added
   2986                 the pointer to extra data is updated
   2987 */
   2988 
   2989 static unsigned int
   2990 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
   2991   uint32_t options, compile_block *cb, const uint32_t *p)
   2992 {
   2993 BOOL utf = (options & PCRE2_UTF) != 0;
   2994 unsigned int n8 = 0;
   2995 if (p[0] > 0)
   2996   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
   2997 while (p[0] < NOTACHAR)
   2998   {
   2999   while (p[1] == p[0] + 1) p++;
   3000   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
   3001     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
   3002   p++;
   3003   }
   3004 return n8;
   3005 }
   3006 
   3007 
   3008 
   3009 /*************************************************
   3010 *       Process (*VERB) name for escapes         *
   3011 *************************************************/
   3012 
   3013 /* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
   3014 process the characters in a verb's name argument. It is called twice, once with
   3015 codeptr == NULL, to find out the length of the processed name, and again to put
   3016 the name into memory.
   3017 
   3018 Arguments:
   3019   ptrptr        pointer to the input pointer
   3020   codeptr       pointer to the compiled code pointer
   3021   errorcodeptr  pointer to the error code
   3022   options       the options bits
   3023   utf           TRUE if processing UTF
   3024   cb            compile data block
   3025 
   3026 Returns:        length of the processed name, or < 0 on error
   3027 */
   3028 
   3029 static int
   3030 process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
   3031   uint32_t options, BOOL utf, compile_block *cb)
   3032 {
   3033 int32_t arglen = 0;
   3034 BOOL inescq = FALSE;
   3035 PCRE2_SPTR ptr = *ptrptr;
   3036 PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
   3037 
   3038 for (; ptr < cb->end_pattern; ptr++)
   3039   {
   3040   uint32_t x = *ptr;
   3041 
   3042   /* Skip over literals */
   3043 
   3044   if (inescq)
   3045     {
   3046     if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   3047       {
   3048       inescq = FALSE;
   3049       ptr++;;
   3050       continue;
   3051       }
   3052     }
   3053 
   3054   else  /* Not a literal character */
   3055     {
   3056     if (x == CHAR_RIGHT_PARENTHESIS) break;
   3057 
   3058     /* Skip over comments and whitespace in extended mode. */
   3059 
   3060     if ((options & PCRE2_EXTENDED) != 0)
   3061       {
   3062       PCRE2_SPTR wscptr = ptr;
   3063       while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
   3064       if (x == CHAR_NUMBER_SIGN)
   3065         {
   3066         ptr++;
   3067         while (*ptr != CHAR_NULL || ptr < cb->end_pattern)
   3068           {
   3069           if (IS_NEWLINE(ptr))       /* For non-fixed-length newline cases, */
   3070             {                        /* IS_NEWLINE sets cb->nllen. */
   3071             ptr += cb->nllen;
   3072             break;
   3073             }
   3074           ptr++;
   3075 #ifdef SUPPORT_UNICODE
   3076           if (utf) FORWARDCHAR(ptr);
   3077 #endif
   3078           }
   3079         }
   3080 
   3081       /* If we have skipped any characters, restart the loop. */
   3082 
   3083       if (ptr > wscptr)
   3084         {
   3085         ptr--;
   3086         continue;
   3087         }
   3088       }
   3089 
   3090     /* Process escapes */
   3091 
   3092     if (x == '\\')
   3093       {
   3094       int rc;
   3095       *errorcodeptr = 0;
   3096       rc = PRIV(check_escape)(&ptr, cb->end_pattern, &x, errorcodeptr, options,
   3097         FALSE, cb);
   3098       *ptrptr = ptr;   /* For possible error */
   3099       if (*errorcodeptr != 0) return -1;
   3100       if (rc != 0)
   3101         {
   3102         if (rc == ESC_Q)
   3103           {
   3104           inescq = TRUE;
   3105           continue;
   3106           }
   3107         if (rc == ESC_E) continue;
   3108         *errorcodeptr = ERR40;
   3109         return -1;
   3110         }
   3111       }
   3112     }
   3113 
   3114   /* We have the next character in the name. */
   3115 
   3116 #ifdef SUPPORT_UNICODE
   3117   if (utf)
   3118     {
   3119     if (code == NULL)   /* Just want the length */
   3120       {
   3121 #if PCRE2_CODE_UNIT_WIDTH == 8
   3122       int i;
   3123       for (i = 0; i < PRIV(utf8_table1_size); i++)
   3124         if ((int)x <= PRIV(utf8_table1)[i]) break;
   3125       arglen += i;
   3126 #elif PCRE2_CODE_UNIT_WIDTH == 16
   3127       if (x > 0xffff) arglen++;
   3128 #endif
   3129       }
   3130     else
   3131       {
   3132       PCRE2_UCHAR cbuff[8];
   3133       x = PRIV(ord2utf)(x, cbuff);
   3134       memcpy(code, cbuff, CU2BYTES(x));
   3135       code += x;
   3136       }
   3137     }
   3138   else
   3139 #endif  /* SUPPORT_UNICODE */
   3140 
   3141   /* Not UTF */
   3142     {
   3143     if (code != NULL) *code++ = (PCRE2_UCHAR)x;
   3144     }
   3145 
   3146   arglen++;
   3147 
   3148   if ((unsigned int)arglen > MAX_MARK)
   3149     {
   3150     *errorcodeptr = ERR76;
   3151     *ptrptr = ptr;
   3152     return -1;
   3153     }
   3154   }
   3155 
   3156 /* Update the pointers before returning. */
   3157 
   3158 *ptrptr = ptr;
   3159 if (codeptr != NULL) *codeptr = code;
   3160 return arglen;
   3161 }
   3162 
   3163 
   3164 
   3165 /*************************************************
   3166 *          Macro for the next two functions      *
   3167 *************************************************/
   3168 
   3169 /* Both scan_for_captures() and compile_branch() use this macro to generate a
   3170 fragment of code that reads the characters of a name and sets its length
   3171 (checking for not being too long). Count the characters dynamically, to avoid
   3172 the possibility of integer overflow. The same macro is used for reading *VERB
   3173 names. */
   3174 
   /* READ_NAME(ctype, errcode, errset)

   Expands to a sequence of statements (not a single statement, so it must not be
   used as the unbraced body of an if/else/loop). The expansion relies on the
   following names being in scope at the point of use:

     ptr       pattern pointer; advanced past every character accepted
     namelen   set to the number of characters accepted
     cb        compile block; cb->ctypes[] is indexed by the character
     FAILED    label jumped to when the name is too long

   Arguments:
     ctype     ctypes[] bit mask a character must match to be part of the name
     errcode   value stored in errset when namelen exceeds MAX_NAME_SIZE
     errset    lvalue that receives errcode before the jump to FAILED

   All arguments are parenthesized in the expansion so that an expression such
   as (A | B) can be passed for ctype. The second parameter is deliberately not
   called "errno", because that identifier belongs to <errno.h>. */

   #define READ_NAME(ctype, errcode, errset)                      \
     namelen = 0;                                                 \
     while (MAX_255(*ptr) && (cb->ctypes[*ptr] & (ctype)) != 0)   \
       {                                                          \
       ptr++;                                                     \
       namelen++;                                                 \
       if (namelen > MAX_NAME_SIZE)                               \
         {                                                        \
         (errset) = (errcode);                                    \
         goto FAILED;                                             \
         }                                                        \
       }
   3187 
   3188 
   3189 
   3190 /*************************************************
   3191 *      Scan regex to identify named groups       *
   3192 *************************************************/
   3193 
   3194 /* This function is called first of all, to scan for named capturing groups so
   3195 that information about them is fully available to both the compiling scans.
   3196 It skips over everything except parenthesized items.
   3197 
   3198 Arguments:
   3199   ptrptr   points to pointer to the start of the pattern
   3200   options  compiling dynamic options
   3201   cb       pointer to the compile data block
   3202 
   3203 Returns:   zero on success or a non-zero error code, with pointer updated
   3204 */
   3205 
   /* One frame of the stack that scan_for_captures() keeps in the compile
   workspace. A frame is pushed for each (?| group and for each option-setting
   group, and is consulted at '|' and ')' characters. */

   typedef struct nest_save {
     uint16_t nest_depth;    /* Parenthesis depth the frame belongs to */
     uint16_t reset_group;   /* Capture count at the start of a (?| group */
     uint16_t max_group;     /* Highest capture count seen in any alternative */
     uint16_t flags;         /* Combination of the NSF_xxx bits below */
   } nest_save;

   /* Bits for nest_save.flags */

   #define NSF_RESET    (1u << 0)   /* Frame is for a (?| reset group */
   #define NSF_EXTENDED (1u << 1)   /* PCRE2_EXTENDED was set when frame was made */
   #define NSF_DUPNAMES (1u << 2)   /* PCRE2_DUPNAMES was set when frame was made */
   3216 
   3217 static int scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options,
   3218   compile_block *cb)
   3219 {
   3220 uint32_t c;
   3221 uint32_t delimiter;
   3222 uint32_t set, unset, *optset;
   3223 uint32_t skiptoket = 0;
   3224 uint16_t nest_depth = 0;
   3225 int errorcode = 0;
   3226 int escape;
   3227 int namelen;
   3228 int i;
   3229 BOOL inescq = FALSE;
   3230 BOOL isdupname;
   3231 BOOL utf = (options & PCRE2_UTF) != 0;
   3232 BOOL negate_class;
   3233 PCRE2_SPTR name;
   3234 PCRE2_SPTR start;
   3235 PCRE2_SPTR ptr = *ptrptr;
   3236 named_group *ng;
   3237 nest_save *top_nest = NULL;
   3238 nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
   3239 
   3240 /* The size of the nest_save structure might not be a factor of the size of the
   3241 workspace. Therefore we must round down end_nests so as to correctly avoid
   3242 creating a nest_save that spans the end of the workspace. */
   3243 
   3244 end_nests = (nest_save *)((char *)end_nests -
   3245   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
   3246 
   3247 /* Now scan the pattern */
   3248 
   3249 for (; ptr < cb->end_pattern; ptr++)
   3250   {
   3251   c = *ptr;
   3252 
   3253   /* Parenthesized groups set skiptoket when all following characters up to the
   3254   next closing parenthesis must be ignored. The parenthesis itself must be
   3255   processed (to end the nested parenthesized item). */
   3256 
   3257   if (skiptoket != 0)
   3258     {
   3259     if (c != CHAR_RIGHT_PARENTHESIS) continue;
   3260     skiptoket = 0;
   3261     }
   3262 
   3263   /* Skip over literals */
   3264 
   3265   if (inescq)
   3266     {
   3267     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   3268       {
   3269       inescq = FALSE;
   3270       ptr++;
   3271       }
   3272     continue;
   3273     }
   3274 
   3275   /* Skip over # comments and whitespace in extended mode. */
   3276 
   3277   if ((options & PCRE2_EXTENDED) != 0)
   3278     {
   3279     PCRE2_SPTR wscptr = ptr;
   3280     while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
   3281     if (c == CHAR_NUMBER_SIGN)
   3282       {
   3283       ptr++;
   3284       while (ptr < cb->end_pattern)
   3285         {
   3286         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
   3287           {                          /* IS_NEWLINE sets cb->nllen. */
   3288           ptr += cb->nllen;
   3289           break;
   3290           }
   3291         ptr++;
   3292 #ifdef SUPPORT_UNICODE
   3293         if (utf) FORWARDCHAR(ptr);
   3294 #endif
   3295         }
   3296       }
   3297 
   3298     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
   3299     a comment. */
   3300 
   3301     if (ptr > wscptr)
   3302       {
   3303       ptr--;
   3304       continue;
   3305       }
   3306     }
   3307 
   3308   /* Process the next pattern item. */
   3309 
   3310   switch(c)
   3311     {
   3312     default:              /* Most characters are just skipped */
   3313     break;
   3314 
   3315     /* Skip escapes except for \Q */
   3316 
   3317     case CHAR_BACKSLASH:
   3318     errorcode = 0;
   3319     escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode, options,
   3320       FALSE, cb);
   3321     if (errorcode != 0) goto FAILED;
   3322     if (escape == ESC_Q) inescq = TRUE;
   3323     break;
   3324 
   3325     /* Skip a character class. The syntax is complicated so we have to
   3326     replicate some of what happens when a class is processed for real. */
   3327 
   3328     case CHAR_LEFT_SQUARE_BRACKET:
   3329     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0 ||
   3330         PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
   3331       {
   3332       ptr += 6;
   3333       break;
   3334       }
   3335 
   3336     /* If the first character is '^', set the negation flag (not actually used
   3337     here, except to recognize only one ^) and skip it. If the first few
   3338     characters (either before or after ^) are \Q\E or \E we skip them too. This
   3339     makes for compatibility with Perl. */
   3340 
   3341     negate_class = FALSE;
   3342     for (;;)
   3343       {
   3344       c = *(++ptr);   /* First character in class */
   3345       if (c == CHAR_BACKSLASH)
   3346         {
   3347         if (ptr[1] == CHAR_E)
   3348           ptr++;
   3349         else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
   3350           ptr += 3;
   3351         else
   3352           break;
   3353         }
   3354       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
   3355         negate_class = TRUE;
   3356       else break;
   3357       }
   3358 
   3359     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
   3360         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
   3361       break;
   3362 
   3363     /* Loop for the contents of the class */
   3364 
   3365     for (;;)
   3366       {
   3367       PCRE2_SPTR tempptr;
   3368 
   3369       if (c == CHAR_NULL && ptr >= cb->end_pattern)
   3370         {
   3371         errorcode = ERR6;  /* Missing terminating ']' */
   3372         goto FAILED;
   3373         }
   3374 
   3375 #ifdef SUPPORT_UNICODE
   3376       if (utf && HAS_EXTRALEN(c))
   3377         {                           /* Braces are required because the */
   3378         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
   3379         }
   3380 #endif
   3381 
   3382       /* Inside \Q...\E everything is literal except \E */
   3383 
   3384       if (inescq)
   3385         {
   3386         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
   3387           {
   3388           inescq = FALSE;                   /* Reset literal state */
   3389           ptr++;                            /* Skip the 'E' */
   3390           }
   3391         goto CONTINUE_CLASS;
   3392         }
   3393 
   3394       /* Skip POSIX class names. */
   3395       if (c == CHAR_LEFT_SQUARE_BRACKET &&
   3396           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   3397            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
   3398         {
   3399         ptr = tempptr + 1;
   3400         }
   3401       else if (c == CHAR_BACKSLASH)
   3402         {
   3403         errorcode = 0;
   3404         escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode,
   3405           options, TRUE, cb);
   3406         if (errorcode != 0) goto FAILED;
   3407         if (escape == ESC_Q) inescq = TRUE;
   3408         }
   3409 
   3410       CONTINUE_CLASS:
   3411       c = *(++ptr);
   3412       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
   3413       }     /* End of class-processing loop */
   3414     break;
   3415 
   3416     /* This is the real work of this function - handling parentheses. */
   3417 
   3418     case CHAR_LEFT_PARENTHESIS:
   3419     nest_depth++;
   3420 
   3421     if (ptr[1] != CHAR_QUESTION_MARK)
   3422       {
   3423       if (ptr[1] != CHAR_ASTERISK)
   3424         {
   3425         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) cb->bracount++;
   3426         }
   3427 
   3428       /* (*something) - skip over a name, and then just skip to closing ket
   3429       unless PCRE2_ALT_VERBNAMES is set, in which case we have to process
   3430       escapes in the string after a verb name terminated by a colon. */
   3431 
   3432       else
   3433         {
   3434         ptr += 2;
   3435         while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++;
   3436         if (*ptr == CHAR_COLON && (options & PCRE2_ALT_VERBNAMES) != 0)
   3437           {
   3438           ptr++;
   3439           if (process_verb_name(&ptr, NULL, &errorcode, options, utf, cb) < 0)
   3440             goto FAILED;
   3441           }
   3442         else
   3443           {
   3444           while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
   3445             ptr++;
   3446           }
   3447         nest_depth--;
   3448         }
   3449       }
   3450 
   3451     /* Handle (?...) groups */
   3452 
   3453     else switch(ptr[2])
   3454       {
   3455       default:
   3456       ptr += 2;
   3457       if (ptr[0] == CHAR_R ||                           /* (?R) */
   3458           ptr[0] == CHAR_NUMBER_SIGN ||                 /* (?#) */
   3459           IS_DIGIT(ptr[0]) ||                           /* (?n) */
   3460           (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1])))   /* (?-n) */
   3461         {
   3462         skiptoket = ptr[0];
   3463         break;
   3464         }
   3465 
   3466       /* Handle (?| and (?imsxJU: which are the only other valid forms. Both
   3467       need a new block on the nest stack. */
   3468 
   3469       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
   3470       else if (++top_nest >= end_nests)
   3471         {
   3472         errorcode = ERR84;
   3473         goto FAILED;
   3474         }
   3475       top_nest->nest_depth = nest_depth;
   3476       top_nest->flags = 0;
   3477       if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED;
   3478       if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES;
   3479 
   3480       if (*ptr == CHAR_VERTICAL_LINE)
   3481         {
   3482         top_nest->reset_group = (uint16_t)cb->bracount;
   3483         top_nest->max_group = (uint16_t)cb->bracount;
   3484         top_nest->flags |= NSF_RESET;
   3485         cb->external_flags |= PCRE2_DUPCAPUSED;
   3486         break;
   3487         }
   3488 
   3489       /* Scan options */
   3490 
   3491       top_nest->reset_group = 0;
   3492       top_nest->max_group = 0;
   3493 
   3494       set = unset = 0;
   3495       optset = &set;
   3496 
   3497       /* Need only track (?x: and (?J: at this stage */
   3498 
   3499       while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
   3500         {
   3501         switch (*ptr++)
   3502           {
   3503           case CHAR_MINUS: optset = &unset; break;
   3504 
   3505           case CHAR_x: *optset |= PCRE2_EXTENDED; break;
   3506 
   3507           case CHAR_J:
   3508           *optset |= PCRE2_DUPNAMES;
   3509           cb->external_flags |= PCRE2_JCHANGED;
   3510           break;
   3511 
   3512           case CHAR_i:
   3513           case CHAR_m:
   3514           case CHAR_s:
   3515           case CHAR_U:
   3516           break;
   3517 
   3518           default:
   3519           errorcode = ERR11;
   3520           ptr--;    /* Correct the offset */
   3521           goto FAILED;
   3522           }
   3523         }
   3524 
   3525       options = (options | set) & (~unset);
   3526 
   3527       /* If the options ended with ')' this is not the start of a nested
   3528       group with option changes, so the options change at this level. If the
   3529       previous level set up a nest block, discard the one we have just created.
   3530       Otherwise adjust it for the previous level. */
   3531 
   3532       if (*ptr == CHAR_RIGHT_PARENTHESIS)
   3533         {
   3534         nest_depth--;
   3535         if (top_nest > (nest_save *)(cb->start_workspace) &&
   3536             (top_nest-1)->nest_depth == nest_depth) top_nest --;
   3537         else top_nest->nest_depth = nest_depth;
   3538         }
   3539       break;
   3540 
   3541       /* Skip over a numerical or string argument for a callout. */
   3542 
   3543       case CHAR_C:
   3544       ptr += 2;
   3545       if (ptr[1] == CHAR_RIGHT_PARENTHESIS) break;
   3546       if (IS_DIGIT(ptr[1]))
   3547         {
   3548         while (IS_DIGIT(ptr[1])) ptr++;
   3549         }
   3550 
   3551       /* Handle a string argument */
   3552 
   3553       else
   3554         {
   3555         ptr++;
   3556         delimiter = 0;
   3557         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
   3558           {
   3559           if (*ptr == PRIV(callout_start_delims)[i])
   3560             {
   3561             delimiter = PRIV(callout_end_delims)[i];
   3562             break;
   3563             }
   3564           }
   3565 
   3566         if (delimiter == 0)
   3567           {
   3568           errorcode = ERR82;
   3569           goto FAILED;
   3570           }
   3571 
   3572         start = ptr;
   3573         do
   3574           {
   3575           if (++ptr >= cb->end_pattern)
   3576             {
   3577             errorcode = ERR81;
   3578             ptr = start;   /* To give a more useful message */
   3579             goto FAILED;
   3580             }
   3581           if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
   3582           }
   3583         while (ptr[0] != delimiter);
   3584         }
   3585 
   3586       /* Check terminating ) */
   3587 
   3588       if (ptr[1] != CHAR_RIGHT_PARENTHESIS)
   3589         {
   3590         errorcode = ERR39;
   3591         ptr++;
   3592         goto FAILED;
   3593         }
   3594       break;
   3595 
   3596       /* Conditional group */
   3597 
   3598       case CHAR_LEFT_PARENTHESIS:
   3599       if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
   3600         {
   3601         nest_depth++;
   3602         ptr += 2;
   3603         break;
   3604         }
   3605 
   3606       /* Must be an assertion or a callout */
   3607 
   3608       switch(ptr[4])
   3609        {
   3610        case CHAR_LESS_THAN_SIGN:
   3611        if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
   3612          goto MISSING_ASSERTION;
   3613        /* Fall through */
   3614 
   3615        case CHAR_C:
   3616        case CHAR_EXCLAMATION_MARK:
   3617        case CHAR_EQUALS_SIGN:
   3618        ptr++;
   3619        break;
   3620 
   3621        default:
   3622        MISSING_ASSERTION:
   3623        ptr += 3;            /* To improve error message */
   3624        errorcode = ERR28;
   3625        goto FAILED;
   3626        }
   3627       break;
   3628 
   3629       case CHAR_COLON:
   3630       case CHAR_GREATER_THAN_SIGN:
   3631       case CHAR_EQUALS_SIGN:
   3632       case CHAR_EXCLAMATION_MARK:
   3633       case CHAR_AMPERSAND:
   3634       case CHAR_PLUS:
   3635       ptr += 2;
   3636       break;
   3637 
   3638       case CHAR_P:
   3639       if (ptr[3] != CHAR_LESS_THAN_SIGN)
   3640         {
   3641         ptr += 3;
   3642         break;
   3643         }
   3644       ptr++;
   3645       c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
   3646       goto DEFINE_NAME;
   3647 
   3648       case CHAR_LESS_THAN_SIGN:
   3649       if (ptr[3] == CHAR_EQUALS_SIGN || ptr[3] == CHAR_EXCLAMATION_MARK)
   3650         {
   3651         ptr += 3;
   3652         break;
   3653         }
   3654       c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
   3655       goto DEFINE_NAME;
   3656 
   3657       case CHAR_APOSTROPHE:
   3658       c = CHAR_APOSTROPHE;    /* Terminator */
   3659 
   3660       DEFINE_NAME:
   3661       name = ptr = ptr + 3;
   3662 
   3663       if (*ptr == c)          /* Empty name */
   3664         {
   3665         errorcode = ERR62;
   3666         goto FAILED;
   3667         }
   3668 
   3669       if (IS_DIGIT(*ptr))
   3670         {
   3671         errorcode = ERR44;   /* Group name must start with non-digit */
   3672         goto FAILED;
   3673         }
   3674 
   3675       if (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) == 0)
   3676         {
   3677         errorcode = ERR24;
   3678         goto FAILED;
   3679         }
   3680 
   3681       /* Advance ptr, set namelen and check its length. */
   3682       READ_NAME(ctype_word, ERR48, errorcode);
   3683 
   3684       if (*ptr != c)
   3685         {
   3686         errorcode = ERR42;
   3687         goto FAILED;
   3688         }
   3689 
   3690       if (cb->names_found >= MAX_NAME_COUNT)
   3691         {
   3692         errorcode = ERR49;
   3693         goto FAILED;
   3694         }
   3695 
   3696       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
   3697         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
   3698 
   3699       /* We have a valid name for this capturing group. */
   3700 
   3701       cb->bracount++;
   3702 
   3703       /* Scan the list to check for duplicates. For duplicate names, if the
   3704       number is the same, break the loop, which causes the name to be
   3705       discarded; otherwise, if DUPNAMES is not set, give an error.
   3706       If it is set, allow the name with a different number, but continue
   3707       scanning in case this is a duplicate with the same number. For
   3708       non-duplicate names, give an error if the number is duplicated. */
   3709 
   3710       isdupname = FALSE;
   3711       ng = cb->named_groups;
   3712       for (i = 0; i < cb->names_found; i++, ng++)
   3713         {
   3714         if (namelen == ng->length &&
   3715             PRIV(strncmp)(name, ng->name, (size_t)namelen) == 0)
   3716           {
   3717           if (ng->number == cb->bracount) break;
   3718           if ((options & PCRE2_DUPNAMES) == 0)
   3719             {
   3720             errorcode = ERR43;
   3721             goto FAILED;
   3722             }
   3723           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
   3724           cb->dupnames = TRUE;              /* Duplicate names exist */
   3725           }
   3726         else if (ng->number == cb->bracount)
   3727           {
   3728           errorcode = ERR65;
   3729           goto FAILED;
   3730           }
   3731         }
   3732 
   3733       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
   3734 
   3735       /* Increase the list size if necessary */
   3736 
   3737       if (cb->names_found >= cb->named_group_list_size)
   3738         {
   3739         uint32_t newsize = cb->named_group_list_size * 2;
   3740         named_group *newspace =
   3741           cb->cx->memctl.malloc(newsize * sizeof(named_group),
   3742           cb->cx->memctl.memory_data);
   3743         if (newspace == NULL)
   3744           {
   3745           errorcode = ERR21;
   3746           goto FAILED;
   3747           }
   3748 
   3749         memcpy(newspace, cb->named_groups,
   3750           cb->named_group_list_size * sizeof(named_group));
   3751         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
   3752           cb->cx->memctl.free((void *)cb->named_groups,
   3753           cb->cx->memctl.memory_data);
   3754         cb->named_groups = newspace;
   3755         cb->named_group_list_size = newsize;
   3756         }
   3757 
   3758       /* Add this name to the list */
   3759 
   3760       cb->named_groups[cb->names_found].name = name;
   3761       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
   3762       cb->named_groups[cb->names_found].number = cb->bracount;
   3763       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
   3764       cb->names_found++;
   3765       break;
   3766       }        /* End of (? switch */
   3767     break;     /* End of ( handling */
   3768 
   3769     /* At an alternation, reset the capture count if we are in a (?| group. */
   3770 
   3771     case CHAR_VERTICAL_LINE:
   3772     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
   3773         (top_nest->flags & NSF_RESET) != 0)
   3774       {
   3775       if (cb->bracount > top_nest->max_group)
   3776         top_nest->max_group = (uint16_t)cb->bracount;
   3777       cb->bracount = top_nest->reset_group;
   3778       }
   3779     break;
   3780 
   3781     /* At a right parenthesis, reset the capture count to the maximum if we
   3782     are in a (?| group and/or reset the extended option. */
   3783 
   3784     case CHAR_RIGHT_PARENTHESIS:
   3785     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
   3786       {
   3787       if ((top_nest->flags & NSF_RESET) != 0 &&
   3788           top_nest->max_group > cb->bracount)
   3789         cb->bracount = top_nest->max_group;
   3790       if ((top_nest->flags & NSF_EXTENDED) != 0) options |= PCRE2_EXTENDED;
   3791         else options &= ~PCRE2_EXTENDED;
   3792       if ((top_nest->flags & NSF_DUPNAMES) != 0) options |= PCRE2_DUPNAMES;
   3793         else options &= ~PCRE2_DUPNAMES;
   3794       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
   3795         else top_nest--;
   3796       }
   3797     if (nest_depth == 0)    /* Unmatched closing parenthesis */
   3798       {
   3799       errorcode = ERR22;
   3800       goto FAILED;
   3801       }
   3802     nest_depth--;
   3803     break;
   3804     }
   3805   }
   3806 
   3807 if (nest_depth == 0)
   3808   {
   3809   cb->final_bracount = cb->bracount;
   3810   return 0;
   3811   }
   3812 
   3813 /* We give a special error for a missing closing parentheses after (?# because
   3814 it might otherwise be hard to see where the missing character is. */
   3815 
   3816 errorcode = (skiptoket == CHAR_NUMBER_SIGN)? ERR18 : ERR14;
   3817 
   3818 FAILED:
   3819 *ptrptr = ptr;
   3820 return errorcode;
   3821 }
   3822 
   3823 
   3824 
   3825 /*************************************************
   3826 *           Compile one branch                   *
   3827 *************************************************/
   3828 
   3829 /* Scan the pattern, compiling it into a vector. If the options are
   3830 changed during the branch, the pointer is used to change the external options
   3831 bits. This function is used during the pre-compile phase when we are trying
   3832 to find out the amount of memory needed, as well as during the real compile
   3833 phase. The value of lengthptr distinguishes the two phases.
   3834 
   3835 Arguments:
   3836   optionsptr        pointer to the option bits
   3837   codeptr           points to the pointer to the current code point
   3838   ptrptr            points to the current pattern pointer
   3839   errorcodeptr      points to error code variable
   3840   firstcuptr        place to put the first required code unit
   3841   firstcuflagsptr   place to put the first code unit flags, or a negative number
   3842   reqcuptr          place to put the last required code unit
   3843   reqcuflagsptr     place to put the last required code unit flags, or a negative number
   3844   bcptr             points to current branch chain
   3845   cond_depth        conditional nesting depth
   3846   cb                contains pointers to tables etc.
   3847   lengthptr         NULL during the real compile phase
   3848                     points to length accumulator during pre-compile phase
   3849 
   3850 Returns:            TRUE on success
   3851                     FALSE, with *errorcodeptr set non-zero on error
   3852 */
   3853 
   3854 static BOOL
   3855 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr,
   3856   PCRE2_SPTR *ptrptr, int *errorcodeptr,
   3857   uint32_t *firstcuptr, int32_t *firstcuflagsptr,
   3858   uint32_t *reqcuptr, int32_t *reqcuflagsptr,
   3859   branch_chain *bcptr, int cond_depth,
   3860   compile_block *cb, size_t *lengthptr)
   3861 {
   3862 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
   3863 int bravalue = 0;
   3864 uint32_t greedy_default, greedy_non_default;
   3865 uint32_t repeat_type, op_type;
   3866 uint32_t options = *optionsptr;               /* May change dynamically */
   3867 uint32_t firstcu, reqcu;
   3868 int32_t firstcuflags, reqcuflags;
   3869 uint32_t zeroreqcu, zerofirstcu;
   3870 int32_t zeroreqcuflags, zerofirstcuflags;
   3871 int32_t req_caseopt, reqvary, tempreqvary;
   3872 int after_manual_callout = 0;
   3873 int escape;
   3874 size_t length_prevgroup = 0;
   3875 register uint32_t c;
   3876 register PCRE2_UCHAR *code = *codeptr;
   3877 PCRE2_UCHAR *last_code = code;
   3878 PCRE2_UCHAR *orig_code = code;
   3879 PCRE2_UCHAR *tempcode;
   3880 BOOL inescq = FALSE;
   3881 BOOL groupsetfirstcu = FALSE;
   3882 PCRE2_SPTR ptr = *ptrptr;
   3883 PCRE2_SPTR tempptr;
   3884 PCRE2_UCHAR *previous = NULL;
   3885 PCRE2_UCHAR *previous_callout = NULL;
   3886 uint8_t classbits[32];
   3887 
   3888 /* We can fish out the UTF setting once and for all into a BOOL, but we must
   3889 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
   3890 dynamically as we process the pattern. */
   3891 
   3892 #ifdef SUPPORT_UNICODE
   3893 BOOL utf = (options & PCRE2_UTF) != 0;
   3894 #if PCRE2_CODE_UNIT_WIDTH != 32
   3895 PCRE2_UCHAR utf_units[6];      /* For setting up multi-cu chars */
   3896 #endif
   3897 
   3898 #else  /* No UTF support */
   3899 BOOL utf = FALSE;
   3900 #endif
   3901 
   3902 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
   3903 class_uchardata always so that it can be passed to add_to_class() always,
   3904 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
   3905 alternative calls for the different cases. */
   3906 
   3907 PCRE2_UCHAR *class_uchardata;
   3908 #ifdef SUPPORT_WIDE_CHARS
   3909 BOOL xclass;
   3910 PCRE2_UCHAR *class_uchardata_base;
   3911 #endif
   3912 
   3913 /* Set up the default and non-default settings for greediness */
   3914 
   3915 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
   3916 greedy_non_default = greedy_default ^ 1;
   3917 
   3918 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
   3919 matching encountered yet". It gets changed to REQ_NONE if we hit something that
   3920 matches a non-fixed first unit; reqcu just remains unset if we never find one.
   3921 
   3922 When we hit a repeat whose minimum is zero, we may have to adjust these values
   3923 to take the zero repeat into account. This is implemented by setting them to
   3924 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
   3925 item types that can be repeated set these backoff variables appropriately. */
   3926 
   3927 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
   3928 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
   3929 
   3930 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
   3931 according to the current setting of the caseless flag. The REQ_CASELESS value
   3932 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
   3933 to record the case status of the value. This is used only for ASCII characters.
   3934 */
   3935 
   3936 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
   3937 
   3938 /* Switch on next character until the end of the branch */
   3939 
   3940 for (;; ptr++)
   3941   {
   3942   BOOL negate_class;
   3943   BOOL should_flip_negation;
   3944   BOOL match_all_or_no_wide_chars;
   3945   BOOL possessive_quantifier;
   3946   BOOL is_quantifier;
   3947   BOOL is_recurse;
   3948   BOOL is_dupname;
   3949   BOOL reset_bracount;
   3950   int class_has_8bitchar;
   3951   int class_one_char;
   3952 #ifdef SUPPORT_WIDE_CHARS
   3953   BOOL xclass_has_prop;
   3954 #endif
   3955   int recno;                               /* Must be signed */
   3956   int refsign;                             /* Must be signed */
   3957   int terminator;                          /* Must be signed */
   3958   unsigned int mclength;
   3959   unsigned int tempbracount;
   3960   uint32_t ec;
   3961   uint32_t newoptions;
   3962   uint32_t skipunits;
   3963   uint32_t subreqcu, subfirstcu;
   3964   int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
   3965   PCRE2_UCHAR mcbuffer[8];
   3966 
   3967   /* Come here to restart the loop. */
   3968 
   3969   REDO_LOOP:
   3970 
   3971   /* Get next character in the pattern */
   3972 
   3973   c = *ptr;
   3974 
   3975   /* If we are at the end of a nested substitution, revert to the outer level
   3976   string. Nesting only happens one or two levels deep, and the inserted string
   3977   is always zero terminated. */
   3978 
   3979   if (c == CHAR_NULL && cb->nestptr[0] != NULL)
   3980     {
   3981     ptr = cb->nestptr[0];
   3982     cb->nestptr[0] = cb->nestptr[1];
   3983     cb->nestptr[1] = NULL;
   3984     c = *ptr;
   3985     }
   3986 
   3987   /* If we are in the pre-compile phase, accumulate the length used for the
   3988   previous cycle of this loop. */
   3989 
   3990   if (lengthptr != NULL)
   3991     {
   3992     if (code > cb->start_workspace + cb->workspace_size -
   3993         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
   3994       {
   3995       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
   3996         ERR52 : ERR86;
   3997       goto FAILED;
   3998       }
   3999 
   4000     /* There is at least one situation where code goes backwards: this is the
   4001     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
   4002     the class is simply eliminated. However, it is created first, so we have to
   4003     allow memory for it. Therefore, don't ever reduce the length at this point.
   4004     */
   4005 
   4006     if (code < last_code) code = last_code;
   4007 
   4008     /* Paranoid check for integer overflow */
   4009 
   4010     if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
   4011       {
   4012       *errorcodeptr = ERR20;
   4013       goto FAILED;
   4014       }
   4015     *lengthptr += (size_t)(code - last_code);
   4016 
   4017     /* If "previous" is set and it is not at the start of the work space, move
   4018     it back to there, in order to avoid filling up the work space. Otherwise,
   4019     if "previous" is NULL, reset the current code pointer to the start. */
   4020 
   4021     if (previous != NULL)
   4022       {
   4023       if (previous > orig_code)
   4024         {
   4025         memmove(orig_code, previous, (size_t)CU2BYTES(code - previous));
   4026         code -= previous - orig_code;
   4027         previous = orig_code;
   4028         }
   4029       }
   4030     else code = orig_code;
   4031 
   4032     /* Remember where this code item starts so we can pick up the length
   4033     next time round. */
   4034 
   4035     last_code = code;
   4036     }
   4037 
   4038   /* Before doing anything else we must handle all the special items that do
   4039   nothing, and which may come between an item and its quantifier. Otherwise,
   4040   when auto-callouts are enabled, a callout gets incorrectly inserted before
   4041   the quantifier is recognized. After recognizing a "do nothing" item, restart
   4042   the loop in case another one follows. */
   4043 
   4044   /* If c is not NULL we are not at the end of the pattern. If it is NULL, we
   4045   may still be in the pattern with a NULL data item. In these cases, if we are
   4046   in \Q...\E, check for the \E that ends the literal string; if not, we have a
   4047   literal character. If not in \Q...\E, an isolated \E is ignored. */
   4048 
   4049   if (c != CHAR_NULL || ptr < cb->end_pattern)
   4050     {
   4051     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   4052       {
   4053       inescq = FALSE;
   4054       ptr++;
   4055       continue;
   4056       }
   4057     else if (inescq)   /* Literal character */
   4058       {
   4059       if (previous_callout != NULL)
   4060         {
   4061         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
   4062           complete_callout(previous_callout, ptr, cb);
   4063         previous_callout = NULL;
   4064         }
   4065       if ((options & PCRE2_AUTO_CALLOUT) != 0)
   4066         {
   4067         previous_callout = code;
   4068         code = auto_callout(code, ptr, cb);
   4069         }
   4070       goto NORMAL_CHAR;
   4071       }
   4072 
   4073     /* Check for the start of a \Q...\E sequence. We must do this here rather
   4074     than later in case it is immediately followed by \E, which turns it into a
   4075     "do nothing" sequence. */
   4076 
   4077     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
   4078       {
   4079       inescq = TRUE;
   4080       ptr++;
   4081       continue;
   4082       }
   4083     }
   4084 
   4085   /* In extended mode, skip white space and #-comments that end at newline. */
   4086 
   4087   if ((options & PCRE2_EXTENDED) != 0)
   4088     {
   4089     PCRE2_SPTR wscptr = ptr;
   4090     while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
   4091     if (c == CHAR_NUMBER_SIGN)
   4092       {
   4093       ptr++;
   4094       while (ptr < cb->end_pattern)
   4095         {
   4096         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
   4097           {                          /* IS_NEWLINE sets cb->nllen. */
   4098           ptr += cb->nllen;
   4099           break;
   4100           }
   4101         ptr++;
   4102 #ifdef SUPPORT_UNICODE
   4103         if (utf) FORWARDCHAR(ptr);
   4104 #endif
   4105         }
   4106       }
   4107 
   4108     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
   4109     a comment. */
   4110 
   4111     if (ptr > wscptr) goto REDO_LOOP;
   4112     }
   4113 
   4114   /* Skip over (?# comments. */
   4115 
   4116   if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
   4117       ptr[2] == CHAR_NUMBER_SIGN)
   4118     {
   4119     ptr += 3;
   4120     while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
   4121     if (*ptr != CHAR_RIGHT_PARENTHESIS)
   4122       {
   4123       *errorcodeptr = ERR18;
   4124       goto FAILED;
   4125       }
   4126     continue;
   4127     }
   4128 
   4129   /* End of processing "do nothing" items. See if the next thing is a
   4130   quantifier. */
   4131 
   4132   is_quantifier =
   4133     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
   4134      (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
   4135 
   4136   /* Fill in length of a previous callout and create an auto callout if
   4137   required, except when the next thing is a quantifier or when processing a
   4138   property substitution string for \w etc in UCP mode. */
   4139 
   4140   if (!is_quantifier && cb->nestptr[0] == NULL)
   4141     {
   4142     if (previous_callout != NULL && after_manual_callout-- <= 0)
   4143       {
   4144       if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
   4145         complete_callout(previous_callout, ptr, cb);
   4146       previous_callout = NULL;
   4147       }
   4148 
   4149     if ((options & PCRE2_AUTO_CALLOUT) != 0)
   4150       {
   4151       previous_callout = code;
   4152       code = auto_callout(code, ptr, cb);
   4153       }
   4154     }
   4155 
   4156   /* Process the next pattern item. */
   4157 
   4158   switch(c)
   4159     {
   4160     /* ===================================================================*/
   4161     /* The branch terminates at string end or | or ) */
   4162 
   4163     case CHAR_NULL:
   4164     if (ptr < cb->end_pattern) goto NORMAL_CHAR;   /* Zero data character */
   4165     /* Fall through */
   4166 
   4167     case CHAR_VERTICAL_LINE:
   4168     case CHAR_RIGHT_PARENTHESIS:
   4169     *firstcuptr = firstcu;
   4170     *firstcuflagsptr = firstcuflags;
   4171     *reqcuptr = reqcu;
   4172     *reqcuflagsptr = reqcuflags;
   4173     *codeptr = code;
   4174     *ptrptr = ptr;
   4175     if (lengthptr != NULL)
   4176       {
   4177       if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
   4178         {
   4179         *errorcodeptr = ERR20;
   4180         goto FAILED;
   4181         }
   4182       *lengthptr += (size_t)(code - last_code);  /* To include callout length */
   4183       }
   4184     return TRUE;
   4185 
   4186 
   4187     /* ===================================================================*/
   4188     /* Handle single-character metacharacters. In multiline mode, ^ disables
   4189     the setting of any following char as a first character. */
   4190 
   4191     case CHAR_CIRCUMFLEX_ACCENT:
   4192     previous = NULL;
   4193     if ((options & PCRE2_MULTILINE) != 0)
   4194       {
   4195       if (firstcuflags == REQ_UNSET)
   4196         zerofirstcuflags = firstcuflags = REQ_NONE;
   4197       *code++ = OP_CIRCM;
   4198       }
   4199     else *code++ = OP_CIRC;
   4200     break;
   4201 
   4202     case CHAR_DOLLAR_SIGN:
   4203     previous = NULL;
   4204     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
   4205     break;
   4206 
   4207     /* There can never be a first char if '.' is first, whatever happens about
   4208     repeats. The value of reqcu doesn't change either. */
   4209 
   4210     case CHAR_DOT:
   4211     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   4212     zerofirstcu = firstcu;
   4213     zerofirstcuflags = firstcuflags;
   4214     zeroreqcu = reqcu;
   4215     zeroreqcuflags = reqcuflags;
   4216     previous = code;
   4217     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
   4218     break;
   4219 
   4220 
   4221     /* ===================================================================*/
   4222     /* Character classes. If the included characters are all < 256, we build a
   4223     32-byte bitmap of the permitted characters, except in the special case
   4224     where there is only one such character. For negated classes, we build the
   4225     map as usual, then invert it at the end. However, we use a different opcode
   4226     so that data characters > 255 can be handled correctly.
   4227 
   4228     If the class contains characters outside the 0-255 range, a different
   4229     opcode is compiled. It may optionally have a bit map for characters < 256,
   4230     but those above are explicitly listed afterwards. A flag byte tells
   4231     whether the bitmap is present, and whether this is a negated class or not.
   4232 
   4233     An isolated ']' character is not treated specially, so is just another data
   4234     character. In earlier versions of PCRE that used the original API there was
   4235     a "JavaScript compatibility mode" in which it gave an error. However,
   4236     JavaScript itself has changed in this respect so there is no longer any
   4237     need for this special handling.
   4238 
   4239     In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
   4240     used for "start of word" and "end of word". As these are otherwise illegal
   4241     sequences, we don't break anything by recognizing them. They are replaced
   4242     by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
   4243     nesting level, as no other inserted sequences will contain these oddities.
   4244     Sequences like [a[:<:]] are erroneous and are handled by the normal code
   4245     below. */
   4246 
   4247     case CHAR_LEFT_SQUARE_BRACKET:
   4248     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
   4249       {
   4250       cb->nestptr[0] = ptr + 7;
   4251       ptr = sub_start_of_word;
   4252       goto REDO_LOOP;
   4253       }
   4254 
   4255     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
   4256       {
   4257       cb->nestptr[0] = ptr + 7;
   4258       ptr = sub_end_of_word;
   4259       goto REDO_LOOP;
   4260       }
   4261 
   4262     /* Handle a real character class. */
   4263 
   4264     previous = code;
   4265 
   4266     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
   4267     they are encountered at the top level, so we'll do that too. */
   4268 
   4269     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   4270          ptr[1] == CHAR_EQUALS_SIGN) &&
   4271         check_posix_syntax(ptr, &tempptr))
   4272       {
   4273       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13;
   4274       goto FAILED;
   4275       }
   4276 
   4277     /* If the first character is '^', set the negation flag and skip it. Also,
   4278     if the first few characters (either before or after ^) are \Q\E or \E we
   4279     skip them too. This makes for compatibility with Perl. */
   4280 
   4281     negate_class = FALSE;
   4282     for (;;)
   4283       {
   4284       c = *(++ptr);
   4285       if (c == CHAR_BACKSLASH)
   4286         {
   4287         if (ptr[1] == CHAR_E)
   4288           ptr++;
   4289         else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
   4290           ptr += 3;
   4291         else
   4292           break;
   4293         }
   4294       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
   4295         negate_class = TRUE;
   4296       else break;
   4297       }
   4298 
   4299     /* Empty classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. Otherwise,
   4300     an initial ']' is taken as a data character -- the code below handles
   4301     that. When empty classes are allowed, [] must always fail, so generate
   4302     OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */
   4303 
   4304     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
   4305         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
   4306       {
   4307       *code++ = negate_class? OP_ALLANY : OP_FAIL;
   4308       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   4309       zerofirstcu = firstcu;
   4310       zerofirstcuflags = firstcuflags;
   4311       break;
   4312       }
   4313 
   4314     /* If a non-extended class contains a negative special such as \S, we need
   4315     to flip the negation flag at the end, so that support for characters > 255
   4316     works correctly (they are all included in the class). An extended class may
   4317     need to insert specific matching or non-matching code for wide characters.
   4318     */
   4319 
   4320     should_flip_negation = match_all_or_no_wide_chars = FALSE;
   4321 
   4322     /* Extended class (xclass) will be used when characters > 255
   4323     might match. */
   4324 
   4325 #ifdef SUPPORT_WIDE_CHARS
   4326     xclass = FALSE;
   4327     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
   4328     class_uchardata_base = class_uchardata;   /* Save the start */
   4329 #endif
   4330 
   4331     /* For optimization purposes, we track some properties of the class:
   4332     class_has_8bitchar will be non-zero if the class contains at least one
   4333     character with a code point less than 256; class_one_char will be 1 if the
   4334     class contains just one character; xclass_has_prop will be TRUE if Unicode
   4335     property checks are present in the class. */
   4336 
   4337     class_has_8bitchar = 0;
   4338     class_one_char = 0;
   4339 #ifdef SUPPORT_WIDE_CHARS
   4340     xclass_has_prop = FALSE;
   4341 #endif
   4342 
   4343     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
   4344     in a temporary bit of memory, in case the class contains fewer than two
   4345     8-bit characters because in that case the compiled code doesn't use the bit
   4346     map. */
   4347 
   4348     memset(classbits, 0, 32 * sizeof(uint8_t));
   4349 
   4350     /* Process characters until ] is reached. As the test is at the end of the
   4351     loop, an initial ] is taken as a data character. At the start of the loop,
   4352     c contains the first code unit of the character. If it is zero, check for
   4353     the end of the pattern, to allow binary zero as data. */
   4354 
   4355     for(;;)
   4356       {
   4357       PCRE2_SPTR oldptr;
   4358 #ifdef EBCDIC
   4359       BOOL range_is_literal = TRUE;
   4360 #endif
   4361 
   4362       if (c == CHAR_NULL && ptr >= cb->end_pattern)
   4363         {
   4364         *errorcodeptr = ERR6;  /* Missing terminating ']' */
   4365         goto FAILED;
   4366         }
   4367 
   4368 #ifdef SUPPORT_UNICODE
   4369       if (utf && HAS_EXTRALEN(c))
   4370         {                           /* Braces are required because the */
   4371         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
   4372         }
   4373 #endif
   4374 
   4375       /* Inside \Q...\E everything is literal except \E */
   4376 
   4377       if (inescq)
   4378         {
   4379         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
   4380           {
   4381           inescq = FALSE;                   /* Reset literal state */
   4382           ptr++;                            /* Skip the 'E' */
   4383           goto CONTINUE_CLASS;              /* Carry on with next char */
   4384           }
   4385         goto CHECK_RANGE;                   /* Could be range if \E follows */
   4386         }
   4387 
   4388       /* Handle POSIX class names. Perl allows a negation extension of the
   4389       form [:^name:]. A square bracket that doesn't match the syntax is
   4390       treated as a literal. We also recognize the POSIX constructions
   4391       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
   4392       5.6 and 5.8 do. */
   4393 
   4394       if (c == CHAR_LEFT_SQUARE_BRACKET &&
   4395           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   4396            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
   4397         {
   4398         BOOL local_negate = FALSE;
   4399         int posix_class, taboffset, tabopt;
   4400         register const uint8_t *cbits = cb->cbits;
   4401         uint8_t pbits[32];
   4402 
   4403         if (ptr[1] != CHAR_COLON)
   4404           {
   4405           *errorcodeptr = ERR13;
   4406           goto FAILED;
   4407           }
   4408 
   4409         ptr += 2;
   4410         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
   4411           {
   4412           local_negate = TRUE;
   4413           should_flip_negation = TRUE;  /* Note negative special */
   4414           ptr++;
   4415           }
   4416 
   4417         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
   4418         if (posix_class < 0)
   4419           {
   4420           *errorcodeptr = ERR30;
   4421           goto FAILED;
   4422           }
   4423 
   4424         /* If matching is caseless, upper and lower are converted to
   4425         alpha. This relies on the fact that the class table starts with
   4426         alpha, lower, upper as the first 3 entries. */
   4427 
   4428         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
   4429           posix_class = 0;
   4430 
   4431         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
   4432         different escape sequences that use Unicode properties \p or \P. Others
   4433         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
   4434         directly. UCP support is not available unless UTF support is.*/
   4435 
   4436 #ifdef SUPPORT_UNICODE
   4437         if ((options & PCRE2_UCP) != 0)
   4438           {
   4439           unsigned int ptype = 0;
   4440           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
   4441 
   4442           /* The posix_substitutes table specifies which POSIX classes can be
   4443           converted to \p or \P items. This can only happen at top nesting
   4444           level, as there will never be a POSIX class in a string that is
   4445           substituted for something else. */
   4446 
   4447           if (posix_substitutes[pc] != NULL)
   4448             {
   4449             cb->nestptr[0] = tempptr + 1;
   4450             ptr = posix_substitutes[pc] - 1;
   4451             goto CONTINUE_CLASS;
   4452             }
   4453 
   4454           /* There are three other classes that generate special property calls
   4455           that are recognized only in an XCLASS. */
   4456 
   4457           else switch(posix_class)
   4458             {
   4459             case PC_GRAPH:
   4460             ptype = PT_PXGRAPH;
   4461             /* Fall through */
   4462             case PC_PRINT:
   4463             if (ptype == 0) ptype = PT_PXPRINT;
   4464             /* Fall through */
   4465             case PC_PUNCT:
   4466             if (ptype == 0) ptype = PT_PXPUNCT;
   4467             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
   4468             *class_uchardata++ = (PCRE2_UCHAR)ptype;
   4469             *class_uchardata++ = 0;
   4470             xclass_has_prop = TRUE;
   4471             ptr = tempptr + 1;
   4472             goto CONTINUE_CLASS;
   4473 
   4474             /* For the other POSIX classes (ascii, xdigit) we are going to fall
   4475             through to the non-UCP case and build a bit map for characters with
   4476             code points less than 256. However, if we are in a negated POSIX
   4477             class, characters with code points greater than 255 must either all
   4478             match or all not match, depending on whether the whole class is not
   4479             or is negated. For example, for [[:^ascii:]... they must all match,
   4480             whereas for [^[:^xdigit:]... they must not.
   4481 
   4482             In the special case where there are no xclass items, this is
   4483             automatically handled by the use of OP_CLASS or OP_NCLASS, but an
   4484             explicit range is needed for OP_XCLASS. Setting a flag here causes
   4485             the range to be generated later when it is known that OP_XCLASS is
   4486             required. */
   4487 
   4488             default:
   4489             match_all_or_no_wide_chars |= local_negate;
   4490             break;
   4491             }
   4492           }
   4493 #endif  /* SUPPORT_UNICODE */
   4494 
   4495         /* In the non-UCP case, or when UCP makes no difference, we build the
   4496         bit map for the POSIX class in a chunk of local store because we may be
   4497         adding and subtracting from it, and we don't want to subtract bits that
   4498         may be in the main map already. At the end we or the result into the
   4499         bit map that is being built. */
   4500 
   4501         posix_class *= 3;
   4502 
   4503         /* Copy in the first table (always present) */
   4504 
   4505         memcpy(pbits, cbits + posix_class_maps[posix_class],
   4506           32 * sizeof(uint8_t));
   4507 
   4508         /* If there is a second table, add or remove it as required. */
   4509 
   4510         taboffset = posix_class_maps[posix_class + 1];
   4511         tabopt = posix_class_maps[posix_class + 2];
   4512 
   4513         if (taboffset >= 0)
   4514           {
   4515           if (tabopt >= 0)
   4516             for (c = 0; c < 32; c++) pbits[c] |= cbits[(int)c + taboffset];
   4517           else
   4518             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[(int)c + taboffset];
   4519           }
   4520 
   4521         /* Now see if we need to remove any special characters. An option
   4522         value of 1 removes vertical space and 2 removes underscore. */
   4523 
   4524         if (tabopt < 0) tabopt = -tabopt;
   4525         if (tabopt == 1) pbits[1] &= ~0x3c;
   4526           else if (tabopt == 2) pbits[11] &= 0x7f;
   4527 
   4528         /* Add the POSIX table or its complement into the main table that is
   4529         being built and we are done. */
   4530 
   4531         if (local_negate)
   4532           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
   4533         else
   4534           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
   4535 
   4536         ptr = tempptr + 1;
   4537         /* Every class contains at least one < 256 character. */
   4538         class_has_8bitchar = 1;
   4539         /* Every class contains at least two characters. */
   4540         class_one_char = 2;
   4541         goto CONTINUE_CLASS;    /* End of POSIX syntax handling */
   4542         }
   4543 
   4544       /* Backslash may introduce a single character, or it may introduce one
   4545       of the specials, which just set a flag. The sequence \b is a special
   4546       case. Inside a class (and only there) it is treated as backspace. We
   4547       assume that other escapes have more than one character in them, so
   4548       speculatively set both class_has_8bitchar and class_one_char bigger
   4549       than one. Unrecognized escapes fall through and are faulted. */
   4550 
   4551       if (c == CHAR_BACKSLASH)
   4552         {
   4553         escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
   4554           options, TRUE, cb);
   4555         if (*errorcodeptr != 0) goto FAILED;
   4556         if (escape == 0)    /* Escaped single char */
   4557           {
   4558           c = ec;
   4559 #ifdef EBCDIC
   4560           range_is_literal = FALSE;
   4561 #endif
   4562           }
   4563         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
   4564         else if (escape == ESC_N)          /* \N is not supported in a class */
   4565           {
   4566           *errorcodeptr = ERR71;
   4567           goto FAILED;
   4568           }
   4569         else if (escape == ESC_Q)            /* Handle start of quoted string */
   4570           {
   4571           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   4572             {
   4573             ptr += 2; /* avoid empty string */
   4574             }
   4575           else inescq = TRUE;
   4576           goto CONTINUE_CLASS;
   4577           }
   4578         else if (escape == ESC_E) goto CONTINUE_CLASS;  /* Ignore orphan \E */
   4579 
   4580         else  /* Handle \d-type escapes */
   4581           {
   4582           register const uint8_t *cbits = cb->cbits;
   4583           /* Every class contains at least two < 256 characters. */
   4584           class_has_8bitchar++;
   4585           /* Every class contains at least two characters. */
   4586           class_one_char += 2;
   4587 
   4588           switch (escape)
   4589             {
   4590 #ifdef SUPPORT_UNICODE
   4591             case ESC_du:     /* These are the values given for \d etc */
   4592             case ESC_DU:     /* when PCRE2_UCP is set. We replace the */
   4593             case ESC_wu:     /* escape sequence with an appropriate \p */
   4594             case ESC_WU:     /* or \P to test Unicode properties instead */
   4595             case ESC_su:     /* of the default ASCII testing. This might be */
   4596             case ESC_SU:     /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
   4597             cb->nestptr[1] = cb->nestptr[0];
   4598             cb->nestptr[0] = ptr;
   4599             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
   4600             class_has_8bitchar--;                /* Undo! */
   4601             break;
   4602 #endif
   4603             case ESC_d:
   4604             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
   4605             break;
   4606 
   4607             case ESC_D:
   4608             should_flip_negation = TRUE;
   4609             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
   4610             break;
   4611 
   4612             case ESC_w:
   4613             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
   4614             break;
   4615 
   4616             case ESC_W:
   4617             should_flip_negation = TRUE;
   4618             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
   4619             break;
   4620 
   4621             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
   4622             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
   4623             previously set by something earlier in the character class.
   4624             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
   4625             we could just adjust the appropriate bit. From PCRE 8.34 we no
   4626             longer treat \s and \S specially. */
   4627 
   4628             case ESC_s:
   4629             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
   4630             break;
   4631 
   4632             case ESC_S:
   4633             should_flip_negation = TRUE;
   4634             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
   4635             break;
   4636 
   4637             /* The rest apply in both UCP and non-UCP cases. */
   4638 
   4639             case ESC_h:
   4640             (void)add_list_to_class(classbits, &class_uchardata, options, cb,
   4641               PRIV(hspace_list), NOTACHAR);
   4642             break;
   4643 
   4644             case ESC_H:
   4645             (void)add_not_list_to_class(classbits, &class_uchardata, options,
   4646               cb, PRIV(hspace_list));
   4647             break;
   4648 
   4649             case ESC_v:
   4650             (void)add_list_to_class(classbits, &class_uchardata, options, cb,
   4651               PRIV(vspace_list), NOTACHAR);
   4652             break;
   4653 
   4654             case ESC_V:
   4655             (void)add_not_list_to_class(classbits, &class_uchardata, options,
   4656               cb, PRIV(vspace_list));
   4657             break;
   4658 
   4659             case ESC_p:
   4660             case ESC_P:
   4661 #ifdef SUPPORT_UNICODE
   4662               {
   4663               BOOL negated;
   4664               unsigned int ptype = 0, pdata = 0;
   4665               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
   4666                 goto FAILED;
   4667               *class_uchardata++ = ((escape == ESC_p) != negated)?
   4668                 XCL_PROP : XCL_NOTPROP;
   4669               *class_uchardata++ = ptype;
   4670               *class_uchardata++ = pdata;
   4671               xclass_has_prop = TRUE;
   4672               class_has_8bitchar--;                /* Undo! */
   4673               }
   4674             break;
   4675 #else
   4676             *errorcodeptr = ERR45;
   4677             goto FAILED;
   4678 #endif
   4679             /* Unrecognized escapes are faulted. */
   4680 
   4681             default:
   4682             *errorcodeptr = ERR7;
   4683             goto FAILED;
   4684             }
   4685 
   4686           /* Handled \d-type escape */
   4687 
   4688           goto CONTINUE_CLASS;
   4689           }
   4690 
   4691         /* Control gets here if the escape just defined a single character.
   4692         This is in c and may be greater than 256. */
   4693 
   4694         escape = 0;
   4695         }   /* End of backslash handling */
   4696 
   4697       /* A character may be followed by '-' to form a range. However, Perl does
   4698       not permit ']' to be the end of the range. A '-' character at the end is
   4699       treated as a literal. Perl ignores orphaned \E sequences entirely. The
   4700       code for handling \Q and \E is messy. */
   4701 
   4702       CHECK_RANGE:
   4703       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   4704         {
   4705         inescq = FALSE;
   4706         ptr += 2;
   4707         }
   4708       oldptr = ptr;
   4709 
   4710       /* Remember if \r or \n were explicitly used */
   4711 
   4712       if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
   4713 
   4714       /* Check for range */
   4715 
   4716       if (!inescq && ptr[1] == CHAR_MINUS)
   4717         {
   4718         uint32_t d;
   4719         ptr += 2;
   4720         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
   4721 
   4722         /* If we hit \Q (not followed by \E) at this point, go into escaped
   4723         mode. */
   4724 
   4725         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
   4726           {
   4727           ptr += 2;
   4728           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   4729             { ptr += 2; continue; }
   4730           inescq = TRUE;
   4731           break;
   4732           }
   4733 
   4734         /* Minus (hyphen) at the end of a class is treated as a literal, so put
   4735         back the pointer and jump to handle the character that preceded it. */
   4736 
   4737         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
   4738           {
   4739           ptr = oldptr;
   4740           goto CLASS_SINGLE_CHARACTER;
   4741           }
   4742 
   4743         /* Otherwise, we have a potential range; pick up the next character */
   4744 
   4745 #ifdef SUPPORT_UNICODE
   4746         if (utf)
   4747           {                           /* Braces are required because the */
   4748           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
   4749           }
   4750         else
   4751 #endif
   4752         d = *ptr;  /* Not UTF mode */
   4753 
   4754         /* The second part of a range can be a single-character escape
   4755         sequence, but not any of the other escapes. Perl treats a hyphen as a
   4756         literal in such circumstances. However, in Perl's warning mode, a
   4757         warning is given, so PCRE now faults it as it is almost certainly a
   4758         mistake on the user's part. */
   4759 
   4760         if (!inescq)
   4761           {
   4762           if (d == CHAR_BACKSLASH)
   4763             {
   4764             int descape;
   4765             descape = PRIV(check_escape)(&ptr, cb->end_pattern, &d,
   4766               errorcodeptr, options, TRUE, cb);
   4767             if (*errorcodeptr != 0) goto FAILED;
   4768 #ifdef EBCDIC
   4769             range_is_literal = FALSE;
   4770 #endif
   4771             /* 0 means a character was put into d; \b is backspace; any other
   4772             special causes an error. */
   4773 
   4774             if (descape != 0)
   4775               {
   4776               if (descape == ESC_b) d = CHAR_BS; else
   4777                 {
   4778                 *errorcodeptr = ERR50;
   4779                 goto FAILED;
   4780                 }
   4781               }
   4782             }
   4783 
   4784           /* A hyphen followed by a POSIX class is treated in the same way. */
   4785 
   4786           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
   4787                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   4788                     ptr[1] == CHAR_EQUALS_SIGN) &&
   4789                    check_posix_syntax(ptr, &tempptr))
   4790             {
   4791             *errorcodeptr = ERR50;
   4792             goto FAILED;
   4793             }
   4794           }
   4795 
   4796         /* Check that the two values are in the correct order. Optimize
   4797         one-character ranges. */
   4798 
   4799         if (d < c)
   4800           {
   4801           *errorcodeptr = ERR8;
   4802           goto FAILED;
   4803           }
   4804         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
   4805 
   4806         /* We have found a character range, so single character optimizations
   4807         cannot be done anymore. Any value greater than 1 indicates that there
   4808         is more than one character. */
   4809 
   4810         class_one_char = 2;
   4811 
   4812         /* Remember an explicit \r or \n, and add the range to the class. */
   4813 
   4814         if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
   4815 
   4816         /* In an EBCDIC environment, Perl treats alphabetic ranges specially
   4817         because there are holes in the encoding, and simply using the range A-Z
   4818         (for example) would include the characters in the holes. This applies
   4819         only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
   4820 
   4821 #ifdef EBCDIC
   4822         if (range_is_literal &&
   4823              (cb->ctypes[c] & ctype_letter) != 0 &&
   4824              (cb->ctypes[d] & ctype_letter) != 0 &&
   4825              (c <= CHAR_z) == (d <= CHAR_z))
   4826           {
   4827           uint32_t uc = (c <= CHAR_z)? 0 : 64;
   4828           uint32_t C = c - uc;
   4829           uint32_t D = d - uc;
   4830 
   4831           if (C <= CHAR_i)
   4832             {
   4833             class_has_8bitchar +=
   4834               add_to_class(classbits, &class_uchardata, options, cb, C + uc,
   4835                 ((D < CHAR_i)? D : CHAR_i) + uc);
   4836             C = CHAR_j;
   4837             }
   4838 
   4839           if (C <= D && C <= CHAR_r)
   4840             {
   4841             class_has_8bitchar +=
   4842               add_to_class(classbits, &class_uchardata, options, cb, C + uc,
   4843                 ((D < CHAR_r)? D : CHAR_r) + uc);
   4844             C = CHAR_s;
   4845             }
   4846 
   4847           if (C <= D)
   4848             {
   4849             class_has_8bitchar +=
   4850               add_to_class(classbits, &class_uchardata, options, cb, C + uc,
   4851                 D + uc);
   4852             }
   4853           }
   4854         else
   4855 #endif
   4856         class_has_8bitchar +=
   4857           add_to_class(classbits, &class_uchardata, options, cb, c, d);
   4858         goto CONTINUE_CLASS;   /* Go get the next char in the class */
   4859         }
   4860 
   4861       /* Handle a single character - we can get here for a normal non-escape
   4862       char, or after \ that introduces a single character or for an apparent
   4863       range that isn't. Only the value 1 matters for class_one_char, so don't
   4864       increase it if it is already 2 or more ... just in case there's a class
   4865       with a zillion characters in it. */
   4866 
   4867       CLASS_SINGLE_CHARACTER:
   4868       if (class_one_char < 2) class_one_char++;
   4869 
   4870       /* If class_one_char is 1 and xclass_has_prop is false, we have the first
   4871       single character in the class, and there have been no prior ranges, or
   4872       XCLASS items generated by escapes. If this is the final character in the
   4873       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
   4874       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
   4875       can cause firstcu to be set. Otherwise, there can be no first char if
   4876       this item is first, whatever repeat count may follow. In the case of
   4877       reqcu, save the previous value for reinstating. */
   4878 
   4879       if (!inescq &&
   4880 #ifdef SUPPORT_UNICODE
   4881           !xclass_has_prop &&
   4882 #endif
   4883           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   4884         {
   4885         ptr++;
   4886         zeroreqcu = reqcu;
   4887         zeroreqcuflags = reqcuflags;
   4888 
   4889         if (negate_class)
   4890           {
   4891 #ifdef SUPPORT_UNICODE
   4892           int d;
   4893 #endif
   4894           if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   4895           zerofirstcu = firstcu;
   4896           zerofirstcuflags = firstcuflags;
   4897 
   4898           /* For caseless UTF mode, check whether this character has more than
   4899           one other case. If so, generate a special OP_NOTPROP item instead of
   4900           OP_NOTI. */
   4901 
   4902 #ifdef SUPPORT_UNICODE
   4903           if (utf && (options & PCRE2_CASELESS) != 0 &&
   4904               (d = UCD_CASESET(c)) != 0)
   4905             {
   4906             *code++ = OP_NOTPROP;
   4907             *code++ = PT_CLIST;
   4908             *code++ = d;
   4909             }
   4910           else
   4911 #endif
   4912           /* Char has only one other case, or UCP not available */
   4913 
   4914             {
   4915             *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
   4916             code += PUTCHAR(c, code);
   4917             }
   4918 
   4919           /* We are finished with this character class */
   4920 
   4921           goto END_CLASS;
   4922           }
   4923 
   4924         /* For a single, positive character, get the value into mcbuffer, and
   4925         then we can handle this with the normal one-character code. */
   4926 
   4927         mclength = PUTCHAR(c, mcbuffer);
   4928         goto ONE_CHAR;
   4929         }       /* End of 1-char optimization */
   4930 
   4931       /* There is more than one character in the class, or an XCLASS item
   4932       has been generated. Add this character to the class. */
   4933 
   4934       class_has_8bitchar +=
   4935         add_to_class(classbits, &class_uchardata, options, cb, c, c);
   4936 
   4937       /* Continue to the next character in the class. Closing square bracket
   4938       not within \Q..\E ends the class. A NULL character terminates a
   4939       nested substitution string, but may be a data character in the main
   4940       pattern (tested at the start of this loop). */
   4941 
   4942       CONTINUE_CLASS:
   4943       c = *(++ptr);
   4944       if (c == CHAR_NULL && cb->nestptr[0] != NULL)
   4945         {
   4946         ptr = cb->nestptr[0];
   4947         cb->nestptr[0] = cb->nestptr[1];
   4948         cb->nestptr[1] = NULL;
   4949         c = *(++ptr);
   4950         }
   4951 
   4952 #ifdef SUPPORT_WIDE_CHARS
   4953       /* If any wide characters have been encountered, set xclass = TRUE. Then,
   4954       in the pre-compile phase, accumulate the length of the wide characters
   4955       and reset the pointer. This is so that very large classes that contain a
   4956       zillion wide characters do not overwrite the work space (which is on the
   4957       stack). */
   4958 
   4959       if (class_uchardata > class_uchardata_base)
   4960         {
   4961         xclass = TRUE;
   4962         if (lengthptr != NULL)
   4963           {
   4964           *lengthptr += class_uchardata - class_uchardata_base;
   4965           class_uchardata = class_uchardata_base;
   4966           }
   4967         }
   4968 #endif
   4969       /* An unescaped ] ends the class */
   4970 
   4971       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
   4972       }   /* End of main class-processing loop */
   4973 
   4974     /* If this is the first thing in the branch, there can be no first char
   4975     setting, whatever the repeat count. Any reqcu setting must remain
   4976     unchanged after any kind of repeat. */
   4977 
   4978     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   4979     zerofirstcu = firstcu;
   4980     zerofirstcuflags = firstcuflags;
   4981     zeroreqcu = reqcu;
   4982     zeroreqcuflags = reqcuflags;
   4983 
   4984     /* If there are characters with values > 255, or Unicode property settings
   4985     (\p or \P), we have to compile an extended class, with its own opcode,
   4986     unless there were no property settings and there was a negated special such
   4987     as \S in the class, and PCRE2_UCP is not set, because in that case all
   4988     characters > 255 are in or not in the class, so any that were explicitly
   4989     given as well can be ignored.
   4990 
   4991     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
   4992     [:^xdigit:]) were present in a class, we either have to match or not match
   4993     all wide characters (depending on whether the whole class is or is not
   4994     negated). This requirement is indicated by match_all_or_no_wide_chars being
   4995     true. We do this by including an explicit range, which works in both cases.
   4996 
   4997     If, when generating an xclass, there are no characters < 256, we can omit
   4998     the bitmap in the actual compiled code. */
   4999 
   5000 #ifdef SUPPORT_WIDE_CHARS
   5001 #ifdef SUPPORT_UNICODE
   5002     if (xclass && (xclass_has_prop || !should_flip_negation ||
   5003          (options & PCRE2_UCP) != 0))
   5004 #elif PCRE2_CODE_UNIT_WIDTH != 8
   5005     if (xclass && (xclass_has_prop || !should_flip_negation))
   5006 #endif
   5007       {
   5008       if (match_all_or_no_wide_chars)
   5009         {
   5010         *class_uchardata++ = XCL_RANGE;
   5011         class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
   5012         class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
   5013         }
   5014       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
   5015       *code++ = OP_XCLASS;
   5016       code += LINK_SIZE;
   5017       *code = negate_class? XCL_NOT:0;
   5018       if (xclass_has_prop) *code |= XCL_HASPROP;
   5019 
   5020       /* If the map is required, move up the extra data to make room for it;
   5021       otherwise just move the code pointer to the end of the extra data. */
   5022 
   5023       if (class_has_8bitchar > 0)
   5024         {
   5025         *code++ |= XCL_MAP;
   5026         memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
   5027           CU2BYTES(class_uchardata - code));
   5028         if (negate_class && !xclass_has_prop)
   5029           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
   5030         memcpy(code, classbits, 32);
   5031         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
   5032         }
   5033       else code = class_uchardata;
   5034 
   5035       /* Now fill in the complete length of the item */
   5036 
   5037       PUT(previous, 1, (int)(code - previous));
   5038       break;   /* End of class handling */
   5039       }
   5040 #endif
   5041 
   5042     /* If there are no characters > 255, or they are all to be included or
   5043     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
   5044     whole class was negated and whether there were negative specials such as \S
   5045     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
   5046     negating it if necessary. */
   5047 
   5048     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
   5049     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
   5050       {
   5051       if (negate_class)
   5052         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
   5053       memcpy(code, classbits, 32);
   5054       }
   5055     code += 32 / sizeof(PCRE2_UCHAR);
   5056 
   5057     END_CLASS:
   5058     break;
   5059 
   5060 
   5061     /* ===================================================================*/
   5062     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
   5063     has been tested above. */
   5064 
   5065     case CHAR_LEFT_CURLY_BRACKET:
   5066     if (!is_quantifier) goto NORMAL_CHAR;
   5067     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
   5068     if (*errorcodeptr != 0) goto FAILED;
   5069     goto REPEAT;
   5070 
   5071     case CHAR_ASTERISK:
   5072     repeat_min = 0;
   5073     repeat_max = -1;
   5074     goto REPEAT;
   5075 
   5076     case CHAR_PLUS:
   5077     repeat_min = 1;
   5078     repeat_max = -1;
   5079     goto REPEAT;
   5080 
   5081     case CHAR_QUESTION_MARK:
   5082     repeat_min = 0;
   5083     repeat_max = 1;
   5084 
   5085     REPEAT:
   5086     if (previous == NULL)
   5087       {
   5088       *errorcodeptr = ERR9;
   5089       goto FAILED;
   5090       }
   5091 
   5092     if (repeat_min == 0)
   5093       {
   5094       firstcu = zerofirstcu;    /* Adjust for zero repeat */
   5095       firstcuflags = zerofirstcuflags;
   5096       reqcu = zeroreqcu;        /* Ditto */
   5097       reqcuflags = zeroreqcuflags;
   5098       }
   5099 
   5100     /* Remember whether this is a variable length repeat */
   5101 
   5102     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
   5103 
   5104     op_type = 0;                    /* Default single-char op codes */
   5105     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
   5106 
   5107     /* Save start of previous item, in case we have to move it up in order to
   5108     insert something before it. */
   5109 
   5110     tempcode = previous;
   5111 
   5112     /* Before checking for a possessive quantifier, we must skip over
   5113     whitespace and comments in extended mode because Perl allows white space at
   5114     this point. */
   5115 
   5116     if ((options & PCRE2_EXTENDED) != 0)
   5117       {
   5118       ptr++;
   5119       for (;;)
   5120         {
   5121         while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_space) != 0) ptr++;
   5122         if (*ptr != CHAR_NUMBER_SIGN) break;
   5123         ptr++;
   5124         while (ptr < cb->end_pattern)
   5125           {
   5126           if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
   5127             {                        /* IS_NEWLINE sets cb->nllen. */
   5128             ptr += cb->nllen;
   5129             break;
   5130             }
   5131           ptr++;
   5132 #ifdef SUPPORT_UNICODE
   5133           if (utf) FORWARDCHAR(ptr);
   5134 #endif
   5135           }           /* Loop for comment characters */
   5136         }             /* Loop for multiple comments */
   5137       ptr--;          /* Last code unit of previous character. */
   5138       }
   5139 
   5140     /* If the next character is '+', we have a possessive quantifier. This
   5141     implies greediness, whatever the setting of the PCRE2_UNGREEDY option.
   5142     If the next character is '?' this is a minimizing repeat, by default,
   5143     but if PCRE2_UNGREEDY is set, it works the other way round. We change the
   5144     repeat type to the non-default. */
   5145 
   5146     if (ptr[1] == CHAR_PLUS)
   5147       {
   5148       repeat_type = 0;                  /* Force greedy */
   5149       possessive_quantifier = TRUE;
   5150       ptr++;
   5151       }
   5152     else if (ptr[1] == CHAR_QUESTION_MARK)
   5153       {
   5154       repeat_type = greedy_non_default;
   5155       ptr++;
   5156       }
   5157     else repeat_type = greedy_default;
   5158 
   5159     /* If the repeat is {1} we can ignore it. */
   5160 
   5161     if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
   5162 
   5163     /* If previous was a recursion call, wrap it in atomic brackets so that
   5164     previous becomes the atomic group. All recursions were so wrapped in the
   5165     past, but it no longer happens for non-repeated recursions. In fact, the
   5166     repeated ones could be re-implemented independently so as not to need this,
   5167     but for the moment we rely on the code for repeating groups. */
   5168 
   5169     if (*previous == OP_RECURSE)
   5170       {
   5171       memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
   5172       *previous = OP_ONCE;
   5173       PUT(previous, 1, 2 + 2*LINK_SIZE);
   5174       previous[2 + 2*LINK_SIZE] = OP_KET;
   5175       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
   5176       code += 2 + 2 * LINK_SIZE;
   5177       length_prevgroup = 3 + 3*LINK_SIZE;
   5178       }
   5179 
   5180     /* Now handle repetition for the different types of item. */
   5181 
   5182     /* If previous was a character or negated character match, abolish the item
   5183     and generate a repeat item instead. If a char item has a minimum of more
   5184     than one, ensure that it is set in reqcu - it might not be if a sequence
   5185     such as x{3} is the first thing in a branch because the x will have gone
   5186     into firstcu instead.  */
   5187 
   5188     if (*previous == OP_CHAR || *previous == OP_CHARI
   5189         || *previous == OP_NOT || *previous == OP_NOTI)
   5190       {
   5191       switch (*previous)
   5192         {
   5193         default: /* Make compiler happy. */
   5194         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
   5195         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
   5196         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
   5197         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
   5198         }
   5199 
   5200       /* Deal with UTF characters that take up more than one code unit. It's
   5201       easier to write this out separately than try to macrify it. Use c to
   5202       hold the length of the character in code units, plus UTF_LENGTH to flag
   5203       that it's a length rather than a small character. */
   5204 
   5205 #ifdef MAYBE_UTF_MULTI
   5206       if (utf && NOT_FIRSTCU(code[-1]))
   5207         {
   5208         PCRE2_UCHAR *lastchar = code - 1;
   5209         BACKCHAR(lastchar);
   5210         c = (int)(code - lastchar);               /* Length of UTF character */
   5211         memcpy(utf_units, lastchar, CU2BYTES(c)); /* Save the char */
   5212         c |= UTF_LENGTH;                          /* Flag c as a length */
   5213         }
   5214       else
   5215 #endif  /* MAYBE_UTF_MULTI */
   5216 
   5217       /* Handle the case of a single character - either with no UTF support, or
   5218       with UTF disabled, or for a single-code-unit UTF character. */
   5219         {
   5220         c = code[-1];
   5221         if (*previous <= OP_CHARI && repeat_min > 1)
   5222           {
   5223           reqcu = c;
   5224           reqcuflags = req_caseopt | cb->req_varyopt;
   5225           }
   5226         }
   5227 
   5228       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
   5229       }
   5230 
   5231     /* If previous was a character type match (\d or similar), abolish it and
   5232     create a suitable repeat item. The code is shared with single-character
   5233     repeats by setting op_type to add a suitable offset into repeat_type. Note
   5234     that the Unicode property types will be present only when SUPPORT_UNICODE is
   5235     defined, but we don't wrap the little bits of code here because it just
   5236     makes it horribly messy. */
   5237 
   5238     else if (*previous < OP_EODN)
   5239       {
   5240       PCRE2_UCHAR *oldcode;
   5241       int prop_type, prop_value;
   5242       op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
   5243       c = *previous;                        /* Save previous opcode */
   5244       if (c == OP_PROP || c == OP_NOTPROP)
   5245         {
   5246         prop_type = previous[1];
   5247         prop_value = previous[2];
   5248         }
   5249       else
   5250         {
   5251         /* Come here from just above with a character in c */
   5252         OUTPUT_SINGLE_REPEAT:
   5253         prop_type = prop_value = -1;
   5254         }
   5255 
   5256       /* At this point we either have prop_type == prop_value == -1 and either
   5257       a code point or a character type that is not OP_[NOT]PROP in c, or we
   5258       have OP_[NOT]PROP in c and prop_type/prop_value not negative. */
   5259 
   5260       oldcode = code;                   /* Save where we were */
   5261       code = previous;                  /* Usually overwrite previous item */
   5262 
   5263       /* If the maximum is zero then the minimum must also be zero; Perl allows
   5264       this case, so we do too - by simply omitting the item altogether. */
   5265 
   5266       if (repeat_max == 0) goto END_REPEAT;
   5267 
   5268       /* Combine the op_type with the repeat_type */
   5269 
   5270       repeat_type += op_type;
   5271 
   5272       /* A minimum of zero is handled either as the special case * or ?, or as
   5273       an UPTO, with the maximum given. */
   5274 
   5275       if (repeat_min == 0)
   5276         {
   5277         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
   5278           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
   5279         else
   5280           {
   5281           *code++ = OP_UPTO + repeat_type;
   5282           PUT2INC(code, 0, repeat_max);
   5283           }
   5284         }
   5285 
   5286       /* A repeat minimum of 1 is optimized into some special cases. If the
   5287       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
   5288       left in place and, if the maximum is greater than 1, we use OP_UPTO with
   5289       one less than the maximum. */
   5290 
   5291       else if (repeat_min == 1)
   5292         {
   5293         if (repeat_max == -1)
   5294           *code++ = OP_PLUS + repeat_type;
   5295         else
   5296           {
   5297           code = oldcode;                 /* Leave previous item in place */
   5298           if (repeat_max == 1) goto END_REPEAT;
   5299           *code++ = OP_UPTO + repeat_type;
   5300           PUT2INC(code, 0, repeat_max - 1);
   5301           }
   5302         }
   5303 
   5304       /* The case {n,n} is just an EXACT, while the general case {n,m} is
   5305       handled as an EXACT followed by an UPTO or STAR or QUERY. */
   5306 
   5307       else
   5308         {
   5309         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
   5310         PUT2INC(code, 0, repeat_min);
   5311 
   5312         /* Unless repeat_max equals repeat_min, fill in the data for EXACT, and
   5313         then generate the second opcode. In UTF mode, multi-code-unit
   5314         characters have their length in c, with the UTF_LENGTH bit as a flag,
   5315         and the code units in utf_units. For a repeated Unicode property match,
   5316         there are two extra values that define the required property, and c
   5317         never has the UTF_LENGTH bit set. */
   5318 
   5319         if (repeat_max != repeat_min)
   5320           {
   5321 #ifdef MAYBE_UTF_MULTI
   5322           if (utf && (c & UTF_LENGTH) != 0)
   5323             {
   5324             memcpy(code, utf_units, CU2BYTES(c & 7));
   5325             code += c & 7;
   5326             }
   5327           else
   5328 #endif  /* MAYBE_UTF_MULTI */
   5329             {
   5330             *code++ = c;
   5331             if (prop_type >= 0)
   5332               {
   5333               *code++ = prop_type;
   5334               *code++ = prop_value;
   5335               }
   5336             }
   5337 
   5338           /* Now set up the following opcode */
   5339 
   5340           if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else
   5341             {
   5342             repeat_max -= repeat_min;
   5343             if (repeat_max == 1)
   5344               {
   5345               *code++ = OP_QUERY + repeat_type;
   5346               }
   5347             else
   5348               {
   5349               *code++ = OP_UPTO + repeat_type;
   5350               PUT2INC(code, 0, repeat_max);
   5351               }
   5352             }
   5353           }
   5354         }
   5355 
   5356       /* Fill in the character or character type for the final opcode. */
   5357 
   5358 #ifdef MAYBE_UTF_MULTI
   5359       if (utf && (c & UTF_LENGTH) != 0)
   5360         {
   5361         memcpy(code, utf_units, CU2BYTES(c & 7));
   5362         code += c & 7;
   5363         }
   5364       else
   5365 #endif  /* MAYBE_UTF_MULTI */
   5366         {
   5367         *code++ = c;
   5368         if (prop_type >= 0)
   5369           {
   5370           *code++ = prop_type;
   5371           *code++ = prop_value;
   5372           }
   5373         }
   5374       }
   5375 
   5376     /* If previous was a character class or a back reference, we put the repeat
   5377     stuff after it, but just skip the item if the repeat was {0,0}. */
   5378 
   5379     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
   5380 #ifdef SUPPORT_WIDE_CHARS
   5381              *previous == OP_XCLASS ||
   5382 #endif
   5383              *previous == OP_REF   || *previous == OP_REFI ||
   5384              *previous == OP_DNREF || *previous == OP_DNREFI)
   5385       {
   5386       if (repeat_max == 0)
   5387         {
   5388         code = previous;
   5389         goto END_REPEAT;
   5390         }
   5391 
   5392       if (repeat_min == 0 && repeat_max == -1)
   5393         *code++ = OP_CRSTAR + repeat_type;
   5394       else if (repeat_min == 1 && repeat_max == -1)
   5395         *code++ = OP_CRPLUS + repeat_type;
   5396       else if (repeat_min == 0 && repeat_max == 1)
   5397         *code++ = OP_CRQUERY + repeat_type;
   5398       else
   5399         {
   5400         *code++ = OP_CRRANGE + repeat_type;
   5401         PUT2INC(code, 0, repeat_min);
   5402         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
   5403         PUT2INC(code, 0, repeat_max);
   5404         }
   5405       }
   5406 
   5407     /* If previous was a bracket group, we may have to replicate it in certain
   5408     cases. Note that at this point we can encounter only the "basic" bracket
   5409     opcodes such as BRA and CBRA, as this is the place where they get converted
   5410     into the more special varieties such as BRAPOS and SBRA. A test for >=
   5411     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
   5412     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
   5413     Originally, PCRE did not allow repetition of assertions, but now it does,
   5414     for Perl compatibility. */
   5415 
   5416     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
   5417       {
   5418       register int i;
   5419       int len = (int)(code - previous);
   5420       PCRE2_UCHAR *bralink = NULL;
   5421       PCRE2_UCHAR *brazeroptr = NULL;
   5422 
   5423       /* Repeating a DEFINE group (or any group where the condition is always
   5424       FALSE and there is only one branch) is pointless, but Perl allows the
   5425       syntax, so we just ignore the repeat. */
   5426 
   5427       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
   5428           previous[GET(previous, 1)] != OP_ALT)
   5429         goto END_REPEAT;
   5430 
   5431       /* There is no sense in actually repeating assertions. The only potential
   5432       use of repetition is in cases when the assertion is optional. Therefore,
   5433       if the minimum is greater than zero, just ignore the repeat. If the
   5434       maximum is not zero or one, set it to 1. */
   5435 
   5436       if (*previous < OP_ONCE)    /* Assertion */
   5437         {
   5438         if (repeat_min > 0) goto END_REPEAT;
   5439         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
   5440         }
   5441 
   5442       /* The case of a zero minimum is special because of the need to stick
   5443       OP_BRAZERO in front of it, and because the group appears once in the
   5444       data, whereas in other cases it appears the minimum number of times. For
   5445       this reason, it is simplest to treat this case separately, as otherwise
   5446       the code gets far too messy. There are several special subcases when the
   5447       minimum is zero. */
   5448 
   5449       if (repeat_min == 0)
   5450         {
   5451         /* If the maximum is also zero, we used to just omit the group from the
   5452         output altogether, like this:
   5453 
   5454         ** if (repeat_max == 0)
   5455         **   {
   5456         **   code = previous;
   5457         **   goto END_REPEAT;
   5458         **   }
   5459 
   5460         However, that fails when a group or a subgroup within it is referenced
   5461         as a subroutine from elsewhere in the pattern, so now we stick in
   5462         OP_SKIPZERO in front of it so that it is skipped on execution. As we
   5463         don't have a list of which groups are referenced, we cannot do this
   5464         selectively.
   5465 
   5466         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
   5467         and do no more at this point. */
   5468 
   5469         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
   5470           {
   5471           memmove(previous + 1, previous, CU2BYTES(len));
   5472           code++;
   5473           if (repeat_max == 0)
   5474             {
   5475             *previous++ = OP_SKIPZERO;
   5476             goto END_REPEAT;
   5477             }
   5478           brazeroptr = previous;    /* Save for possessive optimizing */
   5479           *previous++ = OP_BRAZERO + repeat_type;
   5480           }
   5481 
   5482         /* If the maximum is greater than 1 and limited, we have to replicate
   5483         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
   5484         The first one has to be handled carefully because it's the original
   5485         copy, which has to be moved up. The remainder can be handled by code
   5486         that is common with the non-zero minimum case below. We have to
   5487         adjust the value of repeat_max, since one less copy is required. */
   5488 
   5489         else
   5490           {
   5491           int offset;
   5492           memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
   5493           code += 2 + LINK_SIZE;
   5494           *previous++ = OP_BRAZERO + repeat_type;
   5495           *previous++ = OP_BRA;
   5496 
   5497           /* We chain together the bracket offset fields that have to be
   5498           filled in later when the ends of the brackets are reached. */
   5499 
   5500           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
   5501           bralink = previous;
   5502           PUTINC(previous, 0, offset);
   5503           }
   5504 
   5505         repeat_max--;
   5506         }
   5507 
   5508       /* If the minimum is greater than zero, replicate the group as many
   5509       times as necessary, and adjust the maximum to the number of subsequent
   5510       copies that we need. */
   5511 
   5512       else
   5513         {
   5514         if (repeat_min > 1)
   5515           {
   5516           /* In the pre-compile phase, we don't actually do the replication. We
   5517           just adjust the length as if we had. Do some paranoid checks for
   5518           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
   5519           integer type when available, otherwise double. */
   5520 
   5521           if (lengthptr != NULL)
   5522             {
   5523             size_t delta = (repeat_min - 1)*length_prevgroup;
   5524             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
   5525                   (INT64_OR_DOUBLE)length_prevgroup >
   5526                     (INT64_OR_DOUBLE)INT_MAX ||
   5527                 OFLOW_MAX - *lengthptr < delta)
   5528               {
   5529               *errorcodeptr = ERR20;
   5530               goto FAILED;
   5531               }
   5532             *lengthptr += delta;
   5533             }
   5534 
   5535           /* This is compiling for real. If there is a set first byte for
   5536           the group, and we have not yet set a "required byte", set it. */
   5537 
   5538           else
   5539             {
   5540             if (groupsetfirstcu && reqcuflags < 0)
   5541               {
   5542               reqcu = firstcu;
   5543               reqcuflags = firstcuflags;
   5544               }
   5545             for (i = 1; i < repeat_min; i++)
   5546               {
   5547               memcpy(code, previous, CU2BYTES(len));
   5548               code += len;
   5549               }
   5550             }
   5551           }
   5552 
   5553         if (repeat_max > 0) repeat_max -= repeat_min;
   5554         }
   5555 
   5556       /* This code is common to both the zero and non-zero minimum cases. If
   5557       the maximum is limited, it replicates the group in a nested fashion,
   5558       remembering the bracket starts on a stack. In the case of a zero minimum,
   5559       the first one was set up above. In all cases the repeat_max now specifies
   5560       the number of additional copies needed. Again, we must remember to
   5561       replicate entries on the forward reference list. */
   5562 
   5563       if (repeat_max >= 0)
   5564         {
   5565         /* In the pre-compile phase, we don't actually do the replication. We
   5566         just adjust the length as if we had. For each repetition we must add 1
   5567         to the length for BRAZERO and for all but the last repetition we must
   5568         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
   5569         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
   5570         a 64-bit integer type when available, otherwise double. */
   5571 
   5572         if (lengthptr != NULL && repeat_max > 0)
   5573           {
   5574           size_t delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
   5575                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
   5576           if ((INT64_OR_DOUBLE)repeat_max *
   5577                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
   5578                   > (INT64_OR_DOUBLE)INT_MAX ||
   5579               OFLOW_MAX - *lengthptr < delta)
   5580             {
   5581             *errorcodeptr = ERR20;
   5582             goto FAILED;
   5583             }
   5584           *lengthptr += delta;
   5585           }
   5586 
   5587         /* This is compiling for real */
   5588 
   5589         else for (i = repeat_max - 1; i >= 0; i--)
   5590           {
   5591           *code++ = OP_BRAZERO + repeat_type;
   5592 
   5593           /* All but the final copy start a new nesting, maintaining the
   5594           chain of brackets outstanding. */
   5595 
   5596           if (i != 0)
   5597             {
   5598             int offset;
   5599             *code++ = OP_BRA;
   5600             offset = (bralink == NULL)? 0 : (int)(code - bralink);
   5601             bralink = code;
   5602             PUTINC(code, 0, offset);
   5603             }
   5604 
   5605           memcpy(code, previous, CU2BYTES(len));
   5606           code += len;
   5607           }
   5608 
   5609         /* Now chain through the pending brackets, and fill in their length
   5610         fields (which are holding the chain links pro tem). */
   5611 
   5612         while (bralink != NULL)
   5613           {
   5614           int oldlinkoffset;
   5615           int offset = (int)(code - bralink + 1);
   5616           PCRE2_UCHAR *bra = code - offset;
   5617           oldlinkoffset = GET(bra, 1);
   5618           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
   5619           *code++ = OP_KET;
   5620           PUTINC(code, 0, offset);
   5621           PUT(bra, 1, offset);
   5622           }
   5623         }
   5624 
   5625       /* If the maximum is unlimited, set a repeater in the final copy. For
   5626       ONCE brackets, that's all we need to do. However, possessively repeated
   5627       ONCE brackets can be converted into non-capturing brackets, as the
   5628       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
   5629       deal with possessive ONCEs specially.
   5630 
   5631       Otherwise, when we are doing the actual compile phase, check to see
   5632       whether this group is one that could match an empty string. If so,
   5633       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
   5634       that runtime checking can be done. [This check is also applied to ONCE
   5635       groups at runtime, but in a different way.]
   5636 
   5637       Then, if the quantifier was possessive and the bracket is not a
   5638       conditional, we convert the BRA code to the POS form, and the KET code to
   5639       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
   5640       subpattern at both the start and at the end.) The use of special opcodes
   5641       makes it possible to reduce greatly the stack usage in pcre2_match(). If
   5642       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
   5643 
   5644       Then, if the minimum number of matches is 1 or 0, cancel the possessive
   5645       flag so that the default action below, of wrapping everything inside
   5646       atomic brackets, does not happen. When the minimum is greater than 1,
   5647       there will be earlier copies of the group, and so we still have to wrap
   5648       the whole thing. */
   5649 
   5650       else
   5651         {
   5652         PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
   5653         PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
   5654 
   5655         /* Convert possessive ONCE brackets to non-capturing */
   5656 
   5657         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
   5658             possessive_quantifier) *bracode = OP_BRA;
   5659 
   5660         /* For non-possessive ONCE brackets, all we need to do is to
   5661         set the KET. */
   5662 
   5663         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
   5664           *ketcode = OP_KETRMAX + repeat_type;
   5665 
   5666         /* Handle non-ONCE brackets and possessive ONCEs (which have been
   5667         converted to non-capturing above). */
   5668 
   5669         else
   5670           {
   5671           /* In the compile phase, check whether the group could match an empty
   5672           string. */
   5673 
   5674           if (lengthptr == NULL)
   5675             {
   5676             PCRE2_UCHAR *scode = bracode;
   5677             do
   5678               {
   5679               int count = 0;
   5680               int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE,
   5681                 NULL, &count);
   5682               if (rc < 0)
   5683                 {
   5684                 *errorcodeptr = ERR86;
   5685                 goto FAILED;
   5686                 }
   5687               if (rc > 0)
   5688                 {
   5689                 *bracode += OP_SBRA - OP_BRA;
   5690                 break;
   5691                 }
   5692               scode += GET(scode, 1);
   5693               }
   5694             while (*scode == OP_ALT);
   5695 
   5696             /* A conditional group with only one branch has an implicit empty
   5697             alternative branch. */
   5698 
   5699             if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
   5700               *bracode = OP_SCOND;
   5701             }
   5702 
   5703           /* Handle possessive quantifiers. */
   5704 
   5705           if (possessive_quantifier)
   5706             {
   5707             /* For COND brackets, we wrap the whole thing in a possessively
   5708             repeated non-capturing bracket, because we have not invented POS
   5709             versions of the COND opcodes. */
   5710 
   5711             if (*bracode == OP_COND || *bracode == OP_SCOND)
   5712               {
   5713               int nlen = (int)(code - bracode);
   5714               memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
   5715               code += 1 + LINK_SIZE;
   5716               nlen += 1 + LINK_SIZE;
   5717               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
   5718               *code++ = OP_KETRPOS;
   5719               PUTINC(code, 0, nlen);
   5720               PUT(bracode, 1, nlen);
   5721               }
   5722 
   5723             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
   5724 
   5725             else
   5726               {
   5727               *bracode += 1;              /* Switch to xxxPOS opcodes */
   5728               *ketcode = OP_KETRPOS;
   5729               }
   5730 
   5731             /* If the minimum is zero, mark it as possessive, then unset the
   5732             possessive flag when the minimum is 0 or 1. */
   5733 
   5734             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
   5735             if (repeat_min < 2) possessive_quantifier = FALSE;
   5736             }
   5737 
   5738           /* Non-possessive quantifier */
   5739 
   5740           else *ketcode = OP_KETRMAX + repeat_type;
   5741           }
   5742         }
   5743       }
   5744 
   5745     /* If previous is OP_FAIL, it was generated by an empty class []
   5746     (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
   5747     generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a
   5748     "nothing to repeat" error above. We can just ignore the repeat in empty
   5749     class case. */
   5750 
   5751     else if (*previous == OP_FAIL) goto END_REPEAT;
   5752 
   5753     /* Else there's some kind of shambles */
   5754 
   5755     else
   5756       {
   5757       *errorcodeptr = ERR10;
   5758       goto FAILED;
   5759       }
   5760 
   5761     /* If the character following a repeat is '+', possessive_quantifier is
   5762     TRUE. For some opcodes, there are special alternative opcodes for this
   5763     case. For anything else, we wrap the entire repeated item inside OP_ONCE
   5764     brackets. Logically, the '+' notation is just syntactic sugar, taken from
   5765     Sun's Java package, but the special opcodes can optimize it.
   5766 
   5767     Some (but not all) possessively repeated subpatterns have already been
   5768     completely handled in the code just above. For them, possessive_quantifier
   5769     is always FALSE at this stage. Note that the repeated item starts at
   5770     tempcode, not at previous, which might be the first part of a string whose
   5771     (former) last char we repeated. */
   5772 
   5773     if (possessive_quantifier)
   5774       {
   5775       int len;
   5776 
   5777       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
   5778       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
   5779       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
   5780       remains is greater than zero, there's a further opcode that can be
   5781       handled. If not, do nothing, leaving the EXACT alone. */
   5782 
   5783       switch(*tempcode)
   5784         {
   5785         case OP_TYPEEXACT:
   5786         tempcode += PRIV(OP_lengths)[*tempcode] +
   5787           ((tempcode[1 + IMM2_SIZE] == OP_PROP
   5788           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
   5789         break;
   5790 
   5791         /* CHAR opcodes are used for exacts whose count is 1. */
   5792 
   5793         case OP_CHAR:
   5794         case OP_CHARI:
   5795         case OP_NOT:
   5796         case OP_NOTI:
   5797         case OP_EXACT:
   5798         case OP_EXACTI:
   5799         case OP_NOTEXACT:
   5800         case OP_NOTEXACTI:
   5801         tempcode += PRIV(OP_lengths)[*tempcode];
   5802 #ifdef SUPPORT_UNICODE
   5803         if (utf && HAS_EXTRALEN(tempcode[-1]))
   5804           tempcode += GET_EXTRALEN(tempcode[-1]);
   5805 #endif
   5806         break;
   5807 
   5808         /* For the class opcodes, the repeat operator appears at the end;
   5809         adjust tempcode to point to it. */
   5810 
   5811         case OP_CLASS:
   5812         case OP_NCLASS:
   5813         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
   5814         break;
   5815 
   5816 #ifdef SUPPORT_WIDE_CHARS
   5817         case OP_XCLASS:
   5818         tempcode += GET(tempcode, 1);
   5819         break;
   5820 #endif
   5821         }
   5822 
   5823       /* If tempcode is equal to code (which points to the end of the repeated
   5824       item), it means we have skipped an EXACT item but there is no following
   5825       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
   5826       all other cases, tempcode will be pointing to the repeat opcode, and will
   5827       be less than code, so the value of len will be greater than 0. */
   5828 
   5829       len = (int)(code - tempcode);
   5830       if (len > 0)
   5831         {
   5832         unsigned int repcode = *tempcode;
   5833 
   5834         /* There is a table for possessifying opcodes, all of which are less
   5835         than OP_CALLOUT. A zero entry means there is no possessified version.
   5836         */
   5837 
   5838         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
   5839           *tempcode = opcode_possessify[repcode];
   5840 
   5841         /* For opcodes without a special possessified version, wrap the item in
   5842         ONCE brackets. */
   5843 
   5844         else
   5845           {
   5846           memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
   5847           code += 1 + LINK_SIZE;
   5848           len += 1 + LINK_SIZE;
   5849           tempcode[0] = OP_ONCE;
   5850           *code++ = OP_KET;
   5851           PUTINC(code, 0, len);
   5852           PUT(tempcode, 1, len);
   5853           }
   5854         }
   5855       }
   5856 
   5857     /* In all cases we no longer have a previous item. We also set the
   5858     "follows varying string" flag for subsequently encountered reqcus if
   5859     it isn't already set and we have just passed a varying length item. */
   5860 
   5861     END_REPEAT:
   5862     previous = NULL;
   5863     cb->req_varyopt |= reqvary;
   5864     break;
   5865 
   5866 
   5867     /* ===================================================================*/
   5868     /* Start of nested parenthesized sub-expression, or lookahead or lookbehind
   5869     or option setting or condition or all the other extended parenthesis forms.
   5870     We must save the current high-water-mark for the forward reference list so
   5871     that we know where they start for this group. However, because the list may
   5872     be extended when there are very many forward references (usually the result
   5873     of a replicated inner group), we must use an offset rather than an absolute
   5874     address. Note that (?# comments are dealt with at the top of the loop;
   5875     they do not get this far. */
   5876 
   5877     case CHAR_LEFT_PARENTHESIS:
   5878     ptr++;
   5879 
   5880     /* Deal with various "verbs" that can be introduced by '*'. */
   5881 
   5882     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
   5883          || (MAX_255(ptr[1]) && ((cb->ctypes[ptr[1]] & ctype_letter) != 0))))
   5884       {
   5885       int i, namelen;
   5886       int arglen = 0;
   5887       const char *vn = verbnames;
   5888       PCRE2_SPTR name = ptr + 1;
   5889       PCRE2_SPTR arg = NULL;
   5890       previous = NULL;
   5891       ptr++;
   5892 
   5893       /* Increment ptr, set namelen, check length */
   5894 
   5895       READ_NAME(ctype_letter, ERR60, *errorcodeptr);
   5896 
   5897       /* It appears that Perl allows any characters whatsoever, other than
   5898       a closing parenthesis, to appear in arguments, so we no longer insist on
   5899       letters, digits, and underscores. Perl does not, however, do any
   5900       interpretation within arguments, and has no means of including a closing
   5901       parenthesis. PCRE supports escape processing but only when it is
   5902       requested by an option. Note that check_escape() will not return values
   5903       greater than the code unit maximum when not in UTF mode. */
   5904 
   5905       if (*ptr == CHAR_COLON)
   5906         {
   5907         arg = ++ptr;
   5908 
   5909         if ((options & PCRE2_ALT_VERBNAMES) == 0)
   5910           {
   5911           arglen = 0;
   5912           while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
   5913             {
   5914             ptr++;                                /* Check length as we go */
   5915             arglen++;                             /* along, to avoid the   */
   5916             if ((unsigned int)arglen > MAX_MARK)  /* possibility of overflow. */
   5917               {
   5918               *errorcodeptr = ERR76;
   5919               goto FAILED;
   5920               }
   5921             }
   5922           }
   5923         else
   5924           {
   5925           /* The length check is in process_verb_name() */
   5926           arglen = process_verb_name(&ptr, NULL, errorcodeptr, options,
   5927             utf, cb);
   5928           if (arglen < 0) goto FAILED;
   5929           }
   5930         }
   5931 
   5932       if (*ptr != CHAR_RIGHT_PARENTHESIS)
   5933         {
   5934         *errorcodeptr = ERR60;
   5935         goto FAILED;
   5936         }
   5937 
   5938       /* Scan the table of verb names */
   5939 
   5940       for (i = 0; i < verbcount; i++)
   5941         {
   5942         if (namelen == verbs[i].len &&
   5943             PRIV(strncmp_c8)(name, vn, namelen) == 0)
   5944           {
   5945           int setverb;
   5946 
   5947           /* Check for open captures before ACCEPT and convert it to
   5948           ASSERT_ACCEPT if in an assertion. */
   5949 
   5950           if (verbs[i].op == OP_ACCEPT)
   5951             {
   5952             open_capitem *oc;
   5953             if (arglen != 0)
   5954               {
   5955               *errorcodeptr = ERR59;
   5956               goto FAILED;
   5957               }
   5958             cb->had_accept = TRUE;
   5959 
   5960             /* In the first pass, just accumulate the length required;
   5961             otherwise hitting (*ACCEPT) inside many nested parentheses can
   5962             cause workspace overflow. */
   5963 
   5964             for (oc = cb->open_caps; oc != NULL; oc = oc->next)
   5965               {
   5966               if (lengthptr != NULL)
   5967                 {
   5968                 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
   5969                 }
   5970               else
   5971                 {
   5972                 *code++ = OP_CLOSE;
   5973                 PUT2INC(code, 0, oc->number);
   5974                 }
   5975               }
   5976             setverb = *code++ =
   5977               (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
   5978 
   5979             /* Do not set firstcu after *ACCEPT */
   5980             if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   5981             }
   5982 
   5983           /* Handle other cases with/without an argument */
   5984 
   5985           else if (arglen == 0)    /* There is no argument */
   5986             {
   5987             if (verbs[i].op < 0)   /* Argument is mandatory */
   5988               {
   5989               *errorcodeptr = ERR66;
   5990               goto FAILED;
   5991               }
   5992             setverb = *code++ = verbs[i].op;
   5993             }
   5994 
   5995           else                        /* An argument is present */
   5996             {
   5997             if (verbs[i].op_arg < 0)  /* Argument is forbidden */
   5998               {
   5999               *errorcodeptr = ERR59;
   6000               goto FAILED;
   6001               }
   6002             setverb = *code++ = verbs[i].op_arg;
   6003 
   6004             /* Arguments can be very long, especially in 16- and 32-bit modes,
   6005             and can overflow the workspace in the first pass. Instead of
   6006             putting the argument into memory, we just update the length counter
   6007             and set up an empty argument. */
   6008 
   6009             if (lengthptr != NULL)
   6010               {
   6011               *lengthptr += arglen;
   6012               *code++ = 0;
   6013               }
   6014             else
   6015               {
   6016               *code++ = arglen;
   6017               if ((options & PCRE2_ALT_VERBNAMES) != 0)
   6018                 {
   6019                 PCRE2_UCHAR *memcode = code;  /* code is "register" */
   6020                 (void)process_verb_name(&arg, &memcode, errorcodeptr, options,
   6021                   utf, cb);
   6022                 code = memcode;
   6023                 }
   6024               else   /* No argument processing */
   6025                 {
   6026                 memcpy(code, arg, CU2BYTES(arglen));
   6027                 code += arglen;
   6028                 }
   6029               }
   6030 
   6031             *code++ = 0;
   6032             }
   6033 
   6034           switch (setverb)
   6035             {
   6036             case OP_THEN:
   6037             case OP_THEN_ARG:
   6038             cb->external_flags |= PCRE2_HASTHEN;
   6039             break;
   6040 
   6041             case OP_PRUNE:
   6042             case OP_PRUNE_ARG:
   6043             case OP_SKIP:
   6044             case OP_SKIP_ARG:
   6045             cb->had_pruneorskip = TRUE;
   6046             break;
   6047             }
   6048 
   6049           break;  /* Found verb, exit loop */
   6050           }
   6051 
   6052         vn += verbs[i].len + 1;
   6053         }
   6054 
   6055       if (i < verbcount) continue;    /* Successfully handled a verb */
   6056       *errorcodeptr = ERR60;          /* Verb not recognized */
   6057       goto FAILED;
   6058       }
   6059 
   6060     /* Initialization for "real" parentheses */
   6061 
   6062     newoptions = options;
   6063     skipunits = 0;
   6064     bravalue = OP_CBRA;
   6065     reset_bracount = FALSE;
   6066 
   6067     /* Deal with the extended parentheses; all are introduced by '?', and the
   6068     appearance of any of them means that this is not a capturing group. */
   6069 
   6070     if (*ptr == CHAR_QUESTION_MARK)
   6071       {
   6072       int i, count;
   6073       int namelen;                /* Must be signed */
   6074       uint32_t index;
   6075       uint32_t set, unset, *optset;
   6076       named_group *ng;
   6077       PCRE2_SPTR name;
   6078       PCRE2_UCHAR *slot;
   6079 
   6080       switch (*(++ptr))
   6081         {
   6082         /* ------------------------------------------------------------ */
   6083         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
   6084         reset_bracount = TRUE;
   6085         /* Fall through */
   6086 
   6087         /* ------------------------------------------------------------ */
   6088         case CHAR_COLON:          /* Non-capturing bracket */
   6089         bravalue = OP_BRA;
   6090         ptr++;
   6091         break;
   6092 
   6093         /* ------------------------------------------------------------ */
   6094         case CHAR_LEFT_PARENTHESIS:
   6095         bravalue = OP_COND;       /* Conditional group */
   6096         tempptr = ptr;
   6097 
   6098         /* A condition can be an assertion, a number (referring to a numbered
   6099         group's having been set), a name (referring to a named group), or 'R',
   6100         referring to recursion. R<digits> and R&name are also permitted for
   6101         recursion tests.
   6102 
   6103         There are ways of testing a named group: (?(name)) is used by Python;
   6104         Perl 5.10 onwards uses (?(<name>) or (?('name')).
   6105 
   6106         There is one unfortunate ambiguity, caused by history. 'R' can be the
   6107         recursive thing or the name 'R' (and similarly for 'R' followed by
   6108         digits). We look for a name first; if not found, we try the other case.
   6109 
   6110         For compatibility with auto-callouts, we allow a callout to be
   6111         specified before a condition that is an assertion. First, check for the
   6112         syntax of a callout; if found, adjust the temporary pointer that is
   6113         used to check for an assertion condition. That's all that is needed! */
   6114 
   6115         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
   6116           {
   6117           if (IS_DIGIT(ptr[3]) || ptr[3] == CHAR_RIGHT_PARENTHESIS)
   6118             {
   6119             for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
   6120             if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
   6121               tempptr += i + 1;
   6122             }
   6123           else
   6124             {
   6125             uint32_t delimiter = 0;
   6126             for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
   6127               {
   6128               if (ptr[3] == PRIV(callout_start_delims)[i])
   6129                 {
   6130                 delimiter = PRIV(callout_end_delims)[i];
   6131                 break;
   6132                 }
   6133               }
   6134             if (delimiter != 0)
   6135               {
   6136               for (i = 4; ptr + i < cb->end_pattern; i++)
   6137                 {
   6138                 if (ptr[i] == delimiter)
   6139                   {
   6140                   if (ptr[i+1] == delimiter) i++;
   6141                   else
   6142                     {
   6143                     if (ptr[i+1] == CHAR_RIGHT_PARENTHESIS) tempptr += i + 2;
   6144                     break;
   6145                     }
   6146                   }
   6147                 }
   6148               }
   6149             }
   6150 
   6151           /* tempptr should now be pointing to the opening parenthesis of the
   6152           assertion condition. */
   6153 
   6154           if (*tempptr != CHAR_LEFT_PARENTHESIS)
   6155             {
   6156             *errorcodeptr = ERR28;
   6157             goto FAILED;
   6158             }
   6159           }
   6160 
   6161         /* For conditions that are assertions, check the syntax, and then exit
   6162         the switch. This will take control down to where bracketed groups
   6163         are processed. The assertion will be handled as part of the group,
   6164         but we need to identify this case because the conditional assertion may
   6165         not be quantified. */
   6166 
   6167         if (tempptr[1] == CHAR_QUESTION_MARK &&
   6168               (tempptr[2] == CHAR_EQUALS_SIGN ||
   6169                tempptr[2] == CHAR_EXCLAMATION_MARK ||
   6170                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
   6171                    (tempptr[3] == CHAR_EQUALS_SIGN ||
   6172                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
   6173           {
   6174           cb->iscondassert = TRUE;
   6175           break;
   6176           }
   6177 
   6178         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
   6179         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
   6180 
   6181         code[1+LINK_SIZE] = OP_CREF;
   6182         skipunits = 1+IMM2_SIZE;
   6183         refsign = -1;     /* => not a number */
   6184         namelen = -1;     /* => not a name; must set to avoid warning */
   6185         name = NULL;      /* Always set to avoid warning */
   6186         recno = 0;        /* Always set to avoid warning */
   6187 
   6188         /* Point at character after (?( */
   6189 
   6190         ptr++;
   6191 
   6192         /* Check for (?(VERSION[>]=n.m), which is a facility whereby indirect
   6193         users of PCRE2 via an application can discover which release of PCRE2
   6194         is being used. */
   6195 
   6196         if (PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
   6197             ptr[7] != CHAR_RIGHT_PARENTHESIS)
   6198           {
   6199           BOOL ge = FALSE;
   6200           int major = 0;
   6201           int minor = 0;
   6202 
   6203           ptr += 7;
   6204           if (*ptr == CHAR_GREATER_THAN_SIGN)
   6205             {
   6206             ge = TRUE;
   6207             ptr++;
   6208             }
   6209 
   6210           /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
   6211           references its argument twice. */
   6212 
   6213           if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
   6214             {
   6215             *errorcodeptr = ERR79;
   6216             goto FAILED;
   6217             }
   6218 
   6219           while (IS_DIGIT(*ptr)) major = major * 10 + *ptr++ - '0';
   6220           if (*ptr == CHAR_DOT)
   6221             {
   6222             ptr++;
   6223             while (IS_DIGIT(*ptr)) minor = minor * 10 + *ptr++ - '0';
   6224             if (minor < 10) minor *= 10;
   6225             }
   6226 
   6227           if (*ptr != CHAR_RIGHT_PARENTHESIS || minor > 99)
   6228             {
   6229             *errorcodeptr = ERR79;
   6230             goto FAILED;
   6231             }
   6232 
   6233           if (ge)
   6234             code[1+LINK_SIZE] = ((PCRE2_MAJOR > major) ||
   6235               (PCRE2_MAJOR == major && PCRE2_MINOR >= minor))?
   6236                 OP_TRUE : OP_FALSE;
   6237           else
   6238             code[1+LINK_SIZE] = (PCRE2_MAJOR == major && PCRE2_MINOR == minor)?
   6239               OP_TRUE : OP_FALSE;
   6240 
   6241           ptr++;
   6242           skipunits = 1;
   6243           break;  /* End of condition processing */
   6244           }
   6245 
   6246         /* Check for a test for recursion in a named group. */
   6247 
   6248         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
   6249           {
   6250           terminator = -1;
   6251           ptr += 2;
   6252           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
   6253           }
   6254 
   6255         /* Check for a test for a named group's having been set, using the Perl
   6256         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
   6257         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
   6258 
   6259         else if (*ptr == CHAR_LESS_THAN_SIGN)
   6260           {
   6261           terminator = CHAR_GREATER_THAN_SIGN;
   6262           ptr++;
   6263           }
   6264         else if (*ptr == CHAR_APOSTROPHE)
   6265           {
   6266           terminator = CHAR_APOSTROPHE;
   6267           ptr++;
   6268           }
   6269         else
   6270           {
   6271           terminator = CHAR_NULL;
   6272           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
   6273             else if (IS_DIGIT(*ptr)) refsign = 0;
   6274           }
   6275 
   6276         /* Handle a number */
   6277 
   6278         if (refsign >= 0)
   6279           {
   6280           while (IS_DIGIT(*ptr))
   6281             {
   6282             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
   6283               {
   6284               while (IS_DIGIT(*ptr)) ptr++;
   6285               *errorcodeptr = ERR61;
   6286               goto FAILED;
   6287               }
   6288             recno = recno * 10 + (int)(*ptr - CHAR_0);
   6289             ptr++;
   6290             }
   6291           }
   6292 
   6293         /* Otherwise we expect to read a name; anything else is an error. When
   6294         the referenced name is one of a number of duplicates, a different
   6295         opcode is used and it needs more memory. Unfortunately we cannot tell
   6296         whether this is the case in the first pass, so we have to allow for
   6297         more memory always. In the second pass, the addition to skipunits
   6298         happens later. */
   6299 
   6300         else
   6301           {
   6302           if (IS_DIGIT(*ptr))
   6303             {
   6304             *errorcodeptr = ERR44;  /* Group name must start with non-digit */
   6305             goto FAILED;
   6306             }
   6307           if (!MAX_255(*ptr) || (cb->ctypes[*ptr] & ctype_word) == 0)
   6308             {
   6309             *errorcodeptr = ERR28;   /* Assertion expected */
   6310             goto FAILED;
   6311             }
   6312           name = ptr;
   6313           /* Increment ptr, set namelen, check length */
   6314           READ_NAME(ctype_word, ERR48, *errorcodeptr);
   6315           if (lengthptr != NULL) skipunits += IMM2_SIZE;
   6316           }
   6317 
   6318         /* Check the terminator */
   6319 
   6320         if ((terminator > 0 && *ptr++ != (PCRE2_UCHAR)terminator) ||
   6321             *ptr++ != CHAR_RIGHT_PARENTHESIS)
   6322           {
   6323           ptr--;                  /* Error offset */
   6324           *errorcodeptr = ERR26;  /* Malformed number or name */
   6325           goto FAILED;
   6326           }
   6327 
   6328         /* Do no further checking in the pre-compile phase. */
   6329 
   6330         if (lengthptr != NULL) break;
   6331 
   6332         /* In the real compile we do the work of looking for the actual
   6333         reference. If refsign is not negative, it means we have a number in
   6334         recno. */
   6335 
   6336         if (refsign >= 0)
   6337           {
   6338           if (recno <= 0)
   6339             {
   6340             *errorcodeptr = ERR35;
   6341             goto FAILED;
   6342             }
   6343           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
   6344             (cb->bracount + 1) - recno : recno + cb->bracount;
   6345           if (recno <= 0 || (uint32_t)recno > cb->final_bracount)
   6346             {
   6347             *errorcodeptr = ERR15;
   6348             goto FAILED;
   6349             }
   6350           PUT2(code, 2+LINK_SIZE, recno);
   6351           if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
   6352           break;
   6353           }
   6354 
   6355         /* Otherwise look for the name. */
   6356 
   6357         slot = cb->name_table;
   6358         for (i = 0; i < cb->names_found; i++)
   6359           {
   6360           if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0) break;
   6361           slot += cb->name_entry_size;
   6362           }
   6363 
   6364         /* Found the named subpattern. If the name is duplicated, add one to
   6365         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
   6366         appropriate data values. Otherwise, just insert the unique subpattern
   6367         number. */
   6368 
   6369         if (i < cb->names_found)
   6370           {
   6371           int offset = i;            /* Offset of first name found */
   6372 
   6373           count = 0;
   6374           for (;;)
   6375             {
   6376             recno = GET2(slot, 0);   /* Number for last found */
   6377             if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
   6378             count++;
   6379             if (++i >= cb->names_found) break;
   6380             slot += cb->name_entry_size;
   6381             if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) != 0 ||
   6382               (slot+IMM2_SIZE)[namelen] != 0) break;
   6383             }
   6384 
   6385           if (count > 1)
   6386             {
   6387             PUT2(code, 2+LINK_SIZE, offset);
   6388             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
   6389             skipunits += IMM2_SIZE;
   6390             code[1+LINK_SIZE]++;
   6391             }
   6392           else  /* Not a duplicated name */
   6393             {
   6394             PUT2(code, 2+LINK_SIZE, recno);
   6395             }
   6396           }
   6397 
   6398         /* If terminator == CHAR_NULL it means that the name followed directly
   6399         after the opening parenthesis [e.g. (?(abc)...] and in this case there
   6400         are some further alternatives to try. For the cases where terminator !=
   6401         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
   6402         we have now checked all the possibilities, so give an error. */
   6403 
   6404         else if (terminator != CHAR_NULL)
   6405           {
   6406           *errorcodeptr = ERR15;
   6407           goto FAILED;
   6408           }
   6409 
   6410         /* Check for (?(R) for recursion. Allow digits after R to specify a
   6411         specific group number. */
   6412 
   6413         else if (*name == CHAR_R)
   6414           {
   6415           recno = 0;
   6416           for (i = 1; i < namelen; i++)
   6417             {
   6418             if (!IS_DIGIT(name[i]))
   6419               {
   6420               *errorcodeptr = ERR15;        /* Non-existent subpattern */
   6421               goto FAILED;
   6422               }
   6423             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
   6424               {
   6425               *errorcodeptr = ERR61;
   6426               goto FAILED;
   6427               }
   6428             recno = recno * 10 + name[i] - CHAR_0;
   6429             }
   6430           if (recno == 0) recno = RREF_ANY;
   6431           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
   6432           PUT2(code, 2+LINK_SIZE, recno);
   6433           }
   6434 
   6435         /* Similarly, check for the (?(DEFINE) "condition", which is always
   6436         false. During compilation we set OP_DEFINE to distinguish this from
   6437         other OP_FALSE conditions so that it can be checked for having only one
   6438         branch, but after that the opcode is changed to OP_FALSE. */
   6439 
   6440         else if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
   6441           {
   6442           code[1+LINK_SIZE] = OP_DEFINE;
   6443           skipunits = 1;
   6444           }
   6445 
   6446         /* Reference to an unidentified subpattern. */
   6447 
   6448         else
   6449           {
   6450           *errorcodeptr = ERR15;
   6451           goto FAILED;
   6452           }
   6453         break;
   6454 
   6455 
   6456         /* ------------------------------------------------------------ */
   6457         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
   6458         bravalue = OP_ASSERT;
   6459         cb->assert_depth += 1;
   6460         ptr++;
   6461         break;
   6462 
   6463         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
   6464         thing to do, but Perl allows all assertions to be quantified, and when
   6465         they contain capturing parentheses there may be a potential use for
   6466         this feature. Not that that applies to a quantified (?!) but we allow
   6467         it for uniformity. */
   6468 
   6469         /* ------------------------------------------------------------ */
   6470         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
   6471         ptr++;
   6472         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
   6473              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
   6474             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
   6475           {
   6476           *code++ = OP_FAIL;
   6477           previous = NULL;
   6478           continue;
   6479           }
   6480         bravalue = OP_ASSERT_NOT;
   6481         cb->assert_depth += 1;
   6482         break;
   6483 
   6484 
   6485         /* ------------------------------------------------------------ */
   6486         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
   6487         switch (ptr[1])
   6488           {
   6489           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
   6490           bravalue = OP_ASSERTBACK;
   6491           cb->assert_depth += 1;
   6492           ptr += 2;
   6493           break;
   6494 
   6495           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
   6496           bravalue = OP_ASSERTBACK_NOT;
   6497           cb->assert_depth += 1;
   6498           ptr += 2;
   6499           break;
   6500 
   6501           /* Must be a name definition - as the syntax was checked in the
   6502           pre-pass, we can assume here that it is valid. Skip over the name
   6503           and go to handle the numbered group. */
   6504 
   6505           default:
   6506           while (*(++ptr) != CHAR_GREATER_THAN_SIGN);
   6507           ptr++;
   6508           goto NUMBERED_GROUP;
   6509           }
   6510         break;
   6511 
   6512 
   6513         /* ------------------------------------------------------------ */
   6514         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
   6515         bravalue = OP_ONCE;
   6516         ptr++;
   6517         break;
   6518 
   6519 
   6520         /* ------------------------------------------------------------ */
   6521         case CHAR_C:                 /* Callout */
   6522         previous_callout = code;     /* Save for later completion */
   6523         after_manual_callout = 1;    /* Skip one item before completing */
   6524         ptr++;                       /* Character after (?C */
   6525 
   6526         /* A callout may have a string argument, delimited by one of a fixed
   6527         number of characters, or an undelimited numerical argument, or no
   6528         argument, which is the same as (?C0). Different opcodes are used for
   6529         the two cases. */
   6530 
   6531         if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
   6532           {
   6533           uint32_t delimiter = 0;
   6534 
   6535           for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
   6536             {
   6537             if (*ptr == PRIV(callout_start_delims)[i])
   6538               {
   6539               delimiter = PRIV(callout_end_delims)[i];
   6540               break;
   6541               }
   6542             }
   6543 
   6544           if (delimiter == 0)
   6545             {
   6546             *errorcodeptr = ERR82;
   6547             goto FAILED;
   6548             }
   6549 
   6550           /* During the pre-compile phase, we parse the string and update the
   6551           length. There is no need to generate any code. (In fact, the string
   6552           has already been parsed in the pre-pass that looks for named
   6553           parentheses, but it does no harm to leave this code in.) */
   6554 
   6555           if (lengthptr != NULL)     /* Only check the string */
   6556             {
   6557             PCRE2_SPTR start = ptr;
   6558             do
   6559               {
   6560               if (++ptr >= cb->end_pattern)
   6561                 {
   6562                 *errorcodeptr = ERR81;
   6563                 ptr = start;   /* To give a more useful message */
   6564                 goto FAILED;
   6565                 }
   6566               if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
   6567               }
   6568             while (ptr[0] != delimiter);
   6569 
   6570             /* Start points to the opening delimiter, ptr points to the
   6571             closing delimiter. We must allow for including the delimiter and
   6572             for the terminating zero. Any doubled delimiters within the string
   6573             make this an overestimate, but it is not worth bothering about. */
   6574 
   6575             (*lengthptr) += (ptr - start) + 2 + (1 + 4*LINK_SIZE);
   6576             }
   6577 
   6578           /* In the real compile we can copy the string, knowing that it is
   6579           syntactically OK. The starting delimiter is included so that the
   6580           client can discover it if they want. We also pass the start offset to
   6581           help a script language give better error messages. */
   6582 
   6583           else
   6584             {
   6585             PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
   6586             *callout_string++ = *ptr++;
   6587             PUT(code, 1 + 3*LINK_SIZE, (int)(ptr - cb->start_pattern)); /* Start offset */
   6588             for(;;)
   6589               {
   6590               if (*ptr == delimiter)
   6591                 {
   6592                 if (ptr[1] == delimiter) ptr++; else break;
   6593                 }
   6594               *callout_string++ = *ptr++;
   6595               }
   6596             *callout_string++ = CHAR_NULL;
   6597             code[0] = OP_CALLOUT_STR;
   6598             PUT(code, 1, (int)(ptr + 2 - cb->start_pattern)); /* Next offset */
   6599             PUT(code, 1 + LINK_SIZE, 0);      /* Default length */
   6600             PUT(code, 1 + 2*LINK_SIZE,        /* Compute size */
   6601                 (int)(callout_string - code));
   6602             code = callout_string;
   6603             }
   6604 
   6605           /* Advance to what should be the closing parenthesis, which is
   6606           checked below. */
   6607 
   6608           ptr++;
   6609           }
   6610 
   6611         /* Handle a callout with an optional numerical argument, which must be
   6612         less than or equal to 255. A missing argument gives 0. */
   6613 
   6614         else
   6615           {
   6616           int n = 0;
   6617           code[0] = OP_CALLOUT;     /* Numerical callout */
   6618           while (IS_DIGIT(*ptr))
   6619             {
   6620             n = n * 10 + *ptr++ - CHAR_0;
   6621             if (n > 255)
   6622               {
   6623               *errorcodeptr = ERR38;
   6624               goto FAILED;
   6625               }
   6626             }
   6627           PUT(code, 1, (int)(ptr - cb->start_pattern + 1));  /* Next offset */
   6628           PUT(code, 1 + LINK_SIZE, 0);                    /* Default length */
   6629           code[1 + 2*LINK_SIZE] = n;                      /* Callout number */
   6630           code += PRIV(OP_lengths)[OP_CALLOUT];
   6631           }
   6632 
   6633         /* Both formats must have a closing parenthesis */
   6634 
   6635         if (*ptr != CHAR_RIGHT_PARENTHESIS)
   6636           {
   6637           *errorcodeptr = ERR39;
   6638           goto FAILED;
   6639           }
   6640 
   6641         /* Callouts cannot be quantified. */
   6642 
   6643         previous = NULL;
   6644         continue;
   6645 
   6646 
   6647         /* ------------------------------------------------------------ */
   6648         case CHAR_P:              /* Python-style named subpattern handling */
   6649         if (*(++ptr) == CHAR_EQUALS_SIGN ||
   6650             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
   6651           {
   6652           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
   6653           terminator = CHAR_RIGHT_PARENTHESIS;
   6654           goto NAMED_REF_OR_RECURSE;
   6655           }
   6656         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
   6657           {
   6658           *errorcodeptr = ERR41;
   6659           goto FAILED;
   6660           }
   6661         /* Fall through to handle (?P< as (?< is handled */
   6662 
   6663 
   6664         /* ------------------------------------------------------------ */
   6665         case CHAR_APOSTROPHE:   /* Define a name - note fall through above */
   6666 
   6667         /* The syntax was checked and the list of names was set up in the
   6668         pre-pass, so there is nothing to be done now except to skip over the
   6669         name. */
   6670 
   6671         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
   6672                   CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
   6673         while (*(++ptr) != (unsigned int)terminator);
   6674         ptr++;
   6675         goto NUMBERED_GROUP;      /* Set up numbered group */
   6676 
   6677 
   6678         /* ------------------------------------------------------------ */
   6679         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
   6680         terminator = CHAR_RIGHT_PARENTHESIS;
   6681         is_recurse = TRUE;
   6682         /* Fall through */
   6683 
   6684         /* We come here from the Python syntax above that handles both
   6685         references (?P=name) and recursion (?P>name), as well as falling
   6686         through from the Perl recursion syntax (?&name). We also come here from
   6687         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
   6688         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
   6689 
   6690         NAMED_REF_OR_RECURSE:
   6691         name = ++ptr;
   6692         if (IS_DIGIT(*ptr))
   6693           {
   6694           *errorcodeptr = ERR44;   /* Group name must start with non-digit */
   6695           goto FAILED;
   6696           }
   6697         /* Increment ptr, set namelen, check length */
   6698         READ_NAME(ctype_word, ERR48, *errorcodeptr);
   6699 
   6700         /* In the pre-compile phase, do a syntax check. */
   6701 
   6702         if (lengthptr != NULL)
   6703           {
   6704           if (namelen == 0)
   6705             {
   6706             *errorcodeptr = ERR62;
   6707             goto FAILED;
   6708             }
   6709           if (*ptr != (PCRE2_UCHAR)terminator)
   6710             {
   6711             *errorcodeptr = ERR42;
   6712             goto FAILED;
   6713             }
   6714           }
   6715 
   6716         /* Scan the list of names generated in the pre-pass in order to get
   6717         a number and whether or not this name is duplicated. */
   6718 
   6719         recno = 0;
   6720         is_dupname = FALSE;
   6721         ng = cb->named_groups;
   6722 
   6723         for (i = 0; i < cb->names_found; i++, ng++)
   6724           {
   6725           if (namelen == ng->length &&
   6726               PRIV(strncmp)(name, ng->name, namelen) == 0)
   6727             {
   6728             open_capitem *oc;
   6729             is_dupname = ng->isdup;
   6730             recno = ng->number;
   6731 
   6732             /* For a recursion, that's all that is needed. We can now go to the
   6733             code that handles numerical recursion. */
   6734 
   6735             if (is_recurse) goto HANDLE_RECURSION;
   6736 
   6737             /* For a back reference, update the back reference map and the
   6738             maximum back reference. Then for each group we must check to see if
   6739             it is recursive, that is, it is inside the group that it
   6740             references. A flag is set so that the group can be made atomic. */
   6741 
   6742             cb->backref_map |= (recno < 32)? (1u << recno) : 1;
   6743             if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
   6744 
   6745             for (oc = cb->open_caps; oc != NULL; oc = oc->next)
   6746               {
   6747               if (oc->number == recno)
   6748                 {
   6749                 oc->flag = TRUE;
   6750                 break;
   6751                 }
   6752               }
   6753             }
   6754           }
   6755 
   6756         /* If the name was not found we have a bad reference. */
   6757 
   6758         if (recno == 0)
   6759           {
   6760           *errorcodeptr = ERR15;
   6761           goto FAILED;
   6762           }
   6763 
   6764         /* If a back reference name is not duplicated, we can handle it as a
   6765         numerical reference. */
   6766 
   6767         if (!is_dupname) goto HANDLE_REFERENCE;
   6768 
   6769         /* If a back reference name is duplicated, we generate a different
   6770         opcode to a numerical back reference. In the second pass we must search
   6771         for the index and count in the final name table. */
   6772 
   6773         count = 0;
   6774         index = 0;
   6775 
   6776         if (lengthptr == NULL)
   6777           {
   6778           slot = cb->name_table;
   6779           for (i = 0; i < cb->names_found; i++)
   6780             {
   6781             if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0 &&
   6782                 slot[IMM2_SIZE+namelen] == 0)
   6783               {
   6784               if (count == 0) index = i;
   6785               count++;
   6786               }
   6787             slot += cb->name_entry_size;
   6788             }
   6789 
   6790           if (count == 0)
   6791             {
   6792             *errorcodeptr = ERR15;
   6793             goto FAILED;
   6794             }
   6795           }
   6796 
   6797         if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   6798         previous = code;
   6799         *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
   6800         PUT2INC(code, 0, index);
   6801         PUT2INC(code, 0, count);
   6802         continue;  /* End of back ref handling */
   6803 
   6804 
   6805         /* ------------------------------------------------------------ */
   6806         case CHAR_R:              /* Recursion, same as (?0) */
   6807         recno = 0;
   6808         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
   6809           {
   6810           *errorcodeptr = ERR29;
   6811           goto FAILED;
   6812           }
   6813         goto HANDLE_RECURSION;
   6814 
   6815 
   6816         /* ------------------------------------------------------------ */
   6817         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
   6818         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
   6819         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
   6820           {
   6821           terminator = CHAR_RIGHT_PARENTHESIS;
   6822 
   6823           /* Come here from the \g<...> and \g'...' code (Oniguruma
   6824           compatibility). However, the syntax has been checked to ensure that
   6825           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
   6826           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
   6827           ever be taken. */
   6828 
   6829           HANDLE_NUMERICAL_RECURSION:
   6830 
   6831           if ((refsign = *ptr) == CHAR_PLUS)
   6832             {
   6833             ptr++;
   6834             if (!IS_DIGIT(*ptr))
   6835               {
   6836               *errorcodeptr = ERR63;
   6837               goto FAILED;
   6838               }
   6839             }
   6840           else if (refsign == CHAR_MINUS)
   6841             {
   6842             if (!IS_DIGIT(ptr[1]))
   6843               goto OTHER_CHAR_AFTER_QUERY;
   6844             ptr++;
   6845             }
   6846 
   6847           recno = 0;
   6848           while (IS_DIGIT(*ptr))
   6849             {
   6850             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
   6851               {
   6852               while (IS_DIGIT(*ptr)) ptr++;
   6853               *errorcodeptr = ERR61;
   6854               goto FAILED;
   6855               }
   6856             recno = recno * 10 + *ptr++ - CHAR_0;
   6857             }
   6858 
   6859           if (*ptr != (PCRE2_UCHAR)terminator)
   6860             {
   6861             *errorcodeptr = ERR29;
   6862             goto FAILED;
   6863             }
   6864 
   6865           if (refsign == CHAR_MINUS)
   6866             {
   6867             if (recno == 0)
   6868               {
   6869               *errorcodeptr = ERR58;
   6870               goto FAILED;
   6871               }
   6872             recno = (int)(cb->bracount + 1) - recno;
   6873             if (recno <= 0)
   6874               {
   6875               *errorcodeptr = ERR15;
   6876               goto FAILED;
   6877               }
   6878             }
   6879           else if (refsign == CHAR_PLUS)
   6880             {
   6881             if (recno == 0)
   6882               {
   6883               *errorcodeptr = ERR58;
   6884               goto FAILED;
   6885               }
   6886             recno += cb->bracount;
   6887             }
   6888 
   6889           if ((uint32_t)recno > cb->final_bracount)
   6890             {
   6891             *errorcodeptr = ERR15;
   6892             goto FAILED;
   6893             }
   6894 
   6895           /* Come here from code above that handles a named recursion.
   6896           We insert the number of the called group after OP_RECURSE. At the
   6897           end of compiling the pattern is scanned and these numbers are
   6898           replaced by offsets within the pattern. It is done like this to avoid
   6899           problems with forward references and adjusting offsets when groups
   6900           are duplicated and moved (as discovered in previous implementations).
   6901           Note that a recursion does not have a set first character (relevant
   6902           if it is repeated, because it will then be wrapped with ONCE
   6903           brackets). */
   6904 
   6905           HANDLE_RECURSION:
   6906           previous = code;
   6907           *code = OP_RECURSE;
   6908           PUT(code, 1, recno);
   6909           code += 1 + LINK_SIZE;
   6910           groupsetfirstcu = FALSE;
   6911           cb->had_recurse = TRUE;
   6912           }
   6913 
   6914         /* Can't determine a first byte now */
   6915 
   6916         if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   6917         continue;
   6918 
   6919 
   6920         /* ------------------------------------------------------------ */
   6921         default:              /* Other characters: check option setting */
   6922         OTHER_CHAR_AFTER_QUERY:
   6923         set = unset = 0;
   6924         optset = &set;
   6925 
   6926         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
   6927           {
   6928           switch (*ptr++)
   6929             {
   6930             case CHAR_MINUS: optset = &unset; break;
   6931 
   6932             case CHAR_J:    /* Record that it changed in the external options */
   6933             *optset |= PCRE2_DUPNAMES;
   6934             cb->external_flags |= PCRE2_JCHANGED;
   6935             break;
   6936 
   6937             case CHAR_i: *optset |= PCRE2_CASELESS; break;
   6938             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
   6939             case CHAR_s: *optset |= PCRE2_DOTALL; break;
   6940             case CHAR_x: *optset |= PCRE2_EXTENDED; break;
   6941             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
   6942 
   6943             default:  *errorcodeptr = ERR11;
   6944                       ptr--;    /* Correct the offset */
   6945                       goto FAILED;
   6946             }
   6947           }
   6948 
   6949         /* Set up the changed option bits, but don't change anything yet. */
   6950 
   6951         newoptions = (options | set) & (~unset);
   6952 
   6953         /* If the options ended with ')' this is not the start of a nested
   6954         group with option changes, so the options change at this level. They
   6955         must also be passed back for use in subsequent branches. Reset the
   6956         greedy defaults and the case value for firstcu and reqcu. */
   6957 
   6958         if (*ptr == CHAR_RIGHT_PARENTHESIS)
   6959           {
   6960           *optionsptr = options = newoptions;
   6961           greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0);
   6962           greedy_non_default = greedy_default ^ 1;
   6963           req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
   6964           previous = NULL;       /* This item can't be repeated */
   6965           continue;              /* It is complete */
   6966           }
   6967 
   6968         /* If the options ended with ':' we are heading into a nested group
   6969         with possible change of options. Such groups are non-capturing and are
   6970         not assertions of any kind. All we need to do is skip over the ':';
   6971         the newoptions value is handled below. */
   6972 
   6973         bravalue = OP_BRA;
   6974         ptr++;
   6975         }     /* End of switch for character following (? */
   6976       }       /* End of (? handling */
   6977 
   6978     /* Opening parenthesis not followed by '*' or '?'. If PCRE2_NO_AUTO_CAPTURE
   6979     is set, all unadorned brackets become non-capturing and behave like (?:...)
   6980     brackets. */
   6981 
   6982     else if ((options & PCRE2_NO_AUTO_CAPTURE) != 0)
   6983       {
   6984       bravalue = OP_BRA;
   6985       }
   6986 
   6987     /* Else we have a capturing group. */
   6988 
   6989     else
   6990       {
   6991       NUMBERED_GROUP:
   6992       cb->bracount += 1;
   6993       PUT2(code, 1+LINK_SIZE, cb->bracount);
   6994       skipunits = IMM2_SIZE;
   6995       }
   6996 
   6997     /* Process nested bracketed regex. First check for parentheses nested too
   6998     deeply. */
   6999 
   7000     if ((cb->parens_depth += 1) > (int)(cb->cx->parens_nest_limit))
   7001       {
   7002       *errorcodeptr = ERR19;
   7003       goto FAILED;
   7004       }
   7005 
   7006     /* All assertions used not to be repeatable, but this was changed for Perl
   7007     compatibility. All kinds can now be repeated except for assertions that are
   7008     conditions (Perl also forbids these to be repeated). We copy code into a
   7009     non-register variable (tempcode) in order to be able to pass its address
   7010     because some compilers complain otherwise. At the start of a conditional
   7011     group whose condition is an assertion, cb->iscondassert is set. We unset it
   7012     here so as to allow assertions later in the group to be quantified. */
   7013 
   7014     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
   7015         cb->iscondassert)
   7016       {
   7017       previous = NULL;
   7018       cb->iscondassert = FALSE;
   7019       }
   7020     else
   7021       {
   7022       previous = code;
   7023       }
   7024 
   7025     *code = bravalue;
   7026     tempcode = code;
   7027     tempreqvary = cb->req_varyopt;        /* Save value before bracket */
   7028     tempbracount = cb->bracount;          /* Save value before bracket */
   7029     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
   7030 
   7031     if (!compile_regex(
   7032          newoptions,                      /* The complete new option state */
   7033          &tempcode,                       /* Where to put code (updated) */
   7034          &ptr,                            /* Input pointer (updated) */
   7035          errorcodeptr,                    /* Where to put an error message */
   7036          (bravalue == OP_ASSERTBACK ||
   7037           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
   7038          reset_bracount,                  /* True if (?| group */
   7039          skipunits,                       /* Skip over bracket number */
   7040          cond_depth +
   7041            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
   7042          &subfirstcu,                     /* For possible first char */
   7043          &subfirstcuflags,
   7044          &subreqcu,                       /* For possible last char */
   7045          &subreqcuflags,
   7046          bcptr,                           /* Current branch chain */
   7047          cb,                              /* Compile data block */
   7048          (lengthptr == NULL)? NULL :      /* Actual compile phase */
   7049            &length_prevgroup              /* Pre-compile phase */
   7050          ))
   7051       goto FAILED;
   7052 
   7053     cb->parens_depth -= 1;
   7054 
   7055     /* If this was an atomic group and there are no capturing groups within it,
   7056     generate OP_ONCE_NC instead of OP_ONCE. */
   7057 
   7058     if (bravalue == OP_ONCE && cb->bracount <= tempbracount)
   7059       *code = OP_ONCE_NC;
   7060 
   7061     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
   7062       cb->assert_depth -= 1;
   7063 
   7064     /* At the end of compiling, code is still pointing to the start of the
   7065     group, while tempcode has been updated to point past the end of the group.
   7066     The pattern pointer (ptr) is on the bracket.
   7067 
   7068     If this is a conditional bracket, check that there are no more than
   7069     two branches in the group, or just one if it's a DEFINE group. We do this
   7070     in the real compile phase, not in the pre-pass, where the whole group may
   7071     not be available. */
   7072 
   7073     if (bravalue == OP_COND && lengthptr == NULL)
   7074       {
   7075       PCRE2_UCHAR *tc = code;
   7076       int condcount = 0;
   7077 
   7078       do {
   7079          condcount++;
   7080          tc += GET(tc,1);
   7081          }
   7082       while (*tc != OP_KET);
   7083 
   7084       /* A DEFINE group is never obeyed inline (the "condition" is always
   7085       false). It must have only one branch. Having checked this, change the
   7086       opcode to OP_FALSE. */
   7087 
   7088       if (code[LINK_SIZE+1] == OP_DEFINE)
   7089         {
   7090         if (condcount > 1)
   7091           {
   7092           *errorcodeptr = ERR54;
   7093           goto FAILED;
   7094           }
   7095         code[LINK_SIZE+1] = OP_FALSE;
   7096         bravalue = OP_DEFINE;   /* Just a flag to suppress char handling below */
   7097         }
   7098 
   7099       /* A "normal" conditional group. If there is just one branch, we must not
   7100       make use of its firstcu or reqcu, because this is equivalent to an
   7101       empty second branch. */
   7102 
   7103       else
   7104         {
   7105         if (condcount > 2)
   7106           {
   7107           *errorcodeptr = ERR27;
   7108           goto FAILED;
   7109           }
   7110         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
   7111         }
   7112       }
   7113 
   7114     /* At the end of a group, it's an error if we hit end of pattern or
   7115     any non-closing parenthesis. This check also happens in the pre-scan,
   7116     so should not trigger here, but leave this code as an insurance. */
   7117 
   7118     if (*ptr != CHAR_RIGHT_PARENTHESIS)
   7119       {
   7120       *errorcodeptr = ERR14;
   7121       goto FAILED;
   7122       }
   7123 
   7124     /* In the pre-compile phase, update the length by the length of the group,
   7125     less the brackets at either end. Then reduce the compiled code to just a
   7126     set of non-capturing brackets so that it doesn't use much memory if it is
   7127     duplicated by a quantifier. */
   7128 
   7129     if (lengthptr != NULL)
   7130       {
   7131       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
   7132         {
   7133         *errorcodeptr = ERR20;
   7134         goto FAILED;
   7135         }
   7136       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
   7137       code++;   /* This already contains bravalue */
   7138       PUTINC(code, 0, 1 + LINK_SIZE);
   7139       *code++ = OP_KET;
   7140       PUTINC(code, 0, 1 + LINK_SIZE);
   7141       break;    /* No need to waste time with special character handling */
   7142       }
   7143 
   7144     /* Otherwise update the main code pointer to the end of the group. */
   7145 
   7146     code = tempcode;
   7147 
   7148     /* For a DEFINE group, required and first character settings are not
   7149     relevant. */
   7150 
   7151     if (bravalue == OP_DEFINE) break;
   7152 
   7153     /* Handle updating of the required and first characters for other types of
   7154     group. Update for normal brackets of all kinds, and conditions with two
   7155     branches (see code above). If the bracket is followed by a quantifier with
   7156     zero repeat, we have to back off. Hence the definition of zeroreqcu and
   7157     zerofirstcu outside the main loop so that they can be accessed for the
   7158     back off. */
   7159 
   7160     zeroreqcu = reqcu;
   7161     zeroreqcuflags = reqcuflags;
   7162     zerofirstcu = firstcu;
   7163     zerofirstcuflags = firstcuflags;
   7164     groupsetfirstcu = FALSE;
   7165 
   7166     if (bravalue >= OP_ONCE)
   7167       {
   7168       /* If we have not yet set a firstcu in this branch, take it from the
   7169       subpattern, remembering that it was set here so that a repeat of more
   7170       than one can replicate it as reqcu if necessary. If the subpattern has
   7171       no firstcu, set "none" for the whole branch. In both cases, a zero
   7172       repeat forces firstcu to "none". */
   7173 
   7174       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
   7175         {
   7176         if (subfirstcuflags >= 0)
   7177           {
   7178           firstcu = subfirstcu;
   7179           firstcuflags = subfirstcuflags;
   7180           groupsetfirstcu = TRUE;
   7181           }
   7182         else firstcuflags = REQ_NONE;
   7183         zerofirstcuflags = REQ_NONE;
   7184         }
   7185 
   7186       /* If firstcu was previously set, convert the subpattern's firstcu
   7187       into reqcu if there wasn't one, using the vary flag that was in
   7188       existence beforehand. */
   7189 
   7190       else if (subfirstcuflags >= 0 && subreqcuflags < 0)
   7191         {
   7192         subreqcu = subfirstcu;
   7193         subreqcuflags = subfirstcuflags | tempreqvary;
   7194         }
   7195 
   7196       /* If the subpattern set a required byte (or set a first byte that isn't
   7197       really the first byte - see above), set it. */
   7198 
   7199       if (subreqcuflags >= 0)
   7200         {
   7201         reqcu = subreqcu;
   7202         reqcuflags = subreqcuflags;
   7203         }
   7204       }
   7205 
   7206     /* For a forward assertion, we take the reqcu, if set. This can be
   7207     helpful if the pattern that follows the assertion doesn't set a different
   7208     char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
   7209     for an assertion, however because it leads to incorrect effect for patterns
   7210     such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
   7211     of a firstcu. This is overcome by a scan at the end if there's no
   7212     firstcu, looking for an asserted first char. */
   7213 
   7214     else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
   7215       {
   7216       reqcu = subreqcu;
   7217       reqcuflags = subreqcuflags;
   7218       }
   7219     break;     /* End of processing '(' */
   7220 
   7221 
   7222     /* ===================================================================*/
   7223     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
   7224     are arranged to be the negation of the corresponding OP_values in the
   7225     default case when PCRE2_UCP is not set. For the back references, the values
   7226     are the negated reference number. Only back references and those types
   7227     that consume a character may be repeated. We can test for values between
   7228     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
   7229     ever created.
   7230 
   7231     Note: \Q and \E are handled at the start of the character-processing loop,
   7232     not here. */
   7233 
   7234     case CHAR_BACKSLASH:
   7235     tempptr = ptr;
   7236     escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
   7237       options, FALSE, cb);
   7238     if (*errorcodeptr != 0) goto FAILED;
   7239 
   7240     if (escape == 0)                  /* The escape coded a single character */
   7241       c = ec;
   7242     else
   7243       {
   7244       /* For metasequences that actually match a character, we disable the
   7245       setting of a first character if it hasn't already been set. */
   7246 
   7247       if (firstcuflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
   7248         firstcuflags = REQ_NONE;
   7249 
   7250       /* Set values to reset to if this is followed by a zero repeat. */
   7251 
   7252       zerofirstcu = firstcu;
   7253       zerofirstcuflags = firstcuflags;
   7254       zeroreqcu = reqcu;
   7255       zeroreqcuflags = reqcuflags;
   7256 
   7257       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
   7258       is a subroutine call by number (Oniguruma syntax). In fact, the value
   7259       ESC_g is returned only for these cases. So we don't need to check for <
   7260       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
   7261       -n, and for the Perl syntax \g{name} the result is ESC_k (as
   7262       that is a synonym for a named back reference). */
   7263 
   7264       if (escape == ESC_g)
   7265         {
   7266         PCRE2_SPTR p;
   7267         uint32_t cf;
   7268 
   7269         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
   7270           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
   7271 
   7272         /* These two statements stop the compiler from warning about possibly
   7273         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
   7274         fact, because we do the check for a number below, the paths that
   7275         would actually be in error are never taken. */
   7276 
   7277         skipunits = 0;
   7278         reset_bracount = FALSE;
   7279 
   7280         /* If it's not a signed or unsigned number, treat it as a name. */
   7281 
   7282         cf = ptr[1];
   7283         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
   7284           {
   7285           is_recurse = TRUE;
   7286           goto NAMED_REF_OR_RECURSE;
   7287           }
   7288 
   7289         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
   7290         or a digit. */
   7291 
   7292         p = ptr + 2;
   7293         while (IS_DIGIT(*p)) p++;
   7294         if (*p != (PCRE2_UCHAR)terminator)
   7295           {
   7296           *errorcodeptr = ERR57;
   7297           goto FAILED;
   7298           }
   7299         ptr++;
   7300         goto HANDLE_NUMERICAL_RECURSION;
   7301         }
   7302 
   7303       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
   7304       We also support \k{name} (.NET syntax).  */
   7305 
   7306       if (escape == ESC_k)
   7307         {
   7308         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
   7309           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
   7310           {
   7311           *errorcodeptr = ERR69;
   7312           goto FAILED;
   7313           }
   7314         is_recurse = FALSE;
   7315         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
   7316           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
   7317           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
   7318         goto NAMED_REF_OR_RECURSE;
   7319         }
   7320 
   7321       /* Back references are handled specially; must disable firstcu if
   7322       not set to cope with cases like (?=(\w+))\1: which would otherwise set
   7323       ':' later. */
   7324 
   7325       if (escape < 0)
   7326         {
   7327         open_capitem *oc;
   7328         recno = -escape;
   7329 
   7330         /* Come here from named backref handling when the reference is to a
   7331         single group (i.e. not to a duplicated name). */
   7332 
   7333         HANDLE_REFERENCE:
   7334         if (recno > (int)cb->final_bracount)
   7335           {
   7336           *errorcodeptr = ERR15;
   7337           goto FAILED;
   7338           }
   7339         if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   7340         previous = code;
   7341         *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
   7342         PUT2INC(code, 0, recno);
   7343         cb->backref_map |= (recno < 32)? (1u << recno) : 1;
   7344         if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
   7345 
   7346         /* Check to see if this back reference is recursive, that is, it
   7347         is inside the group that it references. A flag is set so that the
   7348         group can be made atomic. */
   7349 
   7350         for (oc = cb->open_caps; oc != NULL; oc = oc->next)
   7351           {
   7352           if (oc->number == recno)
   7353             {
   7354             oc->flag = TRUE;
   7355             break;
   7356             }
   7357           }
   7358         }
   7359 
   7360       /* So are Unicode property matches, if supported. */
   7361 
   7362 #ifdef SUPPORT_UNICODE
   7363       else if (escape == ESC_P || escape == ESC_p)
   7364         {
   7365         BOOL negated;
   7366         unsigned int ptype = 0, pdata = 0;
   7367         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
   7368           goto FAILED;
   7369         previous = code;
   7370         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
   7371         *code++ = ptype;
   7372         *code++ = pdata;
   7373         }
   7374 #else
   7375 
   7376       /* If Unicode properties are not supported, \X, \P, and \p are not
   7377       allowed. */
   7378 
   7379       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
   7380         {
   7381         *errorcodeptr = ERR45;
   7382         goto FAILED;
   7383         }
   7384 #endif
   7385 
   7386       /* The use of \C can be locked out. */
   7387 
   7388 #ifdef NEVER_BACKSLASH_C
   7389       else if (escape == ESC_C)
   7390         {
   7391         *errorcodeptr = ERR85;
   7392         goto FAILED;
   7393         }
   7394 #else
   7395       else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
   7396         {
   7397         *errorcodeptr = ERR83;
   7398         goto FAILED;
   7399         }
   7400 #endif
   7401 
   7402       /* For the rest (including \X when Unicode properties are supported), we
   7403       can obtain the OP value by negating the escape value in the default
   7404       situation when PCRE2_UCP is not set. When it *is* set, we substitute
   7405       Unicode property tests. Note that \b and \B do a one-character
   7406       lookbehind, and \A also behaves as if it does. */
   7407 
   7408       else
   7409         {
   7410         if (escape == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
   7411         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
   7412              cb->max_lookbehind == 0)
   7413           cb->max_lookbehind = 1;
   7414 #ifdef SUPPORT_UNICODE
   7415         if (escape >= ESC_DU && escape <= ESC_wu)
   7416           {
   7417           cb->nestptr[1] = cb->nestptr[0];         /* Back up if at 2nd level */
   7418           cb->nestptr[0] = ptr + 1;                /* Where to resume */
   7419           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
   7420           }
   7421         else
   7422 #endif
   7423         /* In non-UTF mode, and for both 32-bit modes, we turn \C into
   7424         OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in
   7425         lookbehinds. */
   7426 
   7427           {
   7428           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
   7429 #if PCRE2_CODE_UNIT_WIDTH == 32
   7430           *code++ = (escape == ESC_C)? OP_ALLANY : escape;
   7431 #else
   7432           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
   7433 #endif
   7434           }
   7435         }
   7436       continue;
   7437       }
   7438 
   7439     /* We have a data character whose value is in c. In UTF-8 mode it may have
   7440     a value > 127. We set its representation in the length/buffer, and then
   7441     handle it as a data character. */
   7442 
   7443     mclength = PUTCHAR(c, mcbuffer);
   7444     goto ONE_CHAR;
   7445 
   7446 
   7447     /* ===================================================================*/
   7448     /* Handle a literal character. It is guaranteed not to be whitespace or #
   7449     when the extended flag is set. If we are in a UTF mode, it may be a
   7450     multi-unit literal character. */
   7451 
   7452     default:
   7453     NORMAL_CHAR:
   7454     mclength = 1;
   7455     mcbuffer[0] = c;
   7456 
   7457 #ifdef SUPPORT_UNICODE
   7458     if (utf && HAS_EXTRALEN(c))
   7459       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
   7460 #endif
   7461 
   7462     /* At this point we have the character's bytes in mcbuffer, and the length
   7463     in mclength. When not in UTF mode, the length is always 1. */
   7464 
   7465     ONE_CHAR:
   7466     previous = code;
   7467 
   7468     /* For caseless UTF mode, check whether this character has more than one
   7469     other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
   7470 
   7471 #ifdef SUPPORT_UNICODE
   7472     if (utf && (options & PCRE2_CASELESS) != 0)
   7473       {
   7474       GETCHAR(c, mcbuffer);
   7475       if ((c = UCD_CASESET(c)) != 0)
   7476         {
   7477         *code++ = OP_PROP;
   7478         *code++ = PT_CLIST;
   7479         *code++ = c;
   7480         if (firstcuflags == REQ_UNSET)
   7481           firstcuflags = zerofirstcuflags = REQ_NONE;
   7482         break;
   7483         }
   7484       }
   7485 #endif
   7486 
   7487     /* Caseful matches, or not one of the multicase characters. */
   7488 
   7489     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
   7490     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
   7491 
   7492     /* Remember if \r or \n were seen */
   7493 
   7494     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
   7495       cb->external_flags |= PCRE2_HASCRORLF;
   7496 
   7497     /* Set the first and required bytes appropriately. If no previous first
   7498     byte, set it from this character, but revert to none on a zero repeat.
   7499     Otherwise, leave the firstcu value alone, and don't change it on a zero
   7500     repeat. */
   7501 
   7502     if (firstcuflags == REQ_UNSET)
   7503       {
   7504       zerofirstcuflags = REQ_NONE;
   7505       zeroreqcu = reqcu;
   7506       zeroreqcuflags = reqcuflags;
   7507 
   7508       /* If the character is more than one byte long, we can set firstcu
   7509       only if it is not to be matched caselessly. */
   7510 
   7511       if (mclength == 1 || req_caseopt == 0)
   7512         {
   7513         firstcu = mcbuffer[0] | req_caseopt;
   7514         firstcu = mcbuffer[0];
   7515         firstcuflags = req_caseopt;
   7516 
   7517         if (mclength != 1)
   7518           {
   7519           reqcu = code[-1];
   7520           reqcuflags = cb->req_varyopt;
   7521           }
   7522         }
   7523       else firstcuflags = reqcuflags = REQ_NONE;
   7524       }
   7525 
   7526     /* firstcu was previously set; we can set reqcu only if the length is
   7527     1 or the matching is caseful. */
   7528 
   7529     else
   7530       {
   7531       zerofirstcu = firstcu;
   7532       zerofirstcuflags = firstcuflags;
   7533       zeroreqcu = reqcu;
   7534       zeroreqcuflags = reqcuflags;
   7535       if (mclength == 1 || req_caseopt == 0)
   7536         {
   7537         reqcu = code[-1];
   7538         reqcuflags = req_caseopt | cb->req_varyopt;
   7539         }
   7540       }
   7541 
   7542     break;            /* End of literal character handling */
   7543     }
   7544   }                   /* end of big loop */
   7545 
   7546 /* Control never reaches here by falling through, only by a goto for all the
   7547 error states. Pass back the position in the pattern so that it can be displayed
   7548 to the user for diagnosing the error. */
   7549 
   7550 FAILED:
   7551 *ptrptr = ptr;
   7552 return FALSE;
   7553 }
   7554 
   7555 
   7556 
   7557 /*************************************************
   7558 *   Compile regex: a sequence of alternatives    *
   7559 *************************************************/
   7560 
   7561 /* On entry, ptr is pointing past the bracket character, but on return it
   7562 points to the closing bracket, or vertical bar, or end of string. The code
   7563 variable is pointing at the byte into which the BRA operator has been stored.
   7564 This function is used during the pre-compile phase when we are trying to find
   7565 out the amount of memory needed, as well as during the real compile phase. The
   7566 value of lengthptr distinguishes the two phases.
   7567 
   7568 Arguments:
   7569   options           option bits, including any changes for this subpattern
   7570   codeptr           -> the address of the current code pointer
   7571   ptrptr            -> the address of the current pattern pointer
   7572   errorcodeptr      -> pointer to error code variable
   7573   lookbehind        TRUE if this is a lookbehind assertion
   7574   reset_bracount    TRUE to reset the count for each branch
   7575   skipunits         skip this many code units at start (for brackets and OP_COND)
   7576   cond_depth        depth of nesting for conditional subpatterns
   7577   firstcuptr        place to put the first required code unit
   7578   firstcuflagsptr   place to put the first code unit flags, or a negative number
   7579   reqcuptr          place to put the last required code unit
   7580   reqcuflagsptr     place to put the last required code unit flags, or a negative number
   7581   bcptr             pointer to the chain of currently open branches
   7582   cb                points to the data block with tables pointers etc.
   7583   lengthptr         NULL during the real compile phase
   7584                     points to length accumulator during pre-compile phase
   7585 
   7586 Returns:            TRUE on success
   7587 */
   7588 
   7589 static BOOL
   7590 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr,
   7591   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, uint32_t skipunits,
   7592   int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
   7593   uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
   7594   compile_block *cb, size_t *lengthptr)
   7595 {
   7596 PCRE2_SPTR ptr = *ptrptr;
   7597 PCRE2_UCHAR *code = *codeptr;
   7598 PCRE2_UCHAR *last_branch = code;
   7599 PCRE2_UCHAR *start_bracket = code;
   7600 PCRE2_UCHAR *reverse_count = NULL;
   7601 open_capitem capitem;
   7602 int capnumber = 0;
   7603 uint32_t firstcu, reqcu;
   7604 int32_t firstcuflags, reqcuflags;
   7605 uint32_t branchfirstcu, branchreqcu;
   7606 int32_t branchfirstcuflags, branchreqcuflags;
   7607 size_t length;
   7608 unsigned int orig_bracount;
   7609 unsigned int max_bracount;
   7610 branch_chain bc;
   7611 
   7612 /* If set, call the external function that checks for stack availability. */
   7613 
   7614 if (cb->cx->stack_guard != NULL &&
   7615     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
   7616   {
   7617   *errorcodeptr= ERR33;
   7618   return FALSE;
   7619   }
   7620 
   7621 /* Miscellaneous initialization */
   7622 
   7623 bc.outer = bcptr;
   7624 bc.current_branch = code;
   7625 
   7626 firstcu = reqcu = 0;
   7627 firstcuflags = reqcuflags = REQ_UNSET;
   7628 
   7629 /* Accumulate the length for use in the pre-compile phase. Start with the
   7630 length of the BRA and KET and any extra code units that are required at the
   7631 beginning. We accumulate in a local variable to save frequent testing of
   7632 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
   7633 start and end of each alternative, because compiled items are discarded during
   7634 the pre-compile phase so that the work space is not exceeded. */
   7635 
   7636 length = 2 + 2*LINK_SIZE + skipunits;
   7637 
   7638 /* WARNING: If the above line is changed for any reason, you must also change
   7639 the code that abstracts option settings at the start of the pattern and makes
   7640 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
   7641 pre-compile phase to find out whether or not anything has yet been compiled.
   7642 
   7643 If this is a capturing subpattern, add to the chain of open capturing items
   7644 so that we can detect them if (*ACCEPT) is encountered. This is also used to
   7645 detect groups that contain recursive back references to themselves. Note that
   7646 only OP_CBRA need be tested here; changing this opcode to one of its variants,
   7647 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
   7648 
   7649 if (*code == OP_CBRA)
   7650   {
   7651   capnumber = GET2(code, 1 + LINK_SIZE);
   7652   capitem.number = capnumber;
   7653   capitem.next = cb->open_caps;
   7654   capitem.flag = FALSE;
   7655   cb->open_caps = &capitem;
   7656   }
   7657 
   7658 /* Offset is set zero to mark that this bracket is still open */
   7659 
   7660 PUT(code, 1, 0);
   7661 code += 1 + LINK_SIZE + skipunits;
   7662 
   7663 /* Loop for each alternative branch */
   7664 
   7665 orig_bracount = max_bracount = cb->bracount;
   7666 
   7667 for (;;)
   7668   {
   7669   /* For a (?| group, reset the capturing bracket count so that each branch
   7670   uses the same numbers. */
   7671 
   7672   if (reset_bracount) cb->bracount = orig_bracount;
   7673 
   7674   /* Set up dummy OP_REVERSE if lookbehind assertion */
   7675 
   7676   if (lookbehind)
   7677     {
   7678     *code++ = OP_REVERSE;
   7679     reverse_count = code;
   7680     PUTINC(code, 0, 0);
   7681     length += 1 + LINK_SIZE;
   7682     }
   7683 
   7684   /* Now compile the branch; in the pre-compile phase its length gets added
   7685   into the length. */
   7686 
   7687   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstcu,
   7688         &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
   7689         cond_depth, cb, (lengthptr == NULL)? NULL : &length))
   7690     {
   7691     *ptrptr = ptr;
   7692     return FALSE;
   7693     }
   7694 
   7695   /* Keep the highest bracket count in case (?| was used and some branch
   7696   has fewer than the rest. */
   7697 
   7698   if (cb->bracount > max_bracount) max_bracount = cb->bracount;
   7699 
   7700   /* In the real compile phase, there is some post-processing to be done. */
   7701 
   7702   if (lengthptr == NULL)
   7703     {
   7704     /* If this is the first branch, the firstcu and reqcu values for the
   7705     branch become the values for the regex. */
   7706 
   7707     if (*last_branch != OP_ALT)
   7708       {
   7709       firstcu = branchfirstcu;
   7710       firstcuflags = branchfirstcuflags;
   7711       reqcu = branchreqcu;
   7712       reqcuflags = branchreqcuflags;
   7713       }
   7714 
   7715     /* If this is not the first branch, the first char and reqcu have to
   7716     match the values from all the previous branches, except that if the
   7717     previous value for reqcu didn't have REQ_VARY set, it can still match,
   7718     and we set REQ_VARY for the regex. */
   7719 
   7720     else
   7721       {
   7722       /* If we previously had a firstcu, but it doesn't match the new branch,
   7723       we have to abandon the firstcu for the regex, but if there was
   7724       previously no reqcu, it takes on the value of the old firstcu. */
   7725 
   7726       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
   7727         {
   7728         if (firstcuflags >= 0)
   7729           {
   7730           if (reqcuflags < 0)
   7731             {
   7732             reqcu = firstcu;
   7733             reqcuflags = firstcuflags;
   7734             }
   7735           }
   7736         firstcuflags = REQ_NONE;
   7737         }
   7738 
   7739       /* If we (now or from before) have no firstcu, a firstcu from the
   7740       branch becomes a reqcu if there isn't a branch reqcu. */
   7741 
   7742       if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
   7743           branchreqcuflags < 0)
   7744         {
   7745         branchreqcu = branchfirstcu;
   7746         branchreqcuflags = branchfirstcuflags;
   7747         }
   7748 
   7749       /* Now ensure that the reqcus match */
   7750 
   7751       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
   7752           reqcu != branchreqcu)
   7753         reqcuflags = REQ_NONE;
   7754       else
   7755         {
   7756         reqcu = branchreqcu;
   7757         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
   7758         }
   7759       }
   7760 
   7761     /* If lookbehind, check that this branch matches a fixed-length string, and
   7762     put the length into the OP_REVERSE item. Temporarily mark the end of the
   7763     branch with OP_END. If the branch contains OP_RECURSE, the result is
   7764     FFL_LATER (a negative value) because there may be forward references that
   7765     we can't check here. Set a flag to cause another lookbehind check at the
   7766     end. Why not do it all at the end? Because common errors can be picked up
   7767     here and the offset of the problem can be shown. */
   7768 
   7769     if (lookbehind)
   7770       {
   7771       int fixed_length;
   7772       int count = 0;
   7773       *code = OP_END;
   7774       fixed_length = find_fixedlength(last_branch,  (options & PCRE2_UTF) != 0,
   7775         FALSE, cb, NULL, &count);
   7776       if (fixed_length == FFL_LATER)
   7777         {
   7778         cb->check_lookbehind = TRUE;
   7779         }
   7780       else if (fixed_length < 0)
   7781         {
   7782         *errorcodeptr = fixed_length_errors[-fixed_length];
   7783         *ptrptr = ptr;
   7784         return FALSE;
   7785         }
   7786       else
   7787         {
   7788         if (fixed_length > cb->max_lookbehind)
   7789           cb->max_lookbehind = fixed_length;
   7790         PUT(reverse_count, 0, fixed_length);
   7791         }
   7792       }
   7793     }
   7794 
   7795   /* Reached end of expression, either ')' or end of pattern. In the real
   7796   compile phase, go back through the alternative branches and reverse the chain
   7797   of offsets, with the field in the BRA item now becoming an offset to the
   7798   first alternative. If there are no alternatives, it points to the end of the
   7799   group. The length in the terminating ket is always the length of the whole
   7800   bracketed item. Return leaving the pointer at the terminating char. */
   7801 
   7802   if (*ptr != CHAR_VERTICAL_LINE)
   7803     {
   7804     if (lengthptr == NULL)
   7805       {
   7806       size_t branch_length = code - last_branch;
   7807       do
   7808         {
   7809         size_t prev_length = GET(last_branch, 1);
   7810         PUT(last_branch, 1, branch_length);
   7811         branch_length = prev_length;
   7812         last_branch -= branch_length;
   7813         }
   7814       while (branch_length > 0);
   7815       }
   7816 
   7817     /* Fill in the ket */
   7818 
   7819     *code = OP_KET;
   7820     PUT(code, 1, (int)(code - start_bracket));
   7821     code += 1 + LINK_SIZE;
   7822 
   7823     /* If it was a capturing subpattern, check to see if it contained any
   7824     recursive back references. If so, we must wrap it in atomic brackets. In
   7825     any event, remove the block from the chain. */
   7826 
   7827     if (capnumber > 0)
   7828       {
   7829       if (cb->open_caps->flag)
   7830         {
   7831         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
   7832           CU2BYTES(code - start_bracket));
   7833         *start_bracket = OP_ONCE;
   7834         code += 1 + LINK_SIZE;
   7835         PUT(start_bracket, 1, (int)(code - start_bracket));
   7836         *code = OP_KET;
   7837         PUT(code, 1, (int)(code - start_bracket));
   7838         code += 1 + LINK_SIZE;
   7839         length += 2 + 2*LINK_SIZE;
   7840         }
   7841       cb->open_caps = cb->open_caps->next;
   7842       }
   7843 
   7844     /* Retain the highest bracket number, in case resetting was used. */
   7845 
   7846     cb->bracount = max_bracount;
   7847 
   7848     /* Set values to pass back */
   7849 
   7850     *codeptr = code;
   7851     *ptrptr = ptr;
   7852     *firstcuptr = firstcu;
   7853     *firstcuflagsptr = firstcuflags;
   7854     *reqcuptr = reqcu;
   7855     *reqcuflagsptr = reqcuflags;
   7856     if (lengthptr != NULL)
   7857       {
   7858       if (OFLOW_MAX - *lengthptr < length)
   7859         {
   7860         *errorcodeptr = ERR20;
   7861         return FALSE;
   7862         }
   7863       *lengthptr += length;
   7864       }
   7865     return TRUE;
   7866     }
   7867 
   7868   /* Another branch follows. In the pre-compile phase, we can move the code
   7869   pointer back to where it was for the start of the first branch. (That is,
   7870   pretend that each branch is the only one.)
   7871 
   7872   In the real compile phase, insert an ALT node. Its length field points back
   7873   to the previous branch while the bracket remains open. At the end the chain
   7874   is reversed. It's done like this so that the start of the bracket has a
   7875   zero offset until it is closed, making it possible to detect recursion. */
   7876 
   7877   if (lengthptr != NULL)
   7878     {
   7879     code = *codeptr + 1 + LINK_SIZE + skipunits;
   7880     length += 1 + LINK_SIZE;
   7881     }
   7882   else
   7883     {
   7884     *code = OP_ALT;
   7885     PUT(code, 1, (int)(code - last_branch));
   7886     bc.current_branch = last_branch = code;
   7887     code += 1 + LINK_SIZE;
   7888     }
   7889 
   7890   /* Advance past the vertical bar */
   7891 
   7892   ptr++;
   7893   }
   7894 /* Control never reaches here */
   7895 }
   7896 
   7897 
   7898 
   7899 /*************************************************
   7900 *          Check for anchored pattern            *
   7901 *************************************************/
   7902 
   7903 /* Try to find out if this is an anchored regular expression. Consider each
   7904 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
   7905 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
   7906 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
   7907 be found, because ^ generates OP_CIRCM in that mode.
   7908 
   7909 We can also consider a regex to be anchored if OP_SOM starts all its branches.
   7910 This is the code for \G, which means "match at start of match position, taking
   7911 into account the match offset".
   7912 
   7913 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
   7914 because that will try the rest of the pattern at all possible matching points,
   7915 so there is no point trying again.... er ....
   7916 
   7917 .... except when the .* appears inside capturing parentheses, and there is a
   7918 subsequent back reference to those parentheses. We haven't enough information
   7919 to catch that case precisely.
   7920 
   7921 At first, the best we could do was to detect when .* was in capturing brackets
   7922 and the highest back reference was greater than or equal to that level.
   7923 However, by keeping a bitmap of the first 31 back references, we can catch some
   7924 of the more common cases more precisely.
   7925 
   7926 ... A second exception is when the .* appears inside an atomic group, because
   7927 this prevents the number of characters it matches from being adjusted.
   7928 
   7929 Arguments:
   7930   code           points to start of the compiled pattern
   7931   bracket_map    a bitmap of which brackets we are inside while testing; this
   7932                    handles up to substring 31; after that we just have to take
   7933                    the less precise approach
   7934   cb             points to the compile data block
   7935   atomcount      atomic group level
   7936 
   7937 Returns:     TRUE or FALSE
   7938 */
   7939 
   7940 static BOOL
   7941 is_anchored(register PCRE2_SPTR code, unsigned int bracket_map,
   7942   compile_block *cb, int atomcount)
   7943 {
   7944 do {
   7945    PCRE2_SPTR scode = first_significant_code(
   7946      code + PRIV(OP_lengths)[*code], FALSE);
   7947    register int op = *scode;
   7948 
   7949    /* Non-capturing brackets */
   7950 
   7951    if (op == OP_BRA  || op == OP_BRAPOS ||
   7952        op == OP_SBRA || op == OP_SBRAPOS)
   7953      {
   7954      if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
   7955      }
   7956 
   7957    /* Capturing brackets */
   7958 
   7959    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
   7960             op == OP_SCBRA || op == OP_SCBRAPOS)
   7961      {
   7962      int n = GET2(scode, 1+LINK_SIZE);
   7963      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
   7964      if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE;
   7965      }
   7966 
   7967    /* Positive forward assertions and conditions */
   7968 
   7969    else if (op == OP_ASSERT || op == OP_COND)
   7970      {
   7971      if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
   7972      }
   7973 
   7974    /* Atomic groups */
   7975 
   7976    else if (op == OP_ONCE || op == OP_ONCE_NC)
   7977      {
   7978      if (!is_anchored(scode, bracket_map, cb, atomcount + 1))
   7979        return FALSE;
   7980      }
   7981 
   7982    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
   7983    it isn't in brackets that are or may be referenced or inside an atomic
   7984    group. There is also an option that disables auto-anchoring. */
   7985 
   7986    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
   7987              op == OP_TYPEPOSSTAR))
   7988      {
   7989      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
   7990          atomcount > 0 || cb->had_pruneorskip ||
   7991          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
   7992        return FALSE;
   7993      }
   7994 
   7995    /* Check for explicit anchoring */
   7996 
   7997    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
   7998 
   7999    code += GET(code, 1);
   8000    }
   8001 while (*code == OP_ALT);   /* Loop for each alternative */
   8002 return TRUE;
   8003 }
   8004 
   8005 
   8006 
   8007 /*************************************************
   8008 *         Check for starting with ^ or .*        *
   8009 *************************************************/
   8010 
   8011 /* This is called to find out if every branch starts with ^ or .* so that
   8012 "first char" processing can be done to speed things up in multiline
   8013 matching and for non-DOTALL patterns that start with .* (which must start at
   8014 the beginning or after \n). As in the case of is_anchored() (see above), we
   8015 have to take account of back references to capturing brackets that contain .*
   8016 because in that case we can't make the assumption. Also, the appearance of .*
   8017 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
   8018 count, because once again the assumption no longer holds.
   8019 
   8020 Arguments:
   8021   code           points to start of the compiled pattern or a group
   8022   bracket_map    a bitmap of which brackets we are inside while testing; this
   8023                    handles up to substring 31; after that we just have to take
   8024                    the less precise approach
   8025   cb             points to the compile data
   8026   atomcount      atomic group level
   8027 
   8028 Returns:         TRUE or FALSE
   8029 */
   8030 
   8031 static BOOL
   8032 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
   8033   int atomcount)
   8034 {
   8035 do {
   8036    PCRE2_SPTR scode = first_significant_code(
   8037      code + PRIV(OP_lengths)[*code], FALSE);
   8038    register int op = *scode;
   8039 
   8040    /* If we are at the start of a conditional assertion group, *both* the
   8041    conditional assertion *and* what follows the condition must satisfy the test
   8042    for start of line. Other kinds of condition fail. Note that there may be an
   8043    auto-callout at the start of a condition. */
   8044 
   8045    if (op == OP_COND)
   8046      {
   8047      scode += 1 + LINK_SIZE;
   8048 
   8049      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
   8050        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
   8051 
   8052      switch (*scode)
   8053        {
   8054        case OP_CREF:
   8055        case OP_DNCREF:
   8056        case OP_RREF:
   8057        case OP_DNRREF:
   8058        case OP_FAIL:
   8059        case OP_FALSE:
   8060        case OP_TRUE:
   8061        return FALSE;
   8062 
   8063        default:     /* Assertion */
   8064        if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
   8065        do scode += GET(scode, 1); while (*scode == OP_ALT);
   8066        scode += 1 + LINK_SIZE;
   8067        break;
   8068        }
   8069      scode = first_significant_code(scode, FALSE);
   8070      op = *scode;
   8071      }
   8072 
   8073    /* Non-capturing brackets */
   8074 
   8075    if (op == OP_BRA  || op == OP_BRAPOS ||
   8076        op == OP_SBRA || op == OP_SBRAPOS)
   8077      {
   8078      if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
   8079      }
   8080 
   8081    /* Capturing brackets */
   8082 
   8083    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
   8084             op == OP_SCBRA || op == OP_SCBRAPOS)
   8085      {
   8086      int n = GET2(scode, 1+LINK_SIZE);
   8087      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
   8088      if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
   8089      }
   8090 
   8091    /* Positive forward assertions */
   8092 
   8093    else if (op == OP_ASSERT)
   8094      {
   8095      if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
   8096      }
   8097 
   8098    /* Atomic brackets */
   8099 
   8100    else if (op == OP_ONCE || op == OP_ONCE_NC)
   8101      {
   8102      if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
   8103      }
   8104 
   8105    /* .* means "start at start or after \n" if it isn't in atomic brackets or
   8106    brackets that may be referenced, as long as the pattern does not contain
   8107    *PRUNE or *SKIP, because these break the feature. Consider, for example,
   8108    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
   8109    start of a line. There is also an option that disables this optimization. */
   8110 
   8111    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
   8112      {
   8113      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
   8114          atomcount > 0 || cb->had_pruneorskip ||
   8115          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
   8116        return FALSE;
   8117      }
   8118 
   8119    /* Check for explicit circumflex; anything else gives a FALSE result. Note
   8120    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
   8121    because the number of characters matched by .* cannot be adjusted inside
   8122    them. */
   8123 
   8124    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
   8125 
   8126    /* Move on to the next alternative */
   8127 
   8128    code += GET(code, 1);
   8129    }
   8130 while (*code == OP_ALT);  /* Loop for each alternative */
   8131 return TRUE;
   8132 }
   8133 
   8134 
   8135 
   8136 /*************************************************
   8137 *    Check for asserted fixed first code unit    *
   8138 *************************************************/
   8139 
   8140 /* During compilation, the "first code unit" settings from forward assertions
   8141 are discarded, because they can cause conflicts with actual literals that
   8142 follow. However, if we end up without a first code unit setting for an
   8143 unanchored pattern, it is worth scanning the regex to see if there is an
   8144 initial asserted first code unit. If all branches start with the same asserted
   8145 code unit, or with a non-conditional bracket all of whose alternatives start
   8146 with the same asserted code unit (recurse ad lib), then we return that code
   8147 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
   8148 REQ_NONE in the flags.
   8149 
   8150 Arguments:
   8151   code       points to start of compiled pattern
   8152   flags      points to the first code unit flags
   8153   inassert   TRUE if in an assertion
   8154 
   8155 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
   8156 */
   8157 
   8158 static uint32_t
   8159 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, BOOL inassert)
   8160 {
   8161 register uint32_t c = 0;
   8162 int cflags = REQ_NONE;
   8163 
   8164 *flags = REQ_NONE;
   8165 do {
   8166    uint32_t d;
   8167    int dflags;
   8168    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
   8169              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
   8170    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
   8171    register PCRE2_UCHAR op = *scode;
   8172 
   8173    switch(op)
   8174      {
   8175      default:
   8176      return 0;
   8177 
   8178      case OP_BRA:
   8179      case OP_BRAPOS:
   8180      case OP_CBRA:
   8181      case OP_SCBRA:
   8182      case OP_CBRAPOS:
   8183      case OP_SCBRAPOS:
   8184      case OP_ASSERT:
   8185      case OP_ONCE:
   8186      case OP_ONCE_NC:
   8187      d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT);
   8188      if (dflags < 0)
   8189        return 0;
   8190      if (cflags < 0) { c = d; cflags = dflags; }
   8191        else if (c != d || cflags != dflags) return 0;
   8192      break;
   8193 
   8194      case OP_EXACT:
   8195      scode += IMM2_SIZE;
   8196      /* Fall through */
   8197 
   8198      case OP_CHAR:
   8199      case OP_PLUS:
   8200      case OP_MINPLUS:
   8201      case OP_POSPLUS:
   8202      if (!inassert) return 0;
   8203      if (cflags < 0) { c = scode[1]; cflags = 0; }
   8204        else if (c != scode[1]) return 0;
   8205      break;
   8206 
   8207      case OP_EXACTI:
   8208      scode += IMM2_SIZE;
   8209      /* Fall through */
   8210 
   8211      case OP_CHARI:
   8212      case OP_PLUSI:
   8213      case OP_MINPLUSI:
   8214      case OP_POSPLUSI:
   8215      if (!inassert) return 0;
   8216      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
   8217        else if (c != scode[1]) return 0;
   8218      break;
   8219      }
   8220 
   8221    code += GET(code, 1);
   8222    }
   8223 while (*code == OP_ALT);
   8224 
   8225 *flags = cflags;
   8226 return c;
   8227 }
   8228 
   8229 
   8230 
   8231 /*************************************************
   8232 *     Add an entry to the name/number table      *
   8233 *************************************************/
   8234 
   8235 /* This function is called between compiling passes to add an entry to the
   8236 name/number table, maintaining alphabetical order. Checking for permitted
   8237 and forbidden duplicates has already been done.
   8238 
   8239 Arguments:
   8240   cb           the compile data block
   8241   name         the name to add
   8242   length       the length of the name
   8243   groupno      the group number
   8244 
   8245 Returns:       nothing
   8246 */
   8247 
   8248 static void
   8249 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
   8250   unsigned int groupno)
   8251 {
   8252 int i;
   8253 PCRE2_UCHAR *slot = cb->name_table;
   8254 
   8255 for (i = 0; i < cb->names_found; i++)
   8256   {
   8257   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
   8258   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
   8259     crc = -1; /* Current name is a substring */
   8260 
   8261   /* Make space in the table and break the loop for an earlier name. For a
   8262   duplicate or later name, carry on. We do this for duplicates so that in the
   8263   simple case (when ?(| is not used) they are in order of their numbers. In all
   8264   cases they are in the order in which they appear in the pattern. */
   8265 
   8266   if (crc < 0)
   8267     {
   8268     memmove(slot + cb->name_entry_size, slot,
   8269       CU2BYTES((cb->names_found - i) * cb->name_entry_size));
   8270     break;
   8271     }
   8272 
   8273   /* Continue the loop for a later or duplicate name */
   8274 
   8275   slot += cb->name_entry_size;
   8276   }
   8277 
   8278 PUT2(slot, 0, groupno);
   8279 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
   8280 cb->names_found++;
   8281 
   8282 /* Add a terminating zero and fill the rest of the slot with zeroes so that
   8283 the memory is all initialized. Otherwise valgrind moans about uninitialized
   8284 memory when saving serialized compiled patterns. */
   8285 
   8286 memset(slot + IMM2_SIZE + length, 0,
   8287   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
   8288 }
   8289 
   8290 
   8291 
   8292 /*************************************************
   8293 *     External function to compile a pattern     *
   8294 *************************************************/
   8295 
   8296 /* This function reads a regular expression in the form of a string and returns
   8297 a pointer to a block of store holding a compiled version of the expression.
   8298 
   8299 Arguments:
   8300   pattern       the regular expression
   8301   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
   8302   options       option bits
   8303   errorptr      pointer to errorcode
   8304   erroroffset   pointer to error offset
   8305   ccontext      points to a compile context or is NULL
   8306 
   8307 Returns:        pointer to compiled data block, or NULL on error,
   8308                 with errorcode and erroroffset set
   8309 */
   8310 
   8311 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
   8312 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
   8313    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
   8314 {
   8315 BOOL utf;                               /* Set TRUE for UTF mode */
   8316 pcre2_real_code *re = NULL;             /* What we will return */
   8317 compile_block cb;                       /* "Static" compile-time data */
   8318 const uint8_t *tables;                  /* Char tables base pointer */
   8319 
   8320 PCRE2_UCHAR *code;                      /* Current pointer in compiled code */
   8321 PCRE2_SPTR codestart;                   /* Start of compiled code */
   8322 PCRE2_SPTR ptr;                         /* Current pointer in pattern */
   8323 
   8324 size_t length = 1;                      /* Allow or final END opcode */
   8325 size_t usedlength;                      /* Actual length used */
   8326 size_t re_blocksize;                    /* Size of memory block */
   8327 
   8328 int32_t firstcuflags, reqcuflags;       /* Type of first/req code unit */
   8329 uint32_t firstcu, reqcu;                /* Value of first/req code unit */
   8330 uint32_t setflags = 0;                  /* NL and BSR set flags */
   8331 
   8332 uint32_t skipatstart;                   /* When checking (*UTF) etc */
   8333 uint32_t limit_match = UINT32_MAX;      /* Unset match limits */
   8334 uint32_t limit_recursion = UINT32_MAX;
   8335 
   8336 int newline = 0;                        /* Unset; can be set by the pattern */
   8337 int bsr = 0;                            /* Unset; can be set by the pattern */
   8338 int errorcode = 0;                      /* Initialize to avoid compiler warn */
   8339 
   8340 /* Comments at the head of this file explain about these variables. */
   8341 
   8342 PCRE2_UCHAR *copied_pattern = NULL;
   8343 PCRE2_UCHAR stack_copied_pattern[COPIED_PATTERN_SIZE];
   8344 named_group named_groups[NAMED_GROUP_LIST_SIZE];
   8345 
   8346 /* The workspace is used in different ways in the different compiling phases.
   8347 It needs to be 16-bit aligned for the preliminary group scan, and 32-bit
   8348 aligned for the group information cache. */
   8349 
   8350 uint32_t c32workspace[C32_WORK_SIZE];
   8351 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c32workspace;
   8352 
   8353 
   8354 /* -------------- Check arguments and set up the pattern ----------------- */
   8355 
   8356 /* There must be error code and offset pointers. */
   8357 
   8358 if (errorptr == NULL || erroroffset == NULL) return NULL;
   8359 *errorptr = ERR0;
   8360 *erroroffset = 0;
   8361 
   8362 /* There must be a pattern! */
   8363 
   8364 if (pattern == NULL)
   8365   {
   8366   *errorptr = ERR16;
   8367   return NULL;
   8368   }
   8369 
   8370 /* Check that all undefined public option bits are zero. */
   8371 
   8372 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
   8373   {
   8374   *errorptr = ERR17;
   8375   return NULL;
   8376   }
   8377 
   8378 /* A NULL compile context means "use a default context" */
   8379 
   8380 if (ccontext == NULL)
   8381   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
   8382 
   8383 /* A zero-terminated pattern is indicated by the special length value
   8384 PCRE2_ZERO_TERMINATED. Otherwise, we make a copy of the pattern and add a zero,
   8385 to ensure that it is always possible to look one code unit beyond the end of
   8386 the pattern's characters. In both cases, check whether the pattern is overlong. */
   8387 
   8388 if (patlen == PCRE2_ZERO_TERMINATED)
   8389   {
   8390   patlen = PRIV(strlen)(pattern);
   8391   if (patlen > ccontext->max_pattern_length)
   8392     {
   8393     *errorptr = ERR88;
   8394     return NULL;
   8395     }
   8396   }
   8397 else
   8398   {
   8399   if (patlen > ccontext->max_pattern_length)
   8400     {
   8401     *errorptr = ERR88;
   8402     return NULL;
   8403     }
   8404   if (patlen < COPIED_PATTERN_SIZE)
   8405     copied_pattern = stack_copied_pattern;
   8406   else
   8407     {
   8408     copied_pattern = ccontext->memctl.malloc(CU2BYTES(patlen + 1),
   8409       ccontext->memctl.memory_data);
   8410     if (copied_pattern == NULL)
   8411       {
   8412       *errorptr = ERR21;
   8413       return NULL;
   8414       }
   8415     }
   8416   memcpy(copied_pattern, pattern, CU2BYTES(patlen));
   8417   copied_pattern[patlen] = 0;
   8418   pattern = copied_pattern;
   8419   }
   8420 
   8421 /* ------------ Initialize the "static" compile data -------------- */
   8422 
   8423 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
   8424 
   8425 cb.lcc = tables + lcc_offset;          /* Individual */
   8426 cb.fcc = tables + fcc_offset;          /*   character */
   8427 cb.cbits = tables + cbits_offset;      /*      tables */
   8428 cb.ctypes = tables + ctypes_offset;
   8429 
   8430 cb.assert_depth = 0;
   8431 cb.bracount = cb.final_bracount = 0;
   8432 cb.cx = ccontext;
   8433 cb.dupnames = FALSE;
   8434 cb.end_pattern = pattern + patlen;
   8435 cb.nestptr[0] = cb.nestptr[1] = NULL;
   8436 cb.external_flags = 0;
   8437 cb.external_options = options;
   8438 cb.groupinfo = c32workspace;
   8439 cb.had_recurse = FALSE;
   8440 cb.iscondassert = FALSE;
   8441 cb.max_lookbehind = 0;
   8442 cb.name_entry_size = 0;
   8443 cb.name_table = NULL;
   8444 cb.named_groups = named_groups;
   8445 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
   8446 cb.names_found = 0;
   8447 cb.open_caps = NULL;
   8448 cb.parens_depth = 0;
   8449 cb.req_varyopt = 0;
   8450 cb.start_code = cworkspace;
   8451 cb.start_pattern = pattern;
   8452 cb.start_workspace = cworkspace;
   8453 cb.workspace_size = COMPILE_WORK_SIZE;
   8454 
   8455 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
   8456 references to help in deciding whether (.*) can be treated as anchored or not.
   8457 */
   8458 
   8459 cb.top_backref = 0;
   8460 cb.backref_map = 0;
   8461 
   8462 /* --------------- Start looking at the pattern --------------- */
   8463 
   8464 /* Check for global one-time option settings at the start of the pattern, and
   8465 remember the offset to the actual regex. */
   8466 
   8467 ptr = pattern;
   8468 skipatstart = 0;
   8469 
   8470 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
   8471        ptr[skipatstart+1] == CHAR_ASTERISK)
   8472   {
   8473   unsigned int i;
   8474   for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
   8475     {
   8476     pso *p = pso_list + i;
   8477 
   8478     if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0)
   8479       {
   8480       uint32_t c, pp;
   8481 
   8482       skipatstart += p->length + 2;
   8483       switch(p->type)
   8484         {
   8485         case PSO_OPT:
   8486         cb.external_options |= p->value;
   8487         break;
   8488 
   8489         case PSO_FLG:
   8490         setflags |= p->value;
   8491         break;
   8492 
   8493         case PSO_NL:
   8494         newline = p->value;
   8495         setflags |= PCRE2_NL_SET;
   8496         break;
   8497 
   8498         case PSO_BSR:
   8499         bsr = p->value;
   8500         setflags |= PCRE2_BSR_SET;
   8501         break;
   8502 
   8503         case PSO_LIMM:
   8504         case PSO_LIMR:
   8505         c = 0;
   8506         pp = skipatstart;
   8507         if (!IS_DIGIT(ptr[pp]))
   8508           {
   8509           errorcode = ERR60;
   8510           ptr += pp;
   8511           goto HAD_ERROR;
   8512           }
   8513         while (IS_DIGIT(ptr[pp]))
   8514           {
   8515           if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
   8516           c = c*10 + (ptr[pp++] - CHAR_0);
   8517           }
   8518         if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
   8519           {
   8520           errorcode = ERR60;
   8521           ptr += pp;
   8522           goto HAD_ERROR;
   8523           }
   8524         if (p->type == PSO_LIMM) limit_match = c;
   8525           else limit_recursion = c;
   8526         skipatstart += pp - skipatstart;
   8527         break;
   8528         }
   8529       break;   /* Out of the table scan loop */
   8530       }
   8531     }
   8532   if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
   8533   }
   8534 
   8535 /* End of pattern-start options; advance to start of real regex. */
   8536 
   8537 ptr += skipatstart;
   8538 
   8539 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
   8540 
   8541 #ifndef SUPPORT_UNICODE
   8542 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
   8543   {
   8544   errorcode = ERR32;
   8545   goto HAD_ERROR;
   8546   }
   8547 #endif
   8548 
   8549 /* Check UTF. We have the original options in 'options', with that value as
   8550 modified by (*UTF) etc in cb->external_options. */
   8551 
   8552 utf = (cb.external_options & PCRE2_UTF) != 0;
   8553 if (utf)
   8554   {
   8555   if ((options & PCRE2_NEVER_UTF) != 0)
   8556     {
   8557     errorcode = ERR74;
   8558     goto HAD_ERROR;
   8559     }
   8560   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
   8561        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
   8562     goto HAD_UTF_ERROR;
   8563   }
   8564 
   8565 /* Check UCP lockout. */
   8566 
   8567 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
   8568     (PCRE2_UCP|PCRE2_NEVER_UCP))
   8569   {
   8570   errorcode = ERR75;
   8571   goto HAD_ERROR;
   8572   }
   8573 
   8574 /* Process the BSR setting. */
   8575 
   8576 if (bsr == 0) bsr = ccontext->bsr_convention;
   8577 
   8578 /* Process the newline setting. */
   8579 
   8580 if (newline == 0) newline = ccontext->newline_convention;
   8581 cb.nltype = NLTYPE_FIXED;
   8582 switch(newline)
   8583   {
   8584   case PCRE2_NEWLINE_CR:
   8585   cb.nllen = 1;
   8586   cb.nl[0] = CHAR_CR;
   8587   break;
   8588 
   8589   case PCRE2_NEWLINE_LF:
   8590   cb.nllen = 1;
   8591   cb.nl[0] = CHAR_NL;
   8592   break;
   8593 
   8594   case PCRE2_NEWLINE_CRLF:
   8595   cb.nllen = 2;
   8596   cb.nl[0] = CHAR_CR;
   8597   cb.nl[1] = CHAR_NL;
   8598   break;
   8599 
   8600   case PCRE2_NEWLINE_ANY:
   8601   cb.nltype = NLTYPE_ANY;
   8602   break;
   8603 
   8604   case PCRE2_NEWLINE_ANYCRLF:
   8605   cb.nltype = NLTYPE_ANYCRLF;
   8606   break;
   8607 
   8608   default:
   8609   errorcode = ERR56;
   8610   goto HAD_ERROR;
   8611   }
   8612 
   8613 /* Before we do anything else, do a pre-scan of the pattern in order to
   8614 discover the named groups and their numerical equivalents, so that this
   8615 information is always available for the remaining processing. */
   8616 
   8617 errorcode = scan_for_captures(&ptr, cb.external_options, &cb);
   8618 if (errorcode != 0) goto HAD_ERROR;
   8619 
   8620 /* For obscure debugging this code can be enabled. */
   8621 
   8622 #if 0
   8623   {
   8624   int i;
   8625   named_group *ng = cb.named_groups;
   8626   fprintf(stderr, "+++Captures: %d\n", cb.final_bracount);
   8627   for (i = 0; i < cb.names_found; i++, ng++)
   8628     {
   8629     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
   8630     }
   8631   }
   8632 #endif
   8633 
   8634 /* Reset current bracket count to zero and current pointer to the start of the
   8635 pattern. */
   8636 
   8637 cb.bracount = 0;
   8638 ptr = pattern + skipatstart;
   8639 
   8640 /* Pretend to compile the pattern while actually just accumulating the amount
   8641 of memory required in the 'length' variable. This behaviour is triggered by
   8642 passing a non-NULL final argument to compile_regex(). We pass a block of
   8643 workspace (cworkspace) for it to compile parts of the pattern into; the
   8644 compiled code is discarded when it is no longer needed, so hopefully this
   8645 workspace will never overflow, though there is a test for its doing so.
   8646 
   8647 On error, errorcode will be set non-zero, so we don't need to look at the
   8648 result of the function. The initial options have been put into the cb block so
   8649 that they can be changed if an option setting is found within the regex right
   8650 at the beginning. Bringing initial option settings outside can help speed up
   8651 starting point checks. We still have to pass a separate options variable (the
   8652 first argument) because that may change as the pattern is processed. */
   8653 
   8654 code = cworkspace;
   8655 *code = OP_BRA;
   8656 
   8657 (void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE,
   8658   FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
   8659   &cb, &length);
   8660 
   8661 if (errorcode != 0) goto HAD_ERROR;
   8662 if (length > MAX_PATTERN_SIZE)
   8663   {
   8664   errorcode = ERR20;
   8665   goto HAD_ERROR;
   8666   }
   8667 
   8668 /* Compute the size of, and then get and initialize, the data block for storing
   8669 the compiled pattern and names table. Integer overflow should no longer be
   8670 possible because nowadays we limit the maximum value of cb.names_found and
   8671 cb.name_entry_size. */
   8672 
   8673 re_blocksize = sizeof(pcre2_real_code) +
   8674   CU2BYTES(length + cb.names_found * cb.name_entry_size);
   8675 re = (pcre2_real_code *)
   8676   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
   8677 if (re == NULL)
   8678   {
   8679   errorcode = ERR21;
   8680   goto HAD_ERROR;
   8681   }
   8682 
   8683 re->memctl = ccontext->memctl;
   8684 re->tables = tables;
   8685 re->executable_jit = NULL;
   8686 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
   8687 re->blocksize = re_blocksize;
   8688 re->magic_number = MAGIC_NUMBER;
   8689 re->compile_options = options;
   8690 re->overall_options = cb.external_options;
   8691 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
   8692 re->limit_match = limit_match;
   8693 re->limit_recursion = limit_recursion;
   8694 re->first_codeunit = 0;
   8695 re->last_codeunit = 0;
   8696 re->bsr_convention = bsr;
   8697 re->newline_convention = newline;
   8698 re->max_lookbehind = 0;
   8699 re->minlength = 0;
   8700 re->top_bracket = 0;
   8701 re->top_backref = 0;
   8702 re->name_entry_size = cb.name_entry_size;
   8703 re->name_count = cb.names_found;
   8704 
   8705 /* The basic block is immediately followed by the name table, and the compiled
   8706 code follows after that. */
   8707 
   8708 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
   8709   re->name_entry_size * re->name_count;
   8710 
   8711 /* Workspace is needed to remember information about numbered groups: whether a
   8712 group can match an empty string and what its fixed length is. This is done to
   8713 avoid the possibility of recursive references causing very long compile times
   8714 when checking these features. Unnumbered groups do not have this exposure since
   8715 they cannot be referenced. We use an indexed vector for this purpose. If there
   8716 are sufficiently few groups, it can be the c32workspace vector, as set up
   8717 above. Otherwise we have to get/free a special vector. The vector must be
   8718 initialized to zero. */
   8719 
   8720 if (cb.final_bracount >= C32_WORK_SIZE)
   8721   {
   8722   cb.groupinfo = ccontext->memctl.malloc(
   8723     (cb.final_bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
   8724   if (cb.groupinfo == NULL)
   8725     {
   8726     errorcode = ERR21;
   8727     goto HAD_ERROR;
   8728     }
   8729   }
   8730 memset(cb.groupinfo, 0, (cb.final_bracount + 1) * sizeof(uint32_t));
   8731 
   8732 /* Update the compile data block for the actual compile. The starting points of
   8733 the name/number translation table and of the code are passed around in the
   8734 compile data block. The start/end pattern and initial options are already set
   8735 from the pre-compile phase, as is the name_entry_size field. Reset the bracket
   8736 count and the names_found field. */
   8737 
   8738 cb.parens_depth = 0;
   8739 cb.assert_depth = 0;
   8740 cb.bracount = 0;
   8741 cb.max_lookbehind = 0;
   8742 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
   8743 cb.start_code = codestart;
   8744 cb.iscondassert = FALSE;
   8745 cb.req_varyopt = 0;
   8746 cb.had_accept = FALSE;
   8747 cb.had_pruneorskip = FALSE;
   8748 cb.check_lookbehind = FALSE;
   8749 cb.open_caps = NULL;
   8750 
   8751 /* If any named groups were found, create the name/number table from the list
   8752 created in the pre-pass. */
   8753 
   8754 if (cb.names_found > 0)
   8755   {
   8756   int i = cb.names_found;
   8757   named_group *ng = cb.named_groups;
   8758   cb.names_found = 0;
   8759   for (; i > 0; i--, ng++)
   8760     add_name_to_table(&cb, ng->name, ng->length, ng->number);
   8761   }
   8762 
   8763 /* Set up a starting, non-extracting bracket, then compile the expression. On
   8764 error, errorcode will be set non-zero, so we don't need to look at the result
   8765 of the function here. */
   8766 
   8767 ptr = pattern + skipatstart;
   8768 code = (PCRE2_UCHAR *)codestart;
   8769 *code = OP_BRA;
   8770 (void)compile_regex(re->overall_options, &code, &ptr, &errorcode, FALSE, FALSE,
   8771    0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
   8772 
   8773 re->top_bracket = cb.bracount;
   8774 re->top_backref = cb.top_backref;
   8775 re->max_lookbehind = cb.max_lookbehind;
   8776 
   8777 if (cb.had_accept)
   8778   {
   8779   reqcu = 0;              /* Must disable after (*ACCEPT) */
   8780   reqcuflags = REQ_NONE;
   8781   }
   8782 
   8783 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
   8784 but the estimated length exceeds the really used length, adjust the value of
   8785 re->blocksize, and if valgrind support is configured, mark the extra allocated
   8786 memory as unaddressable, so that any out-of-bound reads can be detected. */
   8787 
   8788 *code++ = OP_END;
   8789 usedlength = code - codestart;
   8790 if (usedlength > length) errorcode = ERR23; else
   8791   {
   8792   re->blocksize -= CU2BYTES(length - usedlength);
   8793 #ifdef SUPPORT_VALGRIND
   8794   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
   8795 #endif
   8796   }
   8797 
   8798 /* Scan the pattern for recursion/subroutine calls and convert the group
   8799 numbers into offsets. Maintain a small cache so that repeated groups containing
   8800 recursions are efficiently handled. */
   8801 
   8802 #define RSCAN_CACHE_SIZE 8
   8803 
   8804 if (errorcode == 0 && cb.had_recurse)
   8805   {
   8806   PCRE2_UCHAR *rcode;
   8807   PCRE2_SPTR rgroup;
   8808   int ccount = 0;
   8809   int start = RSCAN_CACHE_SIZE;
   8810   recurse_cache rc[RSCAN_CACHE_SIZE];
   8811 
   8812   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
   8813        rcode != NULL;
   8814        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
   8815     {
   8816     int i, p, recno;
   8817 
   8818     recno = (int)GET(rcode, 1);
   8819     if (recno == 0) rgroup = codestart; else
   8820       {
   8821       PCRE2_SPTR search_from = codestart;
   8822       rgroup = NULL;
   8823       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
   8824         {
   8825         if (recno == rc[p].recno)
   8826           {
   8827           rgroup = rc[p].group;
   8828           break;
   8829           }
   8830 
   8831         /* Group n+1 must always start to the right of group n, so we can save
   8832         search time below when the new group number is greater than any of the
   8833         previously found groups. */
   8834 
   8835         if (recno > rc[p].recno) search_from = rc[p].group;
   8836         }
   8837 
   8838       if (rgroup == NULL)
   8839         {
   8840         rgroup = PRIV(find_bracket)(search_from, utf, recno);
   8841         if (rgroup == NULL)
   8842           {
   8843           errorcode = ERR53;
   8844           break;
   8845           }
   8846         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
   8847         rc[start].recno = recno;
   8848         rc[start].group = rgroup;
   8849         if (ccount < RSCAN_CACHE_SIZE) ccount++;
   8850         }
   8851       }
   8852 
   8853     PUT(rcode, 1, rgroup - codestart);
   8854     }
   8855   }
   8856 
   8857 /* In rare debugging situations we sometimes need to look at the compiled code
   8858 at this stage. */
   8859 
   8860 #ifdef CALL_PRINTINT
   8861 pcre2_printint(re, stderr, TRUE);
   8862 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
   8863 #endif
   8864 
   8865 /* After a successful compile, give an error if there's a back reference to a
   8866 non-existent capturing subpattern. Then, unless disabled, check whether any
   8867 single character iterators can be auto-possessified. The function overwrites
   8868 the appropriate opcode values, so the type of the pointer must be cast. NOTE:
   8869 the intermediate variable "temp" is used in this code because at least one
   8870 compiler gives a warning about loss of "const" attribute if the cast
   8871 (PCRE2_UCHAR *)codestart is used directly in the function call. */
   8872 
   8873 if (errorcode == 0)
   8874   {
   8875   if (re->top_backref > re->top_bracket) errorcode = ERR15;
   8876   else if ((re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
   8877     {
   8878     PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
   8879     if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
   8880     }
   8881   }
   8882 
   8883 /* If there were any lookbehind assertions that contained OP_RECURSE
   8884 (recursions or subroutine calls), a flag is set for them to be checked here,
   8885 because they may contain forward references. Actual recursions cannot be fixed
   8886 length, but subroutine calls can. It is done like this so that those without
   8887 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
   8888 exceptional ones forgo this. We scan the pattern to check that they are fixed
   8889 length, and set their lengths. */
   8890 
   8891 if (errorcode == 0 && cb.check_lookbehind)
   8892   {
   8893   PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart;
   8894 
   8895   /* Loop, searching for OP_REVERSE items, and process those that do not have
   8896   their length set. (Actually, it will also re-process any that have a length
   8897   of zero, but that is a pathological case, and it does no harm.) When we find
   8898   one, we temporarily terminate the branch it is in while we scan it. Note that
   8899   calling find_bracket() with a negative group number returns a pointer to the
   8900   OP_REVERSE item, not the actual lookbehind. */
   8901 
   8902   for (cc = (PCRE2_UCHAR *)PRIV(find_bracket)(codestart, utf, -1);
   8903        cc != NULL;
   8904        cc = (PCRE2_UCHAR *)PRIV(find_bracket)(cc, utf, -1))
   8905     {
   8906     if (GET(cc, 1) == 0)
   8907       {
   8908       int fixed_length;
   8909       int count = 0;
   8910       PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
   8911       int end_op = *be;
   8912       *be = OP_END;
   8913       fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL, &count);
   8914       *be = end_op;
   8915       if (fixed_length < 0)
   8916         {
   8917         errorcode = fixed_length_errors[-fixed_length];
   8918         break;
   8919         }
   8920       if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length;
   8921       PUT(cc, 1, fixed_length);
   8922       }
   8923     cc += 1 + LINK_SIZE;
   8924     }
   8925 
   8926   /* The previous value of the maximum lookbehind was transferred to the
   8927   compiled regex block above. We could have updated this value in the loop
   8928   above, but keep the two values in step, just in case some later code below
   8929   uses the cb value. */
   8930 
   8931   re->max_lookbehind = cb.max_lookbehind;
   8932   }
   8933 
   8934 /* Failed to compile, or error while post-processing. Earlier errors get here
   8935 via the dreaded goto. */
   8936 
   8937 if (errorcode != 0)
   8938   {
   8939   HAD_ERROR:
   8940   *erroroffset = (int)(ptr - pattern);
   8941   HAD_UTF_ERROR:
   8942   *errorptr = errorcode;
   8943   pcre2_code_free(re);
   8944   re = NULL;
   8945   goto EXIT;
   8946   }
   8947 
   8948 /* Successful compile. If the anchored option was not passed, set it if
   8949 we can determine that the pattern is anchored by virtue of ^ characters or \A
   8950 or anything else, such as starting with non-atomic .* when DOTALL is set and
   8951 there are no occurrences of *PRUNE or *SKIP (though there is an option to
   8952 disable this case). */
   8953 
   8954 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
   8955      is_anchored(codestart, 0, &cb, 0))
   8956   re->overall_options |= PCRE2_ANCHORED;
   8957 
   8958 /* If the pattern is still not anchored and we do not have a first code unit,
   8959 see if there is one that is asserted (these are not saved during the compile
   8960 because they can cause conflicts with actual literals that follow). This code
   8961 need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
   8962 create will not be used. */
   8963 
   8964 if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
   8965   {
   8966   if (firstcuflags < 0)
   8967     firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);
   8968 
   8969   /* Save the data for a first code unit. */
   8970 
   8971   if (firstcuflags >= 0)
   8972     {
   8973     re->first_codeunit = firstcu;
   8974     re->flags |= PCRE2_FIRSTSET;
   8975 
   8976     /* Handle caseless first code units. */
   8977 
   8978     if ((firstcuflags & REQ_CASELESS) != 0)
   8979       {
   8980       if (firstcu < 128 || (!utf && firstcu < 255))
   8981         {
   8982         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
   8983         }
   8984 
   8985       /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
   8986       8-bit UTF mode, codepoints in the range 128-255 are introductory code
   8987       points and cannot have another case. In 16-bit and 32-bit modes, we can
   8988       check wide characters when UTF (and therefore UCP) is supported. */
   8989 
   8990 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
   8991       else if (firstcu <= MAX_UTF_CODE_POINT &&
   8992                UCD_OTHERCASE(firstcu) != firstcu)
   8993         re->flags |= PCRE2_FIRSTCASELESS;
   8994 #endif
   8995       }
   8996     }
   8997 
   8998   /* When there is no first code unit, see if we can set the PCRE2_STARTLINE
   8999   flag. This is helpful for multiline matches when all branches start with ^
   9000   and also when all branches start with non-atomic .* for non-DOTALL matches
   9001 when *PRUNE and *SKIP are not present. (There is an option that disables this
   9002   case.) */
   9003 
   9004   else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
   9005   }
   9006 
   9007 /* Handle the "required code unit", if one is set. In the case of an anchored
   9008 pattern, do this only if it follows a variable length item in the pattern.
   9009 Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
   9010 
   9011 if (reqcuflags >= 0 &&
   9012      ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
   9013       (reqcuflags & REQ_VARY) != 0))
   9014   {
   9015   re->last_codeunit = reqcu;
   9016   re->flags |= PCRE2_LASTSET;
   9017 
   9018   /* Handle caseless required code units as for first code units (above). */
   9019 
   9020   if ((reqcuflags & REQ_CASELESS) != 0)
   9021     {
   9022     if (reqcu < 128 || (!utf && reqcu < 255))
   9023       {
   9024       if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
   9025       }
   9026 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
   9027     else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
   9028       re->flags |= PCRE2_LASTCASELESS;
   9029 #endif
   9030     }
   9031   }
   9032 
   9033 /* Check for a pattern that can match an empty string, so that this information
   9034 can be provided to applications. */
   9035 
   9036 do
   9037   {
   9038   int count = 0;
   9039   int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count);
   9040   if (rc < 0)
   9041     {
   9042     errorcode = ERR86;
   9043     goto HAD_ERROR;
   9044     }
   9045   if (rc > 0)
   9046     {
   9047     re->flags |= PCRE2_MATCH_EMPTY;
   9048     break;
   9049     }
   9050   codestart += GET(codestart, 1);
   9051   }
   9052 while (*codestart == OP_ALT);
   9053 
   9054 /* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
   9055 to set up information such as a bitmap of starting code units and a minimum
   9056 matching length. */
   9057 
   9058 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
   9059     PRIV(study)(re) != 0)
   9060   {
   9061   errorcode = ERR31;
   9062   goto HAD_ERROR;
   9063   }
   9064 
   9065 /* Control ends up here in all cases. If memory was obtained for a
   9066 zero-terminated copy of the pattern, remember to free it before returning. Also
   9067 free the list of named groups if a larger one had to be obtained, and likewise
   9068 the group information vector. */
   9069 
   9070 EXIT:
   9071 if (copied_pattern != stack_copied_pattern)
   9072   ccontext->memctl.free(copied_pattern, ccontext->memctl.memory_data);
   9073 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
   9074   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
   9075 if (cb.groupinfo != c32workspace)
   9076   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
   9077 
   9078 return re;    /* Will be NULL after an error */
   9079 }
   9080 
   9081 /* End of pcre2_compile.c */
   9082