Home | History | Annotate | Download | only in src
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9      Original API code Copyright (c) 1997-2012 University of Cambridge
     10           New API code Copyright (c) 2016-2018 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 
     42 #ifdef HAVE_CONFIG_H
     43 #include "config.h"
     44 #endif
     45 
     46 #define NLBLOCK cb             /* Block containing newline information */
     47 #define PSSTART start_pattern  /* Field containing processed string start */
     48 #define PSEND   end_pattern    /* Field containing processed string end */
     49 
     50 #include "pcre2_internal.h"
     51 
     52 /* In rare error cases debugging might require calling pcre2_printint(). */
     53 
     54 #if 0
     55 #ifdef EBCDIC
     56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
     57 #else
     58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
     59 #endif
     60 #include "pcre2_printint.c"
     61 #define DEBUG_CALL_PRINTINT
     62 #endif
     63 
     64 /* Other debugging code can be enabled by these defines. */
     65 
     66 /* #define DEBUG_SHOW_CAPTURES */
     67 /* #define DEBUG_SHOW_PARSED */
     68 
     69 /* There are a few things that vary with different code unit sizes. Handle them
     70 by defining macros in order to minimize #if usage. */
     71 
     72 #if PCRE2_CODE_UNIT_WIDTH == 8
     73 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
     74 #define XDIGIT(c)                xdigitab[c]
     75 
     76 #else  /* Either 16-bit or 32-bit */
     77 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
     78 
     79 #if PCRE2_CODE_UNIT_WIDTH == 16
     80 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
     81 
     82 #else  /* 32-bit */
     83 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
     84 #endif
     85 #endif
     86 
     87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
     88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
     89 them will be able to (i.e. assume a 64-bit world). */
     90 
     91 #if PCRE2_SIZE_MAX <= UINT32_MAX
     92 #define PUTOFFSET(s,p) *p++ = s
     93 #define GETOFFSET(s,p) s = *p++
     94 #define GETPLUSOFFSET(s,p) s = *(++p)
     95 #define READPLUSOFFSET(s,p) s = p[1]
     96 #define SKIPOFFSET(p) p++
     97 #define SIZEOFFSET 1
     98 #else
     99 #define PUTOFFSET(s,p) \
    100   { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
    101 #define GETOFFSET(s,p) \
    102   { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
    103 #define GETPLUSOFFSET(s,p) \
    104   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
    105 #define READPLUSOFFSET(s,p) \
    106   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
    107 #define SKIPOFFSET(p) p += 2
    108 #define SIZEOFFSET 2
    109 #endif
    110 
    111 /* Macros for manipulating elements of the parsed pattern vector. */
    112 
    113 #define META_CODE(x)   (x & 0xffff0000u)
    114 #define META_DATA(x)   (x & 0x0000ffffu)
    115 #define META_DIFF(x,y) ((x-y)>>16)
    116 
/* Forward declarations (prototypes) of file-local functions, given here to
allow mutual recursion: each can be called before its definition has been
seen. Parameter names are omitted in these prototypes. */

/* Declared only when SUPPORT_UNICODE is defined. */

#ifdef SUPPORT_UNICODE
static unsigned int
  add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
    compile_block *, const uint32_t *, unsigned int);
#endif

static int
  compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
    uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
    compile_block *, PCRE2_SIZE *);

static int
  get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
    compile_block *);

static BOOL
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
    compile_block *);
    137 
    138 
    139 
    140 /*************************************************
    141 *      Code parameters and static tables         *
    142 *************************************************/
    143 
    144 #define MAX_GROUP_NUMBER   65535u
    145 #define MAX_REPEAT_COUNT   65535u
    146 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
    147 
    148 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
    149 different ways in the different pattern scans. The parsing and group-
    150 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
    151 aligned for this. Having defined the size in code units, we set up
    152 C16_WORK_SIZE as the number of elements in the 16-bit vector.
    153 
    154 During the first compiling phase, when determining how much memory is required,
    155 the regex is partly compiled into this space, but the compiled parts are
    156 discarded as soon as they can be, so that hopefully there will never be an
    157 overrun. The code does, however, check for an overrun, which can occur for
    158 pathological patterns. The size of the workspace depends on LINK_SIZE because
    159 the length of compiled items varies with this.
    160 
    161 In the real compile phase, this workspace is not currently used. */
    162 
    163 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
    164 
    165 #define C16_WORK_SIZE \
    166   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
    167 
    168 /* A uint32_t vector is used for caching information about the size of
    169 capturing groups, to improve performance. A default is created on the stack of
    170 this size. */
    171 
    172 #define GROUPINFO_DEFAULT_SIZE 256
    173 
    174 /* The overrun tests check for a slightly smaller size so that they detect the
    175 overrun before it actually does run off the end of the data block. */
    176 
    177 #define WORK_SIZE_SAFETY_MARGIN (100)
    178 
    179 /* This value determines the size of the initial vector that is used for
    180 remembering named groups during the pre-compile. It is allocated on the stack,
    181 but if it is too small, it is expanded, in a similar way to the workspace. The
    182 value is the number of slots in the list. */
    183 
    184 #define NAMED_GROUP_LIST_SIZE  20
    185 
    186 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
    187 of uint32_t. For short patterns this lives on the stack, with this size. Heap
    188 memory is used for longer patterns. */
    189 
    190 #define PARSED_PATTERN_DEFAULT_SIZE 1024
    191 
    192 /* Maximum length value to check against when making sure that the variable
    193 that holds the compiled pattern length does not overflow. We make it a bit less
    194 than INT_MAX to allow for adding in group terminating code units, so that we
    195 don't have to check them every time. */
    196 
    197 #define OFLOW_MAX (INT_MAX - 20)
    198 
    199 /* Code values for parsed patterns, which are stored in a vector of 32-bit
    200 unsigned ints. Values less than META_END are literal data values. The coding
    201 for identifying the item is in the top 16-bits, leaving 16 bits for the
    202 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
    203 macros are used to manipulate parsed pattern elements.
    204 
    205 NOTE: When these definitions are changed, the table of extra lengths for each
    206 code (meta_extra_lengths, just below) must be updated to remain in step. */
    207 
    208 #define META_END              0x80000000u  /* End of pattern */
    209 
    210 #define META_ALT              0x80010000u  /* alternation */
    211 #define META_ATOMIC           0x80020000u  /* atomic group */
    212 #define META_BACKREF          0x80030000u  /* Back ref */
    213 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
    214 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
    215 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
    216 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
    217 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
    218 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
    219 #define META_CLASS            0x800a0000u  /* start non-empty class */
    220 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
    221 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
    222 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
    223 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
    224 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
    225 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
    226 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
    227 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
    228 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
    229 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
    230 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
    231 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
    232 #define META_DOT              0x80170000u  /* . metacharacter */
    233 #define META_ESCAPE           0x80180000u  /* \d and friends */
    234 #define META_KET              0x80190000u  /* closing parenthesis */
    235 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
    236 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
    237 #define META_POSIX            0x801c0000u  /* POSIX class item */
    238 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
    239 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
    240 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
    241 #define META_RECURSE          0x80200000u  /* Recursion */
    242 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
    243 
    244 /* These must be kept together to make it easy to check that an assertion
    245 is present where expected in a conditional group. */
    246 
    247 #define META_LOOKAHEAD        0x80220000u  /* (?= */
    248 #define META_LOOKAHEADNOT     0x80230000u  /* (?! */
    249 #define META_LOOKBEHIND       0x80240000u  /* (?<= */
    250 #define META_LOOKBEHINDNOT    0x80250000u  /* (?<! */
    251 
    252 /* These must be kept in this order, with consecutive values, and the _ARG
    253 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
    254 versions. */
    255 
    256 #define META_MARK             0x80260000u  /* (*MARK) */
    257 #define META_ACCEPT           0x80270000u  /* (*ACCEPT) */
    258 #define META_FAIL             0x80280000u  /* (*FAIL) */
    259 #define META_COMMIT           0x80290000u  /* These               */
    260 #define META_COMMIT_ARG       0x802a0000u  /*   pairs             */
    261 #define META_PRUNE            0x802b0000u  /*     must            */
    262 #define META_PRUNE_ARG        0x802c0000u  /*       be            */
    263 #define META_SKIP             0x802d0000u  /*         kept        */
    264 #define META_SKIP_ARG         0x802e0000u  /*           in        */
    265 #define META_THEN             0x802f0000u  /*             this    */
    266 #define META_THEN_ARG         0x80300000u  /*               order */
    267 
    268 /* These must be kept in groups of adjacent 3 values, and all together. */
    269 
    270 #define META_ASTERISK         0x80310000u  /* *  */
    271 #define META_ASTERISK_PLUS    0x80320000u  /* *+ */
    272 #define META_ASTERISK_QUERY   0x80330000u  /* *? */
    273 #define META_PLUS             0x80340000u  /* +  */
    274 #define META_PLUS_PLUS        0x80350000u  /* ++ */
    275 #define META_PLUS_QUERY       0x80360000u  /* +? */
    276 #define META_QUERY            0x80370000u  /* ?  */
    277 #define META_QUERY_PLUS       0x80380000u  /* ?+ */
    278 #define META_QUERY_QUERY      0x80390000u  /* ?? */
    279 #define META_MINMAX           0x803a0000u  /* {n,m}  repeat */
    280 #define META_MINMAX_PLUS      0x803b0000u  /* {n,m}+ repeat */
    281 #define META_MINMAX_QUERY     0x803c0000u  /* {n,m}? repeat */
    282 
    283 #define META_FIRST_QUANTIFIER META_ASTERISK
    284 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
    285 
    286 /* Table of extra lengths for each of the meta codes. Must be kept in step with
    287 the definitions above. For some items these values are a basic length to which
    288 a variable amount has to be added. */
    289 
    290 static unsigned char meta_extra_lengths[] = {
    291   0,             /* META_END */
    292   0,             /* META_ALT */
    293   0,             /* META_ATOMIC */
    294   0,             /* META_BACKREF - more if group is >= 10 */
    295   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
    296   1,             /* META_BIGVALUE */
    297   3,             /* META_CALLOUT_NUMBER */
    298   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
    299   0,             /* META_CAPTURE */
    300   0,             /* META_CIRCUMFLEX */
    301   0,             /* META_CLASS */
    302   0,             /* META_CLASS_EMPTY */
    303   0,             /* META_CLASS_EMPTY_NOT */
    304   0,             /* META_CLASS_END */
    305   0,             /* META_CLASS_NOT */
    306   0,             /* META_COND_ASSERT */
    307   SIZEOFFSET,    /* META_COND_DEFINE */
    308   1+SIZEOFFSET,  /* META_COND_NAME */
    309   1+SIZEOFFSET,  /* META_COND_NUMBER */
    310   1+SIZEOFFSET,  /* META_COND_RNAME */
    311   1+SIZEOFFSET,  /* META_COND_RNUMBER */
    312   3,             /* META_COND_VERSION */
    313   0,             /* META_DOLLAR */
    314   0,             /* META_DOT */
    315   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
    316   0,             /* META_KET */
    317   0,             /* META_NOCAPTURE */
    318   1,             /* META_OPTIONS */
    319   1,             /* META_POSIX */
    320   1,             /* META_POSIX_NEG */
    321   0,             /* META_RANGE_ESCAPED */
    322   0,             /* META_RANGE_LITERAL */
    323   SIZEOFFSET,    /* META_RECURSE */
    324   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
    325   0,             /* META_LOOKAHEAD */
    326   0,             /* META_LOOKAHEADNOT */
    327   SIZEOFFSET,    /* META_LOOKBEHIND */
    328   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
    329   1,             /* META_MARK - plus the string length */
    330   0,             /* META_ACCEPT */
    331   0,             /* META_FAIL */
    332   0,             /* META_COMMIT */
    333   1,             /* META_COMMIT_ARG - plus the string length */
    334   0,             /* META_PRUNE */
    335   1,             /* META_PRUNE_ARG - plus the string length */
    336   0,             /* META_SKIP */
    337   1,             /* META_SKIP_ARG - plus the string length */
    338   0,             /* META_THEN */
    339   1,             /* META_THEN_ARG - plus the string length */
    340   0,             /* META_ASTERISK */
    341   0,             /* META_ASTERISK_PLUS */
    342   0,             /* META_ASTERISK_QUERY */
    343   0,             /* META_PLUS */
    344   0,             /* META_PLUS_PLUS */
    345   0,             /* META_PLUS_QUERY */
    346   0,             /* META_QUERY */
    347   0,             /* META_QUERY_PLUS */
    348   0,             /* META_QUERY_QUERY */
    349   2,             /* META_MINMAX */
    350   2,             /* META_MINMAX_PLUS */
    351   2              /* META_MINMAX_QUERY */
    352 };
    353 
    354 /* Types for skipping parts of a parsed pattern. */
    355 
    356 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
    357 
    358 /* Macro for setting individual bits in class bitmaps. It took some
    359 experimenting to figure out how to stop gcc 5.3.0 from warning with
    360 -Wconversion. This version gets a warning:
    361 
    362   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))
    363 
    364 Let's hope the apparently less efficient version isn't actually so bad if the
    365 compiler is clever with identical subexpressions. */
    366 
    367 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1 << ((b)&7)))
    368 
    369 /* Private flags added to firstcu and reqcu. */
    370 
    371 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
    372 #define REQ_VARY        (1 << 1)        /* reqcu followed non-literal item */
    373 /* Negative values for the firstcu and reqcu flags */
    374 #define REQ_UNSET       (-2)            /* Not yet found anything */
    375 #define REQ_NONE        (-1)            /* Found not fixed char */
    376 
    377 /* These flags are used in the groupinfo vector. */
    378 
    379 #define GI_SET_FIXED_LENGTH    0x80000000u
    380 #define GI_NOT_FIXED_LENGTH    0x40000000u
    381 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
    382 
    383 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
    384 and is fast (a good compiler can turn it into a subtraction and unsigned
    385 comparison). */
    386 
    387 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
    388 
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
    396 
    397 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
    398 UTF-8 mode. */
    399 
    400 #ifndef EBCDIC
/* Indexed by character code, 0 to 255. Each entry is the numeric value of the
character as a hexadecimal digit (0x00-0x0f for 0-9, A-F and a-f), or 0xff if
the character is not a hexadecimal digit. */

static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
    435 
    436 #else
    437 
    438 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
    439 
/* Indexed by EBCDIC character code, 0 to 255. Each entry is the numeric value
of the character as a hexadecimal digit, or 0xff if it is not one. The only
entries that are not 0xff are 0x81-0x86 (a-f), 0xc1-0xc6 (A-F) and 0xf0-0xf9
(0-9). The rightmost column of the row comments gives the hexadecimal code of
the first entry of every second row. */

static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
    474 #endif  /* EBCDIC */
    475 
    476 
    477 /* Table for handling alphanumeric escaped characters. Positive returns are
    478 simple data values; negative values are for special things like \d and so on.
    479 Zero means further processing is needed (for things like \x), or the escape is
    480 invalid. */
    481 
    482 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
    483 in UTF-8 mode. It runs from '0' to 'z'. */
    484 
    485 #ifndef EBCDIC
    486 #define ESCAPES_FIRST       CHAR_0
    487 #define ESCAPES_LAST        CHAR_z
    488 #define UPPER_CASE(c)       (c-32)
    489 
/* One entry per character from '0' to 'z' in ASCII order (75 entries), two
entries per source line. The comment at the end of each line names the
characters that the entries on that line belong to. */

static const short int escapes[] = {
     0,                       0,                           /* 0  1 */
     0,                       0,                           /* 2  3 */
     0,                       0,                           /* 4  5 */
     0,                       0,                           /* 6  7 */
     0,                       0,                           /* 8  9 */
     CHAR_COLON,              CHAR_SEMICOLON,              /* :  ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,            /* <  = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,          /* >  ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                      /* @  A */
     -ESC_B,                  -ESC_C,                      /* B  C */
     -ESC_D,                  -ESC_E,                      /* D  E */
     0,                       -ESC_G,                      /* F  G */
     -ESC_H,                  0,                           /* H  I */
     0,                       -ESC_K,                      /* J  K */
     0,                       0,                           /* L  M */
     -ESC_N,                  0,                           /* N  O */
     -ESC_P,                  -ESC_Q,                      /* P  Q */
     -ESC_R,                  -ESC_S,                      /* R  S */
     0,                       0,                           /* T  U */
     -ESC_V,                  -ESC_W,                      /* V  W */
     -ESC_X,                  0,                           /* X  Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,    /* Z  [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,   /* backslash  ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,             /* ^  _ */
     CHAR_GRAVE_ACCENT,       CHAR_BEL,                    /* `  a */
     -ESC_b,                  0,                           /* b  c */
     -ESC_d,                  CHAR_ESC,                    /* d  e */
     CHAR_FF,                 0,                           /* f  g */
     -ESC_h,                  0,                           /* h  i */
     0,                       -ESC_k,                      /* j  k */
     0,                       0,                           /* l  m */
     CHAR_LF,                 0,                           /* n  o */
     -ESC_p,                  0,                           /* p  q */
     CHAR_CR,                 -ESC_s,                      /* r  s */
     CHAR_HT,                 0,                           /* t  u */
     -ESC_v,                  -ESC_w,                      /* v  w */
     0,                       0,                           /* x  y */
     -ESC_z                                                /* z */
};
    530 
    531 #else
    532 
    533 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
    534 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
    535 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
    536 because it is defined as 'a', which of course picks up the ASCII value. */
    537 
    538 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
    539 #define ESCAPES_FIRST       CHAR_a
    540 #define ESCAPES_LAST        CHAR_9
    541 #define UPPER_CASE(c)       (c+64)
    542 #else                              /* Testing in an ASCII environment */
    543 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
    544 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
    545 #define UPPER_CASE(c)  (c-32)
    546 #endif
    547 
/* One entry per EBCDIC code from 0x81 ('a') to 0xf9 ('9'). The comment at the
start of each row is the hexadecimal code of the row's first column. The table
begins at 0x81 rather than 0x80, so the first row has an empty first column and
only seven entries, and the last row stops at 0xf9. */

static const short int escapes[] = {
/*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
/*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
/*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
/*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
/*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
/*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
/*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
/*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
/*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
/*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
/*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
/*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
/*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
/*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
/*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
/*  F8 */      0,        0
};
    566 
    567 /* We also need a table of characters that may follow \c in an EBCDIC
    568 environment for characters 0-31. */
    569 
    570 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
    571 
    572 #endif   /* EBCDIC */
    573 
    574 
    575 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
    576 searched linearly. Put all the names into a single string, in order to reduce
    577 the number of relocations when a shared library is dynamically linked. The
    578 string is built from string macros so that it works in UTF-8 mode on EBCDIC
    579 platforms. */
    580 
/* Description of one verb name; the verbs[] table is an array of these. */

typedef struct verbitem {
  unsigned int len;          /* Length of verb name */
  uint32_t meta;             /* Base META_ code */
  int has_arg;               /* Argument requirement: > 0 => an argument is
                                mandatory; < 0 => optional, converted to a
                                preceding MARK; 0 => optional, the META code
                                is bumped if one is found */
} verbitem;
    586 
/* All the verb names joined into one string by adjacent string literal
concatenation. The names appear in the same order as the entries of verbs[],
whose len fields give the length of each name. The STRING_xxx0 macros are
defined elsewhere; presumably each expands to the name followed by a zero
byte, with the final STRING_THEN relying on the literal's own terminator. */

static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_F0
  STRING_FAIL0
  STRING_COMMIT0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
    597 
/* One entry per name in verbnames, in the same order. The fields are the name
length, the base META code, and the argument requirement (see the has_arg
comments on the entries). Two names map to META_MARK (the empty name and MARK)
and two map to META_FAIL (F and FAIL). */

static const verbitem verbs[] = {
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
  { 4, META_MARK,   +1 },  /* MARK */
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
  { 1, META_FAIL,   -1 },  /* F */
  { 4, META_FAIL,   -1 },  /* FAIL */
  { 6, META_COMMIT,  0 },  /* COMMIT */
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
  { 4, META_SKIP,    0 },  /* SKIP */
  { 4, META_THEN,    0 }   /* THEN */
};
    609 
    610 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
    611 
    612 /* Verb opcodes, indexed by their META code offset from META_MARK. */
    613 
    614 static const uint32_t verbops[] = {
    615   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
    616   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
    617 
    618 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
    619 
    620 static uint32_t chartypeoffset[] = {
    621   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
    622   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
    623 
    624 /* Tables of names of POSIX character classes and their lengths. The names are
    625 now all in a single string, to reduce the number of relocations when a shared
    626 library is dynamically loaded. The list of lengths is terminated by a zero
    627 length entry. The first three must be alpha, lower, upper, as this is assumed
    628 for handling case independence. The indices for graph, print, and punct are
    629 needed, so identify them. */
    630 
/* The POSIX class names, concatenated. The position of a name in this list is
the class index used by posix_name_lengths, posix_class_maps, and the PC_xxx
macros. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;
    636 
    637 static const uint8_t posix_name_lengths[] = {
    638   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
    639 
/* Positions of graph, print, and punct within posix_names (counting from
zero). */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
    643 
    644 /* Table of class bit maps for each POSIX class. Each class is formed from a
    645 base map, with an optional addition or removal of another map. Then, for some
    646 classes, there is some additional tweaking: for [:blank:] the vertical space
    647 characters are removed, and for [:alpha:] and [:alnum:] the underscore
    648 character is removed. The triples in the table consist of the base map offset,
    649 second map offset or -1 if no second map, and a non-negative value for map
    650 addition or a negative value for map subtraction (if there are two maps). The
    651 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
    652 remove vertical space characters, 2 => remove underscore. */
    653 
/* Three values per POSIX class, in posix_names order: base map offset, second
map offset (or -1), and the add/subtract-plus-tweak code. For example, alpha is
the word map with the digit map subtracted (negative third value) and the
underscore removed (absolute value 2). */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
    670 
    671 #ifdef SUPPORT_UNICODE
    672 
    673 /* The POSIX class Unicode property substitutes that are used in UCP mode must
    674 be in the order of the POSIX class names, defined above. */
    675 
    676 static int posix_substitutes[] = {
    677   PT_GC, ucp_L,     /* alpha */
    678   PT_PC, ucp_Ll,    /* lower */
    679   PT_PC, ucp_Lu,    /* upper */
    680   PT_ALNUM, 0,      /* alnum */
    681   -1, 0,            /* ascii, treat as non-UCP */
    682   -1, 1,            /* blank, treat as \h */
    683   PT_PC, ucp_Cc,    /* cntrl */
    684   PT_PC, ucp_Nd,    /* digit */
    685   PT_PXGRAPH, 0,    /* graph */
    686   PT_PXPRINT, 0,    /* print */
    687   PT_PXPUNCT, 0,    /* punct */
    688   PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
    689   PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
    690   -1, 0             /* xdigit, treat as non-UCP */
    691 };
    692 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
    693 #endif  /* SUPPORT_UNICODE */
    694 
    695 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
    696 are allowed. */
    697 
/* Option bits that remain permitted when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)

/* The full option mask: the literal subset plus the remaining option bits. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)

/* The corresponding pair of masks for the PCRE2_EXTRA_xxx bits: first the
literal subset, then the full set. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL)
    718 
    719 /* Compile time error code numbers. They are given names so that they can more
    720 easily be tracked. When a new number is added, the tables called eint1 and
    721 eint2 in pcre2posix.c may need to be updated, and a new error text must be
    722 added to compile_error_texts in pcre2_error.c. */
    723 
/* ERR0 is COMPILE_ERROR_BASE; each following name is one greater than its
predecessor, so ERRn has the value COMPILE_ERROR_BASE + n. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94 };
    735 
    736 /* This is a table of start-of-pattern options such as (*UTF) and settings such
    737 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
    738 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
    739 generic and always supported. */
    740 
/* Kinds of start-of-pattern item. The kind is stored in the type field of a
pso entry and tells how that entry's value field is to be interpreted. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD };  /* Read integer value for depth limit */
    748 
/* One start-of-pattern item, as stored in pso_list. */

typedef struct pso {
  const uint8_t *name;   /* Text of the item */
  uint16_t length;       /* Length of the text */
  uint16_t type;         /* One of the PSO_xxx kinds */
  uint32_t value;        /* Meaning depends on type; 0 for the limit kinds */
} pso;
    755 
    756 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
    757 
/* The recognized start-of-pattern items. Each length counts the name together
with its closing parenthesis or equals sign (for example, 4 for "UTF)"). The
first entry has no explicit length because the STRING_UTFn_RIGHTPAR macro
supplies it. Entries whose name ends with "=" are followed in the pattern by a
number, and carry a zero value here. */

static pso pso_list[] = {
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },  /* Same kind as LIMIT_DEPTH */
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
    781 
    782 /* This table is used when converting repeating opcodes into possessified
    783 versions as a result of an explicit possessive quantifier such as ++. A zero
    784 value means there is no possessified version - in those cases the item in
    785 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
    786 because all relevant opcodes are less than that. */
    787 
/* Each entry corresponds to the opcode named in the comment beside it and
holds that opcode's possessive equivalent, or zero if it has none. Only the
greedy repeat in each group has a non-zero entry; the minimizing, exact, and
already-possessive forms are all zero. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
    839 
    840 
    841 #ifdef DEBUG_SHOW_PARSED
    842 /*************************************************
    843 *     Show the parsed pattern for debugging      *
    844 *************************************************/
    845 
    846 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
    847 can be enabled. */
    848 
    849 static void show_parsed(compile_block *cb)
    850 {
    851 uint32_t *pptr = cb->parsed_pattern;
    852 
    853 for (;;)
    854   {
    855   int max, min;
    856   PCRE2_SIZE offset;
    857   uint32_t i;
    858   uint32_t length;
    859   uint32_t meta_arg = META_DATA(*pptr);
    860 
    861   fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
    862 
    863   if (*pptr < META_END)
    864     {
    865     if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
    866     pptr++;
    867     }
    868 
    869   else switch (META_CODE(*pptr++))
    870     {
    871     default:
    872     fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    873     return;
    874 
    875     case META_END:
    876     fprintf(stderr, "META_END\n");
    877     return;
    878 
    879     case META_CAPTURE:
    880     fprintf(stderr, "META_CAPTURE %d", meta_arg);
    881     break;
    882 
    883     case META_RECURSE:
    884     GETOFFSET(offset, pptr);
    885     fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
    886     break;
    887 
    888     case META_BACKREF:
    889     if (meta_arg < 10)
    890       offset = cb->small_ref_offset[meta_arg];
    891     else
    892       GETOFFSET(offset, pptr);
    893     fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
    894     break;
    895 
    896     case META_ESCAPE:
    897     if (meta_arg == ESC_P || meta_arg == ESC_p)
    898       {
    899       uint32_t ptype = *pptr >> 16;
    900       uint32_t pvalue = *pptr++ & 0xffff;
    901       fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
    902         ptype, pvalue);
    903       }
    904     else
    905       {
    906       uint32_t cc;
    907       /* There's just one escape we might have here that isn't negated in the
    908       escapes table. */
    909       if (meta_arg == ESC_g) cc = CHAR_g;
    910       else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
    911         {
    912         if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
    913         }
    914       if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
    915       fprintf(stderr, "META \\%c", cc);
    916       }
    917     break;
    918 
    919     case META_MINMAX:
    920     min = *pptr++;
    921     max = *pptr++;
    922     if (max != REPEAT_UNLIMITED)
    923       fprintf(stderr, "META {%d,%d}", min, max);
    924     else
    925       fprintf(stderr, "META {%d,}", min);
    926     break;
    927 
    928     case META_MINMAX_QUERY:
    929     min = *pptr++;
    930     max = *pptr++;
    931     if (max != REPEAT_UNLIMITED)
    932       fprintf(stderr, "META {%d,%d}?", min, max);
    933     else
    934       fprintf(stderr, "META {%d,}?", min);
    935     break;
    936 
    937     case META_MINMAX_PLUS:
    938     min = *pptr++;
    939     max = *pptr++;
    940     if (max != REPEAT_UNLIMITED)
    941       fprintf(stderr, "META {%d,%d}+", min, max);
    942     else
    943       fprintf(stderr, "META {%d,}+", min);
    944     break;
    945 
    946     case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    947     case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    948     case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    949     case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    950     case META_DOT: fprintf(stderr, "META_DOT"); break;
    951     case META_ASTERISK: fprintf(stderr, "META *"); break;
    952     case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    953     case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    954     case META_PLUS: fprintf(stderr, "META +"); break;
    955     case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    956     case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    957     case META_QUERY: fprintf(stderr, "META ?"); break;
    958     case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    959     case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
    960 
    961     case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    962     case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    963     case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    964     case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    965     case META_KET: fprintf(stderr, "META )"); break;
    966     case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
    967 
    968     case META_CLASS: fprintf(stderr, "META ["); break;
    969     case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    970     case META_CLASS_END: fprintf(stderr, "META ]"); break;
    971     case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    972     case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
    973 
    974     case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    975     case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
    976 
    977     case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
    978     case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
    979 
    980     case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    981     case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    982     case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    983     case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    984     case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    985     case META_THEN: fprintf(stderr, "META (*THEN)"); break;
    986 
    987     case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;
    988 
    989     case META_LOOKBEHIND:
    990     fprintf(stderr, "META (?<= %d offset=", meta_arg);
    991     GETOFFSET(offset, pptr);
    992     fprintf(stderr, "%zd", offset);
    993     break;
    994 
    995     case META_LOOKBEHINDNOT:
    996     fprintf(stderr, "META (?<! %d offset=", meta_arg);
    997     GETOFFSET(offset, pptr);
    998     fprintf(stderr, "%zd", offset);
    999     break;
   1000 
   1001     case META_CALLOUT_NUMBER:
   1002     fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
   1003        pptr[1]);
   1004     pptr += 3;
   1005     break;
   1006 
   1007     case META_CALLOUT_STRING:
   1008       {
   1009       uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
   1010       uint32_t patlength = *pptr++;    /* Length of next pattern item */
   1011       fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
   1012       GETOFFSET(offset, pptr);
   1013       fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
   1014       }
   1015     break;
   1016 
   1017     case META_RECURSE_BYNAME:
   1018     fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
   1019     GETOFFSET(offset, pptr);
   1020     fprintf(stderr, "%zd", offset);
   1021     break;
   1022 
   1023     case META_BACKREF_BYNAME:
   1024     fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
   1025     GETOFFSET(offset, pptr);
   1026     fprintf(stderr, "%zd", offset);
   1027     break;
   1028 
   1029     case META_COND_NUMBER:
   1030     fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
   1031     GETOFFSET(offset, pptr);
   1032     fprintf(stderr, "%zd", offset);
   1033     pptr++;
   1034     break;
   1035 
   1036     case META_COND_DEFINE:
   1037     fprintf(stderr, "META (?(DEFINE) offset=");
   1038     GETOFFSET(offset, pptr);
   1039     fprintf(stderr, "%zd", offset);
   1040     break;
   1041 
   1042     case META_COND_VERSION:
   1043     fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
   1044     fprintf(stderr, "%d.", *pptr++);
   1045     fprintf(stderr, "%d)", *pptr++);
   1046     break;
   1047 
   1048     case META_COND_NAME:
   1049     fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
   1050     GETOFFSET(offset, pptr);
   1051     fprintf(stderr, "%zd", offset);
   1052     break;
   1053 
   1054     case META_COND_RNAME:
   1055     fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
   1056     GETOFFSET(offset, pptr);
   1057     fprintf(stderr, "%zd", offset);
   1058     break;
   1059 
   1060     /* This is kept as a name, because it might be. */
   1061 
   1062     case META_COND_RNUMBER:
   1063     fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
   1064     GETOFFSET(offset, pptr);
   1065     fprintf(stderr, "%zd", offset);
   1066     break;
   1067 
   1068     case META_MARK:
   1069     fprintf(stderr, "META (*MARK:");
   1070     goto SHOWARG;
   1071 
   1072     case META_COMMIT_ARG:
   1073     fprintf(stderr, "META (*COMMIT:");
   1074     goto SHOWARG;
   1075 
   1076     case META_PRUNE_ARG:
   1077     fprintf(stderr, "META (*PRUNE:");
   1078     goto SHOWARG;
   1079 
   1080     case META_SKIP_ARG:
   1081     fprintf(stderr, "META (*SKIP:");
   1082     goto SHOWARG;
   1083 
   1084     case META_THEN_ARG:
   1085     fprintf(stderr, "META (*THEN:");
   1086     SHOWARG:
   1087     length = *pptr++;
   1088     for (i = 0; i < length; i++)
   1089       {
   1090       uint32_t cc = *pptr++;
   1091       if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
   1092         else fprintf(stderr, "\\x{%x}", cc);
   1093       }
   1094     fprintf(stderr, ") length=%u", length);
   1095     break;
   1096     }
   1097   fprintf(stderr, "\n");
   1098   }
   1099 return;
   1100 }
   1101 #endif  /* DEBUG_SHOW_PARSED */
   1102 
   1103 
   1104 
   1105 /*************************************************
   1106 *               Copy compiled code               *
   1107 *************************************************/
   1108 
   1109 /* Compiled JIT code cannot be copied, so the new compiled block has no
   1110 associated JIT data. */
   1111 
   1112 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
   1113 pcre2_code_copy(const pcre2_code *code)
   1114 {
   1115 PCRE2_SIZE* ref_count;
   1116 pcre2_code *newcode;
   1117 
   1118 if (code == NULL) return NULL;
   1119 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
   1120 if (newcode == NULL) return NULL;
   1121 memcpy(newcode, code, code->blocksize);
   1122 newcode->executable_jit = NULL;
   1123 
   1124 /* If the code is one that has been deserialized, increment the reference count
   1125 in the decoded tables. */
   1126 
   1127 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
   1128   {
   1129   ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
   1130   (*ref_count)++;
   1131   }
   1132 
   1133 return newcode;
   1134 }
   1135 
   1136 
   1137 
   1138 /*************************************************
   1139 *     Copy compiled code and character tables    *
   1140 *************************************************/
   1141 
   1142 /* Compiled JIT code cannot be copied, so the new compiled block has no
   1143 associated JIT data. This version of code_copy also makes a separate copy of
   1144 the character tables. */
   1145 
   1146 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
   1147 pcre2_code_copy_with_tables(const pcre2_code *code)
   1148 {
   1149 PCRE2_SIZE* ref_count;
   1150 pcre2_code *newcode;
   1151 uint8_t *newtables;
   1152 
   1153 if (code == NULL) return NULL;
   1154 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
   1155 if (newcode == NULL) return NULL;
   1156 memcpy(newcode, code, code->blocksize);
   1157 newcode->executable_jit = NULL;
   1158 
   1159 newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
   1160   code->memctl.memory_data);
   1161 if (newtables == NULL)
   1162   {
   1163   code->memctl.free((void *)newcode, code->memctl.memory_data);
   1164   return NULL;
   1165   }
   1166 memcpy(newtables, code->tables, tables_length);
   1167 ref_count = (PCRE2_SIZE *)(newtables + tables_length);
   1168 *ref_count = 1;
   1169 
   1170 newcode->tables = newtables;
   1171 newcode->flags |= PCRE2_DEREF_TABLES;
   1172 return newcode;
   1173 }
   1174 
   1175 
   1176 
   1177 /*************************************************
   1178 *               Free compiled code               *
   1179 *************************************************/
   1180 
   1181 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
   1182 pcre2_code_free(pcre2_code *code)
   1183 {
   1184 PCRE2_SIZE* ref_count;
   1185 
   1186 if (code != NULL)
   1187   {
   1188   if (code->executable_jit != NULL)
   1189     PRIV(jit_free)(code->executable_jit, &code->memctl);
   1190 
   1191   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
   1192     {
   1193     /* Decoded tables belong to the codes after deserialization, and they must
   1194     be freed when there are no more reference to them. The *ref_count should
   1195     always be > 0. */
   1196 
   1197     ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
   1198     if (*ref_count > 0)
   1199       {
   1200       (*ref_count)--;
   1201       if (*ref_count == 0)
   1202         code->memctl.free((void *)code->tables, code->memctl.memory_data);
   1203       }
   1204     }
   1205 
   1206   code->memctl.free(code, code->memctl.memory_data);
   1207   }
   1208 }
   1209 
   1210 
   1211 
   1212 /*************************************************
   1213 *         Read a number, possibly signed         *
   1214 *************************************************/
   1215 
   1216 /* This function is used to read numbers in the pattern. The initial pointer
   1217 must be the sign or first digit of the number. When relative values (introduced
   1218 by + or -) are allowed, they are relative group numbers, and the result must be
   1219 greater than zero.
   1220 
Arguments:
  ptrptr        points to the character pointer variable
  ptrend        points to the end of the input string
  allow_sign    if < 0, sign not allowed; if >= 0, sign is relative to this
  max_value     the largest number allowed
  max_error     the error to give for an over-large number
  intptr        where to put the result
  errorcodeptr  where to put an error code
   1228   errcodeptr  where to put an error code
   1229 
   1230 Returns:      TRUE  - a number was read
   1231               FALSE - errorcode == 0 => no number was found
   1232                       errorcode != 0 => an error occurred
   1233 */
   1234 
   1235 static BOOL
   1236 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
   1237   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
   1238 {
   1239 int sign = 0;
   1240 uint32_t n = 0;
   1241 PCRE2_SPTR ptr = *ptrptr;
   1242 BOOL yield = FALSE;
   1243 
   1244 *errorcodeptr = 0;
   1245 
   1246 if (allow_sign >= 0 && ptr < ptrend)
   1247   {
   1248   if (*ptr == CHAR_PLUS)
   1249     {
   1250     sign = +1;
   1251     max_value -= allow_sign;
   1252     ptr++;
   1253     }
   1254   else if (*ptr == CHAR_MINUS)
   1255     {
   1256     sign = -1;
   1257     ptr++;
   1258     }
   1259   }
   1260 
   1261 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
   1262 while (ptr < ptrend && IS_DIGIT(*ptr))
   1263   {
   1264   n = n * 10 + *ptr++ - CHAR_0;
   1265   if (n > max_value)
   1266     {
   1267     *errorcodeptr = max_error;
   1268     goto EXIT;
   1269     }
   1270   }
   1271 
   1272 if (allow_sign >= 0 && sign != 0)
   1273   {
   1274   if (n == 0)
   1275     {
   1276     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
   1277     goto EXIT;
   1278     }
   1279 
   1280   if (sign > 0) n += allow_sign;
   1281   else if ((int)n > allow_sign)
   1282     {
   1283     *errorcodeptr = ERR15;  /* Non-existent subpattern */
   1284     goto EXIT;
   1285     }
   1286   else n = allow_sign + 1 - n;
   1287   }
   1288 
   1289 yield = TRUE;
   1290 
   1291 EXIT:
   1292 *intptr = n;
   1293 *ptrptr = ptr;
   1294 return yield;
   1295 }
   1296 
   1297 
   1298 
   1299 /*************************************************
   1300 *         Read repeat counts                     *
   1301 *************************************************/
   1302 
   1303 /* Read an item of the form {n,m} and return the values if non-NULL pointers
   1304 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
   1305 larger value is used for "unlimited". We have to use signed arguments for
   1306 read_number() because it is capable of returning a signed value.
   1307 
Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max; this is set to
                 REPEAT_UNLIMITED if no max is given
  errorcodeptr   points to error code variable
   1314   errorcodeptr   points to error code variable
   1315 
   1316 Returns:         FALSE if not a repeat quantifier, errorcode set zero
   1317                  FALSE on error, with errorcode set non-zero
   1318                  TRUE on success, with pointer updated to point after '}'
   1319 */
   1320 
   1321 static BOOL
   1322 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
   1323   uint32_t *maxp, int *errorcodeptr)
   1324 {
   1325 PCRE2_SPTR p = *ptrptr;
   1326 BOOL yield = FALSE;
   1327 int32_t min = 0;
   1328 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
   1329 
   1330 /* NB read_number() initializes the error code to zero. The only error is for a
   1331 number that is too big. */
   1332 
   1333 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
   1334   goto EXIT;
   1335 
   1336 if (p >= ptrend) goto EXIT;
   1337 
   1338 if (*p == CHAR_RIGHT_CURLY_BRACKET)
   1339   {
   1340   p++;
   1341   max = min;
   1342   }
   1343 
   1344 else
   1345   {
   1346   if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
   1347   if (*p != CHAR_RIGHT_CURLY_BRACKET)
   1348     {
   1349     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
   1350         errorcodeptr) || p >= ptrend ||  *p != CHAR_RIGHT_CURLY_BRACKET)
   1351       goto EXIT;
   1352     if (max < min)
   1353       {
   1354       *errorcodeptr = ERR4;
   1355       goto EXIT;
   1356       }
   1357     }
   1358   p++;
   1359   }
   1360 
   1361 yield = TRUE;
   1362 if (minp != NULL) *minp = (uint32_t)min;
   1363 if (maxp != NULL) *maxp = (uint32_t)max;
   1364 
   1365 /* Update the pattern pointer on success, or after an error, but not when
   1366 the result is "not a repeat quantifier". */
   1367 
   1368 EXIT:
   1369 if (yield || *errorcodeptr != 0) *ptrptr = p;
   1370 return yield;
   1371 
   1372 
   1373 
   1374 }
   1375 
   1376 
   1377 
   1378 /*************************************************
   1379 *            Handle escapes                      *
   1380 *************************************************/
   1381 
   1382 /* This function is called when a \ has been encountered. It either returns a
   1383 positive value for a simple escape such as \d, or 0 for a data character, which
   1384 is placed in chptr. A backreference to group n is returned as negative n. On
   1385 entry, ptr is pointing at the character after \. On exit, it points after the
   1386 final code unit of the escape sequence.
   1387 
   1388 This function is also called from pcre2_substitute() to handle escape sequences
   1389 in replacement strings. In this case, the cb argument is NULL, and in the case
   1390 of escapes that have further processing, only sequences that define a data
   1391 character are recognised. The isclass argument is not relevant; the options
   1392 argument is the final value of the compiled pattern's options.
   1393 
   1394 Arguments:
   1395   ptrptr         points to the input position pointer
   1396   ptrend         points to the end of the input
   1397   chptr          points to a returned data character
   1398   errorcodeptr   points to the errorcode variable (containing zero)
   1399   options        the current options bits
   1400   isclass        TRUE if inside a character class
   1401   cb             compile data block
   1402 
   1403 Returns:         zero => a data character
   1404                  positive => a special escape sequence
   1405                  negative => a numerical back reference
   1406                  on error, errorcodeptr is set non-zero
   1407 */
   1408 
   1409 int
   1410 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
   1411   int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
   1412 {
   1413 BOOL utf = (options & PCRE2_UTF) != 0;
   1414 PCRE2_SPTR ptr = *ptrptr;
   1415 uint32_t c, cc;
   1416 int escape = 0;
   1417 int i;
   1418 
   1419 /* If backslash is at the end of the string, it's an error. */
   1420 
   1421 if (ptr >= ptrend)
   1422   {
   1423   *errorcodeptr = ERR1;
   1424   return 0;
   1425   }
   1426 
   1427 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
   1428 *errorcodeptr = 0;              /* Be optimistic */
   1429 
   1430 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
   1431 value test saves a memory lookup for code points outside the alphanumeric
   1432 range. Otherwise, do a table lookup. A non-zero result is something that can be
   1433 returned immediately. Otherwise further processing is required. */
   1434 
   1435 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
   1436 
   1437 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
   1438   {
   1439   if (i > 0) c = (uint32_t)i; else  /* Positive is a data character */
   1440     {
   1441     escape = -i;                    /* Else return a special escape */
   1442     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
   1443       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
   1444 
   1445     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
   1446     Unicode code points, as well as plain \N for "not newline". PCRE does not
   1447     support \N{name}. However, it does support quantification such as \N{2,3},
   1448     so if \N{ is not followed by U+dddd we check for a quantifier. */
   1449 
   1450     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
   1451       {
   1452       PCRE2_SPTR p = ptr + 1;
   1453 
   1454       /* \N{U+ can be handled by the \x{ code. However, this construction is
   1455       not valid in EBCDIC environments because it specifies a Unicode
   1456       character, not a codepoint in the local code. For example \N{U+0041}
   1457       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
   1458       casing semantics for the entire pattern, so allow it only in UTF (i.e.
   1459       Unicode) mode. */
   1460 
   1461       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
   1462         {
   1463 #ifdef EBCDIC
   1464         *errorcodeptr = ERR93;
   1465 #else
   1466         if (utf)
   1467           {
   1468           ptr = p + 1;
   1469           escape = 0;   /* Not a fancy escape after all */
   1470           goto COME_FROM_NU;
   1471           }
   1472         else *errorcodeptr = ERR93;
   1473 #endif
   1474         }
   1475 
   1476       /* Give an error if what follows is not a quantifier, but don't override
   1477       an error set by the quantifier reader (e.g. number overflow). */
   1478 
   1479       else
   1480         {
   1481         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
   1482              *errorcodeptr == 0)
   1483           *errorcodeptr = ERR37;
   1484         }
   1485       }
   1486     }
   1487   }
   1488 
   1489 /* Escapes that need further processing, including those that are unknown.
   1490 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
   1491 when BSUX is set). */
   1492 
   1493 else
   1494   {
   1495   PCRE2_SPTR oldptr;
   1496   BOOL overflow;
   1497   int s;
   1498 
   1499   /* Filter calls from pcre2_substitute(). */
   1500 
   1501   if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
   1502       (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
   1503     {
   1504     *errorcodeptr = ERR3;
   1505     return 0;
   1506     }
   1507 
   1508   switch (c)
   1509     {
   1510     /* A number of Perl escapes are not handled by PCRE. We give an explicit
   1511     error. */
   1512 
   1513     case CHAR_F:
   1514     case CHAR_l:
   1515     case CHAR_L:
   1516     *errorcodeptr = ERR37;
   1517     break;
   1518 
   1519     /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
   1520     specially, \u must be followed by four hex digits. Otherwise it is a
   1521     lowercase u letter. */
   1522 
   1523     case CHAR_u:
   1524     if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
   1525       {
   1526       uint32_t xc;
   1527       if (ptrend - ptr < 4) break;              /* Less than 4 chars */
   1528       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
   1529       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
   1530       cc = (cc << 4) | xc;
   1531       if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
   1532       cc = (cc << 4) | xc;
   1533       if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
   1534       c = (cc << 4) | xc;
   1535       ptr += 4;
   1536       if (utf)
   1537         {
   1538         if (c > 0x10ffffU) *errorcodeptr = ERR77;
   1539         else
   1540           if (c >= 0xd800 && c <= 0xdfff &&
   1541             (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
   1542               *errorcodeptr = ERR73;
   1543         }
   1544       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
   1545       }
   1546     break;
   1547 
   1548     /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
   1549     upper case letter. */
   1550 
   1551     case CHAR_U:
   1552     if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
   1553     break;
   1554 
   1555     /* In a character class, \g is just a literal "g". Outside a character
   1556     class, \g must be followed by one of a number of specific things:
   1557 
   1558     (1) A number, either plain or braced. If positive, it is an absolute
   1559     backreference. If negative, it is a relative backreference. This is a Perl
   1560     5.10 feature.
   1561 
   1562     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
   1563     is part of Perl's movement towards a unified syntax for back references. As
   1564     this is synonymous with \k{name}, we fudge it up by pretending it really
   1565     was \k{name}.
   1566 
   1567     (3) For Oniguruma compatibility we also support \g followed by a name or a
   1568     number either in angle brackets or in single quotes. However, these are
   1569     (possibly recursive) subroutine calls, _not_ backreferences. We return
   1570     the ESC_g code.
   1571 
   1572     Summary: Return a negative number for a numerical back reference, ESC_k for
   1573     a named back reference, and ESC_g for a named or numbered subroutine call.
   1574     */
   1575 
   1576     case CHAR_g:
   1577     if (isclass) break;
   1578 
   1579     if (ptr >= ptrend)
   1580       {
   1581       *errorcodeptr = ERR57;
   1582       break;
   1583       }
   1584 
   1585     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
   1586       {
   1587       escape = ESC_g;
   1588       break;
   1589       }
   1590 
   1591     /* If there is a brace delimiter, try to read a numerical reference. If
   1592     there isn't one, assume we have a name and treat it as \k. */
   1593 
   1594     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
   1595       {
   1596       PCRE2_SPTR p = ptr + 1;
   1597       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
   1598           errorcodeptr))
   1599         {
   1600         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
   1601         break;
   1602         }
   1603       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
   1604         {
   1605         *errorcodeptr = ERR57;
   1606         break;
   1607         }
   1608       ptr = p + 1;
   1609       }
   1610 
   1611     /* Read an undelimited number */
   1612 
   1613     else
   1614       {
   1615       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
   1616           errorcodeptr))
   1617         {
   1618         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
   1619         break;
   1620         }
   1621       }
   1622 
   1623     if (s <= 0)
   1624       {
   1625       *errorcodeptr = ERR15;
   1626       break;
   1627       }
   1628 
   1629     escape = -s;
   1630     break;
   1631 
   1632     /* The handling of escape sequences consisting of a string of digits
   1633     starting with one that is not zero is not straightforward. Perl has changed
   1634     over the years. Nowadays \g{} for backreferences and \o{} for octal are
   1635     recommended to avoid the ambiguities in the old syntax.
   1636 
   1637     Outside a character class, the digits are read as a decimal number. If the
   1638     number is less than 10, or if there are that many previous extracting left
   1639     brackets, it is a back reference. Otherwise, up to three octal digits are
   1640     read to form an escaped character code. Thus \123 is likely to be octal 123
   1641     (cf \0123, which is octal 012 followed by the literal 3).
   1642 
   1643     Inside a character class, \ followed by a digit is always either a literal
   1644     8 or 9 or an octal number. */
   1645 
   1646     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
   1647     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
   1648 
   1649     if (!isclass)
   1650       {
   1651       oldptr = ptr;
   1652       ptr--;   /* Back to the digit */
   1653       if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
   1654           errorcodeptr))
   1655         break;
   1656 
   1657       /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
   1658       are octal escapes if there are not that many previous captures. */
   1659 
   1660       if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
   1661         {
   1662         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
   1663           else escape = -s;     /* Indicates a back reference */
   1664         break;
   1665         }
   1666       ptr = oldptr;      /* Put the pointer back and fall through */
   1667       }
   1668 
   1669     /* Handle a digit following \ when the number is not a back reference, or
   1670     we are within a character class. If the first digit is 8 or 9, Perl used to
   1671     generate a binary zero and then treat the digit as a following literal. At
   1672     least by Perl 5.18 this changed so as not to insert the binary zero. */
   1673 
   1674     if (c >= CHAR_8) break;
   1675 
   1676     /* Fall through */
   1677 
   1678     /* \0 always starts an octal number, but we may drop through to here with a
   1679     larger first octal digit. The original code used just to take the least
   1680     significant 8 bits of octal numbers (I think this is what early Perls used
   1681     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
   1682     but no more than 3 octal digits. */
   1683 
   1684     case CHAR_0:
   1685     c -= CHAR_0;
   1686     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
   1687         c = c * 8 + *ptr++ - CHAR_0;
   1688 #if PCRE2_CODE_UNIT_WIDTH == 8
   1689     if (!utf && c > 0xff) *errorcodeptr = ERR51;
   1690 #endif
   1691     break;
   1692 
   1693     /* \o is a relatively new Perl feature, supporting a more general way of
   1694     specifying character codes in octal. The only supported form is \o{ddd}. */
   1695 
   1696     case CHAR_o:
   1697     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
   1698       {
   1699       ptr--;
   1700       *errorcodeptr = ERR55;
   1701       }
   1702     else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
   1703       *errorcodeptr = ERR78;
   1704     else
   1705       {
   1706       c = 0;
   1707       overflow = FALSE;
   1708       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
   1709         {
   1710         cc = *ptr++;
   1711         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
   1712 #if PCRE2_CODE_UNIT_WIDTH == 32
   1713         if (c >= 0x20000000l) { overflow = TRUE; break; }
   1714 #endif
   1715         c = (c << 3) + (cc - CHAR_0);
   1716 #if PCRE2_CODE_UNIT_WIDTH == 8
   1717         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
   1718 #elif PCRE2_CODE_UNIT_WIDTH == 16
   1719         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
   1720 #elif PCRE2_CODE_UNIT_WIDTH == 32
   1721         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
   1722 #endif
   1723         }
   1724       if (overflow)
   1725         {
   1726         while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
   1727         *errorcodeptr = ERR34;
   1728         }
   1729       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
   1730         {
   1731         if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
   1732             (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
   1733           {
   1734           ptr--;
   1735           *errorcodeptr = ERR73;
   1736           }
   1737         }
   1738       else
   1739         {
   1740         ptr--;
   1741         *errorcodeptr = ERR64;
   1742         }
   1743       }
   1744     break;
   1745 
   1746     /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
   1747     two hexadecimal digits. Otherwise it is a lowercase x letter. */
   1748 
   1749     case CHAR_x:
   1750     if ((options & PCRE2_ALT_BSUX) != 0)
   1751       {
   1752       uint32_t xc;
   1753       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
   1754       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
   1755       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
   1756       c = (cc << 4) | xc;
   1757       ptr += 2;
   1758       }    /* End PCRE2_ALT_BSUX handling */
   1759 
   1760     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
   1761     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
   1762     digits. If not, { used to be treated as a data character. However, Perl
   1763     seems to read hex digits up to the first non-such, and ignore the rest, so
   1764     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
   1765     now gives an error. */
   1766 
   1767     else
   1768       {
   1769       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
   1770         {
   1771 #ifndef EBCDIC
   1772         COME_FROM_NU:
   1773 #endif
   1774         if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
   1775           {
   1776           *errorcodeptr = ERR78;
   1777           break;
   1778           }
   1779         c = 0;
   1780         overflow = FALSE;
   1781 
   1782         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
   1783           {
   1784           ptr++;
   1785           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
   1786 #if PCRE2_CODE_UNIT_WIDTH == 32
   1787           if (c >= 0x10000000l) { overflow = TRUE; break; }
   1788 #endif
   1789           c = (c << 4) | cc;
   1790           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
   1791             {
   1792             overflow = TRUE;
   1793             break;
   1794             }
   1795           }
   1796 
   1797         if (overflow)
   1798           {
   1799           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
   1800           *errorcodeptr = ERR34;
   1801           }
   1802         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
   1803           {
   1804           if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
   1805               (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
   1806             {
   1807             ptr--;
   1808             *errorcodeptr = ERR73;
   1809             }
   1810           }
   1811 
   1812         /* If the sequence of hex digits does not end with '}', give an error.
   1813         We used just to recognize this construct and fall through to the normal
   1814         \x handling, but nowadays Perl gives an error, which seems much more
   1815         sensible, so we do too. */
   1816 
   1817         else
   1818           {
   1819           ptr--;
   1820           *errorcodeptr = ERR67;
   1821           }
   1822         }   /* End of \x{} processing */
   1823 
   1824       /* Read a up to two hex digits after \x */
   1825 
   1826       else
   1827         {
   1828         c = 0;
   1829         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
   1830         ptr++;
   1831         c = cc;
   1832         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
   1833         ptr++;
   1834         c = (c << 4) | cc;
   1835         }     /* End of \xdd handling */
   1836       }       /* End of Perl-style \x handling */
   1837     break;
   1838 
   1839     /* The handling of \c is different in ASCII and EBCDIC environments. In an
   1840     ASCII (or Unicode) environment, an error is given if the character
   1841     following \c is not a printable ASCII character. Otherwise, the following
   1842     character is upper-cased if it is a letter, and after that the 0x40 bit is
   1843     flipped. The result is the value of the escape.
   1844 
   1845     In an EBCDIC environment the handling of \c is compatible with the
   1846     specification in the perlebcdic document. The following character must be
   1847     a letter or one of small number of special characters. These provide a
   1848     means of defining the character values 0-31.
   1849 
   1850     For testing the EBCDIC handling of \c in an ASCII environment, recognize
   1851     the EBCDIC value of 'c' explicitly. */
   1852 
   1853 #if defined EBCDIC && 'a' != 0x81
   1854     case 0x83:
   1855 #else
   1856     case CHAR_c:
   1857 #endif
   1858     if (ptr >= ptrend)
   1859       {
   1860       *errorcodeptr = ERR2;
   1861       break;
   1862       }
   1863     c = *ptr;
   1864     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
   1865 
   1866     /* Handle \c in an ASCII/Unicode environment. */
   1867 
   1868 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
   1869     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
   1870       {
   1871       *errorcodeptr = ERR68;
   1872       break;
   1873       }
   1874     c ^= 0x40;
   1875 
   1876     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
   1877     255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC
   1878     encoding. (This is the way Perl indicates that it handles \c?.) The other
   1879     valid sequences correspond to a list of specific characters. */
   1880 
   1881 #else
   1882     if (c == CHAR_QUESTION_MARK)
   1883       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
   1884     else
   1885       {
   1886       for (i = 0; i < 32; i++)
   1887         {
   1888         if (c == ebcdic_escape_c[i]) break;
   1889         }
   1890       if (i < 32) c = i; else *errorcodeptr = ERR68;
   1891       }
   1892 #endif  /* EBCDIC */
   1893 
   1894     ptr++;
   1895     break;
   1896 
   1897     /* Any other alphanumeric following \ is an error. Perl gives an error only
   1898     if in warning mode, but PCRE doesn't have a warning mode. */
   1899 
   1900     default:
   1901     *errorcodeptr = ERR3;
   1902     *ptrptr = ptr - 1;     /* Point to the character at fault */
   1903     return 0;
   1904     }
   1905   }
   1906 
   1907 /* Set the pointer to the next character before returning. */
   1908 
   1909 *ptrptr = ptr;
   1910 *chptr = c;
   1911 return escape;
   1912 }
   1913 
   1914 
   1915 
   1916 #ifdef SUPPORT_UNICODE
   1917 /*************************************************
   1918 *               Handle \P and \p                 *
   1919 *************************************************/
   1920 
   1921 /* This function is called after \P or \p has been encountered, provided that
   1922 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
   1923 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
   1924 after the final code unit of the escape sequence.
   1925 
   1926 Arguments:
   1927   ptrptr         the pattern position pointer
   1928   negptr         a boolean that is set TRUE for negation else FALSE
   1929   ptypeptr       an unsigned int that is set to the type value
   1930   pdataptr       an unsigned int that is set to the detailed property value
   1931   errorcodeptr   the error code variable
   1932   cb             the compile data
   1933 
   1934 Returns:         TRUE if the type value was found, or FALSE for an invalid type
   1935 */
   1936 
   1937 static BOOL
   1938 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
   1939   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
   1940 {
   1941 PCRE2_UCHAR c;
   1942 PCRE2_SIZE i, bot, top;
   1943 PCRE2_SPTR ptr = *ptrptr;
   1944 PCRE2_UCHAR name[32];
   1945 
   1946 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
   1947 c = *ptr++;
   1948 *negptr = FALSE;
   1949 
   1950 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
   1951 negation. */
   1952 
   1953 if (c == CHAR_LEFT_CURLY_BRACKET)
   1954   {
   1955   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
   1956   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
   1957     {
   1958     *negptr = TRUE;
   1959     ptr++;
   1960     }
   1961   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
   1962     {
   1963     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
   1964     c = *ptr++;
   1965     if (c == CHAR_NUL) goto ERROR_RETURN;
   1966     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
   1967     name[i] = c;
   1968     }
   1969   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
   1970   name[i] = 0;
   1971   }
   1972 
   1973 /* Otherwise there is just one following character, which must be an ASCII
   1974 letter. */
   1975 
   1976 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
   1977   {
   1978   name[0] = c;
   1979   name[1] = 0;
   1980   }
   1981 else goto ERROR_RETURN;
   1982 
   1983 *ptrptr = ptr;
   1984 
   1985 /* Search for a recognized property name using binary chop. */
   1986 
   1987 bot = 0;
   1988 top = PRIV(utt_size);
   1989 
   1990 while (bot < top)
   1991   {
   1992   int r;
   1993   i = (bot + top) >> 1;
   1994   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
   1995   if (r == 0)
   1996     {
   1997     *ptypeptr = PRIV(utt)[i].type;
   1998     *pdataptr = PRIV(utt)[i].value;
   1999     return TRUE;
   2000     }
   2001   if (r > 0) bot = i + 1; else top = i;
   2002   }
   2003 *errorcodeptr = ERR47;   /* Unrecognized name */
   2004 return FALSE;
   2005 
   2006 ERROR_RETURN:            /* Malformed \P or \p */
   2007 *errorcodeptr = ERR46;
   2008 *ptrptr = ptr;
   2009 return FALSE;
   2010 }
   2011 #endif
   2012 
   2013 
   2014 
   2015 /*************************************************
   2016 *           Check for POSIX class syntax         *
   2017 *************************************************/
   2018 
   2019 /* This function is called when the sequence "[:" or "[." or "[=" is
   2020 encountered in a character class. It checks whether this is followed by a
   2021 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
   2022 reach an unescaped ']' without the special preceding character, return FALSE.
   2023 
   2024 Originally, this function only recognized a sequence of letters between the
   2025 terminators, but it seems that Perl recognizes any sequence of characters,
   2026 though of course unknown POSIX names are subsequently rejected. Perl gives an
   2027 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
   2028 didn't consider this to be a POSIX class. Likewise for [:1234:].
   2029 
   2030 The problem in trying to be exactly like Perl is in the handling of escapes. We
   2031 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
   2032 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
   2033 below handles the special cases \\ and \], but does not try to do any other
   2034 escape processing. This makes it different from Perl for cases such as
   2035 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
   2036 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
   2037 when Perl does, I think.
   2038 
   2039 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
   2040 It seems that the appearance of a nested POSIX class supersedes an apparent
   2041 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
   2042 a digit. This is handled by returning FALSE if the start of a new group with
   2043 the same terminator is encountered, since the next closing sequence must close
   2044 the nested group, not the outer one.
   2045 
   2046 In Perl, unescaped square brackets may also appear as part of class names. For
   2047 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
   2048 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
   2049 seem right at all. PCRE does not allow closing square brackets in POSIX class
   2050 names.
   2051 
   2052 Arguments:
   2053   ptr      pointer to the character after the initial [ (colon, dot, equals)
   2054   ptrend   pointer to the end of the pattern
   2055   endptr   where to return a pointer to the terminating ':', '.', or '='
   2056 
   2057 Returns:   TRUE or FALSE
   2058 */
   2059 
   2060 static BOOL
   2061 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
   2062 {
   2063 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
   2064 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
   2065 
   2066 for (; ptrend - ptr >= 2; ptr++)
   2067   {
   2068   if (*ptr == CHAR_BACKSLASH &&
   2069       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
   2070     ptr++;
   2071 
   2072   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
   2073             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
   2074 
   2075   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   2076     {
   2077     *endptr = ptr;
   2078     return TRUE;
   2079     }
   2080   }
   2081 
   2082 return FALSE;
   2083 }
   2084 
   2085 
   2086 
   2087 /*************************************************
   2088 *          Check POSIX class name                *
   2089 *************************************************/
   2090 
   2091 /* This function is called to check the name given in a POSIX-style class entry
   2092 such as [:alnum:].
   2093 
   2094 Arguments:
   2095   ptr        points to the first letter
   2096   len        the length of the name
   2097 
   2098 Returns:     a value representing the name, or -1 if unknown
   2099 */
   2100 
   2101 static int
   2102 check_posix_name(PCRE2_SPTR ptr, int len)
   2103 {
   2104 const char *pn = posix_names;
   2105 int yield = 0;
   2106 while (posix_name_lengths[yield] != 0)
   2107   {
   2108   if (len == posix_name_lengths[yield] &&
   2109     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
   2110   pn += posix_name_lengths[yield] + 1;
   2111   yield++;
   2112   }
   2113 return -1;
   2114 }
   2115 
   2116 
   2117 
   2118 /*************************************************
   2119 *       Read a subpattern or VERB name           *
   2120 *************************************************/
   2121 
   2122 /* This function is called from parse_regex() below whenever it needs to read
   2123 the name of a subpattern or a (*VERB). The initial pointer must be to the
   2124 character before the name. If that character is '*' we are reading a verb name.
   2125 The pointer is updated to point after the name, for a VERB, or after tha name's
   2126 terminator for a subpattern name. Returning both the offset and the name
   2127 pointer is redundant information, but some callers use one and some the other,
   2128 so it is simplest just to return both.
   2129 
   2130 Arguments:
   2131   ptrptr      points to the character pointer variable
   2132   ptrend      points to the end of the input string
   2133   terminator  the terminator of a subpattern name must be this
   2134   offsetptr   where to put the offset from the start of the pattern
   2135   nameptr     where to put a pointer to the name in the input
   2136   namelenptr  where to put the length of the name
   2137   errcodeptr  where to put an error code
   2138   cb          pointer to the compile data block
   2139 
   2140 Returns:    TRUE if a name was read
   2141             FALSE otherwise, with error code set
   2142 */
   2143 
   2144 static BOOL
   2145 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t terminator,
   2146   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
   2147   int *errorcodeptr, compile_block *cb)
   2148 {
   2149 PCRE2_SPTR ptr = *ptrptr;
   2150 BOOL is_verb = (*ptr == CHAR_ASTERISK);
   2151 uint32_t namelen = 0;
   2152 uint32_t ctype = is_verb? ctype_letter : ctype_word;
   2153 
   2154 if (++ptr >= ptrend)
   2155   {
   2156   *errorcodeptr = is_verb? ERR60:  /* Verb not recognized or malformed */
   2157                            ERR62;  /* Subpattern name expected */
   2158   goto FAILED;
   2159   }
   2160 
   2161 *nameptr = ptr;
   2162 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
   2163 
   2164 if (IS_DIGIT(*ptr))
   2165   {
   2166   *errorcodeptr = ERR44;   /* Group name must not start with digit */
   2167   goto FAILED;
   2168   }
   2169 
   2170 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype) != 0)
   2171   {
   2172   ptr++;
   2173   namelen++;
   2174   if (namelen > MAX_NAME_SIZE)
   2175     {
   2176     *errorcodeptr = ERR48;
   2177     goto FAILED;
   2178     }
   2179   }
   2180 
   2181 /* Subpattern names must not be empty, and their terminator is checked here.
   2182 (What follows a verb name is checked separately.) */
   2183 
   2184 if (!is_verb)
   2185   {
   2186   if (namelen == 0)
   2187     {
   2188     *errorcodeptr = ERR62;   /* Subpattern name expected */
   2189     goto FAILED;
   2190     }
   2191   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
   2192     {
   2193     *errorcodeptr = ERR42;
   2194     goto FAILED;
   2195     }
   2196   ptr++;
   2197   }
   2198 
   2199 *namelenptr = namelen;
   2200 *ptrptr = ptr;
   2201 return TRUE;
   2202 
   2203 FAILED:
   2204 *ptrptr = ptr;
   2205 return FALSE;
   2206 }
   2207 
   2208 
   2209 
   2210 /*************************************************
   2211 *          Manage callouts at start of cycle     *
   2212 *************************************************/
   2213 
   2214 /* At the start of a new item in parse_regex() we are able to record the
   2215 details of the previous item in a prior callout, and also to set up an
   2216 automatic callout if enabled. Avoid having two adjacent automatic callouts,
   2217 which would otherwise happen for items such as \Q that contribute nothing to
   2218 the parsed pattern.
   2219 
   2220 Arguments:
   2221   ptr              current pattern pointer
   2222   pcalloutptr      points to a pointer to previous callout, or NULL
   2223   auto_callout     TRUE if auto_callouts are enabled
   2224   parsed_pattern   the parsed pattern pointer
   2225   cb               compile block
   2226 
   2227 Returns: possibly updated parsed_pattern pointer.
   2228 */
   2229 
   2230 static uint32_t *
   2231 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
   2232   uint32_t *parsed_pattern, compile_block *cb)
   2233 {
   2234 uint32_t *previous_callout = *pcalloutptr;
   2235 
   2236 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
   2237   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
   2238 
   2239 if (!auto_callout) previous_callout = NULL; else
   2240   {
   2241   if (previous_callout == NULL ||
   2242       previous_callout != parsed_pattern - 4 ||
   2243       previous_callout[3] != 255)
   2244     {
   2245     previous_callout = parsed_pattern;  /* Set up new automatic callout */
   2246     parsed_pattern += 4;
   2247     previous_callout[0] = META_CALLOUT_NUMBER;
   2248     previous_callout[2] = 0;
   2249     previous_callout[3] = 255;
   2250     }
   2251   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
   2252   }
   2253 
   2254 *pcalloutptr = previous_callout;
   2255 return parsed_pattern;
   2256 }
   2257 
   2258 
   2259 
   2260 /*************************************************
   2261 *      Parse regex and identify named groups     *
   2262 *************************************************/
   2263 
   2264 /* This function is called first of all. It scans the pattern and does two
   2265 things: (1) It identifies capturing groups and makes a table of named capturing
   2266 groups so that information about them is fully available to both the compiling
   2267 scans. (2) It writes a parsed version of the pattern with comments omitted and
   2268 escapes processed into the parsed_pattern vector.
   2269 
   2270 Arguments:
   2271   ptr             points to the start of the pattern
   2272   options         compiling dynamic options (may change during the scan)
   2273   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
   2274   cb              pointer to the compile data block
   2275 
   2276 Returns:   zero on success or a non-zero error code, with the
   2277              error offset placed in the cb field
   2278 */
   2279 
   2280 /* A structure and some flags for dealing with nested groups. */
   2281 
   2282 typedef struct nest_save {
   2283   uint16_t  nest_depth;
   2284   uint16_t  reset_group;
   2285   uint16_t  max_group;
   2286   uint16_t  flags;
   2287   uint32_t  options;
   2288 } nest_save;
   2289 
   2290 #define NSF_RESET          0x0001u
   2291 #define NSF_CONDASSERT     0x0002u
   2292 
   2293 /* Options that are changeable within the pattern must be tracked during
   2294 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
   2295 but all must be tracked so that META_OPTIONS items set the correct values for
   2296 the main compiling phase. */
   2297 
   2298 #define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   2299   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
   2300   PCRE2_UNGREEDY)
   2301 
   2302 /* States used for analyzing ranges in character classes. The two OK values
   2303 must be last. */
   2304 
   2305 enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
   2306 
   2307 /* Only in 32-bit mode can there be literals > META_END. A macros encapsulates
   2308 the storing of literal values in the parsed pattern. */
   2309 
   2310 #if PCRE2_CODE_UNIT_WIDTH == 32
   2311 #define PARSED_LITERAL(c, p) \
   2312   { \
   2313   if (c >= META_END) *p++ = META_BIGVALUE; \
   2314   *p++ = c; \
   2315   okquantifier = TRUE; \
   2316   }
   2317 #else
   2318 #define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
   2319 #endif
   2320 
   2321 /* Here's the actual function. */
   2322 
   2323 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
   2324   compile_block *cb)
   2325 {
   2326 uint32_t c;
   2327 uint32_t delimiter;
   2328 uint32_t namelen;
   2329 uint32_t class_range_state;
   2330 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
   2331 uint32_t *previous_callout = NULL;
   2332 uint32_t *parsed_pattern = cb->parsed_pattern;
   2333 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
   2334 uint32_t meta_quantifier = 0;
   2335 uint32_t add_after_mark = 0;
   2336 uint16_t nest_depth = 0;
   2337 int after_manual_callout = 0;
   2338 int expect_cond_assert = 0;
   2339 int errorcode = 0;
   2340 int escape;
   2341 int i;
   2342 BOOL inescq = FALSE;
   2343 BOOL inverbname = FALSE;
   2344 BOOL utf = (options & PCRE2_UTF) != 0;
   2345 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
   2346 BOOL isdupname;
   2347 BOOL negate_class;
   2348 BOOL okquantifier = FALSE;
   2349 PCRE2_SPTR thisptr;
   2350 PCRE2_SPTR name;
   2351 PCRE2_SPTR ptrend = cb->end_pattern;
   2352 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
   2353 named_group *ng;
   2354 nest_save *top_nest, *end_nests;
   2355 
   2356 /* Insert leading items for word and line matching (features provided for the
   2357 benefit of pcre2grep). */
   2358 
   2359 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
   2360   {
   2361   *parsed_pattern++ = META_CIRCUMFLEX;
   2362   *parsed_pattern++ = META_NOCAPTURE;
   2363   }
   2364 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
   2365   {
   2366   *parsed_pattern++ = META_ESCAPE + ESC_b;
   2367   *parsed_pattern++ = META_NOCAPTURE;
   2368   }
   2369 
   2370 /* If the pattern is actually a literal string, process it separately to avoid
   2371 cluttering up the main loop. */
   2372 
   2373 if ((options & PCRE2_LITERAL) != 0)
   2374   {
   2375   while (ptr < ptrend)
   2376     {
   2377     if (parsed_pattern >= parsed_pattern_end)
   2378       {
   2379       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
   2380       goto FAILED;
   2381       }
   2382     thisptr = ptr;
   2383     GETCHARINCTEST(c, ptr);
   2384     if (auto_callout)
   2385       parsed_pattern = manage_callouts(thisptr, &previous_callout,
   2386         auto_callout, parsed_pattern, cb);
   2387     PARSED_LITERAL(c, parsed_pattern);
   2388     }
   2389   goto PARSED_END;
   2390   }
   2391 
   2392 /* Process a real regex which may contain meta-characters. */
   2393 
   2394 top_nest = NULL;
   2395 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
   2396 
   2397 /* The size of the nest_save structure might not be a factor of the size of the
   2398 workspace. Therefore we must round down end_nests so as to correctly avoid
   2399 creating a nest_save that spans the end of the workspace. */
   2400 
   2401 end_nests = (nest_save *)((char *)end_nests -
   2402   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
   2403 
   2404 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
   2405 
   2406 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
   2407 
   2408 /* Now scan the pattern */
   2409 
   2410 while (ptr < ptrend)
   2411   {
   2412   int prev_expect_cond_assert;
   2413   uint32_t min_repeat, max_repeat;
   2414   uint32_t set, unset, *optset;
   2415   uint32_t terminator;
   2416   uint32_t prev_meta_quantifier;
   2417   BOOL prev_okquantifier;
   2418   PCRE2_SPTR tempptr;
   2419   PCRE2_SIZE offset;
   2420 
   2421   if (parsed_pattern >= parsed_pattern_end)
   2422     {
   2423     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
   2424     goto FAILED;
   2425     }
   2426 
   2427   if (nest_depth > cb->cx->parens_nest_limit)
   2428     {
   2429     errorcode = ERR19;
   2430     goto FAILED;        /* Parentheses too deeply nested */
   2431     }
   2432 
   2433   /* Get next input character, save its position for callout handling. */
   2434 
   2435   thisptr = ptr;
   2436   GETCHARINCTEST(c, ptr);
   2437 
   2438   /* Copy quoted literals until \E, allowing for the possibility of automatic
   2439   callouts, except when processing a (*VERB) "name".  */
   2440 
   2441   if (inescq)
   2442     {
   2443     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
   2444       {
   2445       inescq = FALSE;
   2446       ptr++;   /* Skip E */
   2447       }
   2448     else
   2449       {
   2450       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
   2451         {                           /* expecting a conditional assertion, */
   2452         ptr--;                      /* but an empty \Q\E sequence is OK.  */
   2453         errorcode = ERR28;
   2454         goto FAILED;
   2455         }
   2456       if (!inverbname && after_manual_callout-- <= 0)
   2457         parsed_pattern = manage_callouts(thisptr, &previous_callout,
   2458           auto_callout, parsed_pattern, cb);
   2459       PARSED_LITERAL(c, parsed_pattern);
   2460       meta_quantifier = 0;
   2461       }
   2462     continue;  /* Next character */
   2463     }
   2464 
   2465   /* If we are processing the "name" part of a (*VERB:NAME) item, all
   2466   characters up to the closing parenthesis are literals except when
   2467   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
   2468   and \E and escaped characters are allowed (no character types such as \d). If
   2469   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
   2470   this by not entering the special (*VERB:NAME) processing - they are then
   2471   picked up below. Note that c is a character, not a code unit, so we must not
   2472   use MAX_255 to test its size because MAX_255 tests code units and is assumed
   2473   TRUE in 8-bit mode. */
   2474 
   2475   if (inverbname &&
   2476        (
   2477         /* EITHER: not both options set */
   2478         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
   2479                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
   2480 #ifdef SUPPORT_UNICODE
   2481         /* OR: character > 255 AND not Unicode Pattern White Space */
   2482         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
   2483 #endif
   2484         /* OR: not a # comment or isspace() white space */
   2485         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
   2486 #ifdef SUPPORT_UNICODE
   2487         /* and not CHAR_NEL when Unicode is supported */
   2488           && c != CHAR_NEL
   2489 #endif
   2490        )))
   2491     {
   2492     PCRE2_SIZE verbnamelength;
   2493 
   2494     switch(c)
   2495       {
   2496       default:
   2497       PARSED_LITERAL(c, parsed_pattern);
   2498       break;
   2499 
   2500       case CHAR_RIGHT_PARENTHESIS:
   2501       inverbname = FALSE;
   2502       okquantifier = FALSE;   /* Was probably set by literals */
   2503       /* This is the length in characters */
   2504       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
   2505       /* But the limit on the length is in code units */
   2506       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
   2507         {
   2508         ptr--;
   2509         errorcode = ERR76;
   2510         goto FAILED;
   2511         }
   2512       *verblengthptr = (uint32_t)verbnamelength;
   2513 
   2514       /* If this name was on a verb such as (*ACCEPT) which does not continue,
   2515       a (*MARK) was generated for the name. We now add the original verb as the
   2516       next item. */
   2517 
   2518       if (add_after_mark != 0)
   2519         {
   2520         *parsed_pattern++ = add_after_mark;
   2521         add_after_mark = 0;
   2522         }
   2523       break;
   2524 
   2525       case CHAR_BACKSLASH:
   2526       if ((options & PCRE2_ALT_VERBNAMES) != 0)
   2527         {
   2528         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
   2529           FALSE, cb);
   2530         if (errorcode != 0) goto FAILED;
   2531         }
   2532       else escape = 0;   /* Treat all as literal */
   2533 
   2534       switch(escape)
   2535         {
   2536         case 0:
   2537         PARSED_LITERAL(c, parsed_pattern);
   2538         break;
   2539 
   2540         case ESC_Q:
   2541         inescq = TRUE;
   2542         break;
   2543 
   2544         case ESC_E:           /* Ignore */
   2545         break;
   2546 
   2547         default:
   2548         errorcode = ERR40;    /* Invalid in verb name */
   2549         goto FAILED;
   2550         }
   2551       }
   2552     continue;   /* Next character in pattern */
   2553     }
   2554 
   2555   /* Not a verb name character. At this point we must process everything that
   2556   must not change the quantification state. This is mainly comments, but we
   2557   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
   2558   A+, as in Perl. An isolated \E is ignored. */
   2559 
   2560   if (c == CHAR_BACKSLASH && ptr < ptrend)
   2561     {
   2562     if (*ptr == CHAR_Q || *ptr == CHAR_E)
   2563       {
   2564       inescq = *ptr == CHAR_Q;
   2565       ptr++;
   2566       continue;
   2567       }
   2568     }
   2569 
   2570   /* Skip over whitespace and # comments in extended mode. Note that c is a
   2571   character, not a code unit, so we must not use MAX_255 to test its size
   2572   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
   2573   whitespace characters are those designated as "Pattern White Space" by
   2574   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
   2575   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
   2576   subset of space characters that match \h and \v. */
   2577 
   2578   if ((options & PCRE2_EXTENDED) != 0)
   2579     {
   2580     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
   2581 #ifdef SUPPORT_UNICODE
   2582     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
   2583 #endif
   2584     if (c == CHAR_NUMBER_SIGN)
   2585       {
   2586       while (ptr < ptrend)
   2587         {
   2588         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
   2589           {                       /* IS_NEWLINE sets cb->nllen. */
   2590           ptr += cb->nllen;
   2591           break;
   2592           }
   2593         ptr++;
   2594 #ifdef SUPPORT_UNICODE
   2595         if (utf) FORWARDCHARTEST(ptr, ptrend);
   2596 #endif
   2597         }
   2598       continue;  /* Next character in pattern */
   2599       }
   2600     }
   2601 
   2602   /* Skip over bracketed comments */
   2603 
   2604   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
   2605       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
   2606     {
   2607     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
   2608     if (ptr >= ptrend)
   2609       {
   2610       errorcode = ERR18;  /* A special error for missing ) in a comment */
   2611       goto FAILED;        /* to make it easier to debug. */
   2612       }
   2613     ptr++;
   2614     continue;  /* Next character in pattern */
   2615     }
   2616 
   2617   /* If the next item is not a quantifier, fill in length of any previous
   2618   callout and create an auto callout if required. */
   2619 
   2620   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
   2621        (c != CHAR_LEFT_CURLY_BRACKET ||
   2622          (tempptr = ptr,
   2623          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
   2624     {
   2625     if (after_manual_callout-- <= 0)
   2626       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
   2627         parsed_pattern, cb);
   2628     }
   2629 
   2630   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
   2631   assertion, possibly preceded by a callout. If the value is 1, we have just
   2632   had the callout and expect an assertion. There must be at least 3 more
   2633   characters in all cases. When expect_cond_assert is 2, we know that the
   2634   current character is an opening parenthesis, as otherwise we wouldn't be
   2635   here. However, when it is 1, we need to check, and it's easiest just to check
   2636   always. Note that expect_cond_assert may be negative, since all callouts just
   2637   decrement it. */
   2638 
   2639   if (expect_cond_assert > 0)
   2640     {
   2641     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
   2642               ptr[0] == CHAR_QUESTION_MARK;
   2643     if (ok) switch(ptr[1])
   2644       {
   2645       case CHAR_C:
   2646       ok = expect_cond_assert == 2;
   2647       break;
   2648 
   2649       case CHAR_EQUALS_SIGN:
   2650       case CHAR_EXCLAMATION_MARK:
   2651       break;
   2652 
   2653       case CHAR_LESS_THAN_SIGN:
   2654       ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
   2655       break;
   2656 
   2657       default:
   2658       ok = FALSE;
   2659       }
   2660 
   2661     if (!ok)
   2662       {
   2663       ptr--;   /* Adjust error offset */
   2664       errorcode = ERR28;
   2665       goto FAILED;
   2666       }
   2667     }
   2668 
   2669   /* Remember whether we are expecting a conditional assertion, and set the
   2670   default for this item. */
   2671 
   2672   prev_expect_cond_assert = expect_cond_assert;
   2673   expect_cond_assert = 0;
   2674 
   2675   /* Remember quantification status for the previous significant item, then set
   2676   default for this item. */
   2677 
   2678   prev_okquantifier = okquantifier;
   2679   prev_meta_quantifier = meta_quantifier;
   2680   okquantifier = FALSE;
   2681   meta_quantifier = 0;
   2682 
   2683   /* If the previous significant item was a quantifier, adjust the parsed code
   2684   if there is a following modifier. The base meta value is always followed by
   2685   the PLUS and QUERY values, in that order. We do this here rather than after
   2686   reading a quantifier so that intervening comments and /x whitespace can be
   2687   ignored without having to replicate code. */
   2688 
   2689   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
   2690     {
   2691     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
   2692       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
   2693         0x00020000u : 0x00010000u);
   2694     continue;  /* Next character in pattern */
   2695     }
   2696 
   2697 
   2698   /* Process the next item in the main part of a pattern. */
   2699 
   2700   switch(c)
   2701     {
   2702     default:              /* Non-special character */
   2703     PARSED_LITERAL(c, parsed_pattern);
   2704     break;
   2705 
   2706 
   2707     /* ---- Escape sequence ---- */
   2708 
   2709     case CHAR_BACKSLASH:
   2710     tempptr = ptr;
   2711     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
   2712       FALSE, cb);
   2713     if (errorcode != 0)
   2714       {
   2715       ESCAPE_FAILED:
   2716       if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
   2717         goto FAILED;
   2718       ptr = tempptr;
   2719       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
   2720         {
   2721         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
   2722         }
   2723       escape = 0;                 /* Treat as literal character */
   2724       }
   2725 
   2726     /* The escape was a data escape or literal character. */
   2727 
   2728     if (escape == 0)
   2729       {
   2730       PARSED_LITERAL(c, parsed_pattern);
   2731       }
   2732 
   2733     /* The escape was a back (or forward) reference. We keep the offset in
   2734     order to give a more useful diagnostic for a bad forward reference. For
   2735     references to groups numbered less than 10 we can't use more than two items
   2736     in parsed_pattern because they may be just two characters in the input (and
   2737     in a 64-bit world an offset may need two elements). So for them, the offset
   2738     of the first occurrence is held in a special vector. */
   2739 
   2740     else if (escape < 0)
   2741       {
   2742       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
   2743       escape = -escape;
   2744       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
   2745       if (escape < 10)
   2746         {
   2747         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
   2748           cb->small_ref_offset[escape] = offset;
   2749         }
   2750       else
   2751         {
   2752         PUTOFFSET(offset, parsed_pattern);
   2753         }
   2754       okquantifier = TRUE;
   2755       }
   2756 
   2757     /* The escape was a character class such as \d etc. or other special
   2758     escape indicator such as \A or \X. Most of them generate just a single
   2759     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
   2760     value. They are supported only when Unicode is available. The type and
   2761     value are packed into a single 32-bit value so that the whole sequences
   2762     uses only two elements in the parsed_vector. This is because the same
   2763     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
   2764     set.
   2765 
   2766     There are also some cases where the escape sequence is followed by a name:
   2767     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
   2768     and \g'name' are subroutine calls by name; \g{name} is a synonym for
   2769     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
   2770     and returned as a negative value (handled above). A name is coded as an
   2771     offset into the pattern and a length. */
   2772 
   2773     else switch (escape)
   2774       {
   2775       case ESC_C:
   2776 #ifdef NEVER_BACKSLASH_C
   2777       errorcode = ERR85;
   2778       goto ESCAPE_FAILED;
   2779 #else
   2780       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
   2781         {
   2782         errorcode = ERR83;
   2783         goto ESCAPE_FAILED;
   2784         }
   2785 #endif
   2786       okquantifier = TRUE;
   2787       *parsed_pattern++ = META_ESCAPE + escape;
   2788       break;
   2789 
   2790       case ESC_X:
   2791 #ifndef SUPPORT_UNICODE
   2792       errorcode = ERR45;   /* Supported only with Unicode support */
   2793       goto ESCAPE_FAILED;
   2794 #endif
   2795       case ESC_H:
   2796       case ESC_h:
   2797       case ESC_N:
   2798       case ESC_R:
   2799       case ESC_V:
   2800       case ESC_v:
   2801       okquantifier = TRUE;
   2802       *parsed_pattern++ = META_ESCAPE + escape;
   2803       break;
   2804 
   2805       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
   2806       *parsed_pattern++ = META_ESCAPE + escape;
   2807       break;
   2808 
   2809       /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
   2810       without Unicode support because it is checked when pcre2_compile() is
   2811       called. */
   2812 
   2813       case ESC_d:
   2814       case ESC_D:
   2815       case ESC_s:
   2816       case ESC_S:
   2817       case ESC_w:
   2818       case ESC_W:
   2819       okquantifier = TRUE;
   2820       if ((options & PCRE2_UCP) == 0)
   2821         {
   2822         *parsed_pattern++ = META_ESCAPE + escape;
   2823         }
   2824       else
   2825         {
   2826         *parsed_pattern++ = META_ESCAPE +
   2827           ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
   2828             ESC_p : ESC_P);
   2829         switch(escape)
   2830           {
   2831           case ESC_d:
   2832           case ESC_D:
   2833           *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
   2834           break;
   2835 
   2836           case ESC_s:
   2837           case ESC_S:
   2838           *parsed_pattern++ = PT_SPACE << 16;
   2839           break;
   2840 
   2841           case ESC_w:
   2842           case ESC_W:
   2843           *parsed_pattern++ = PT_WORD << 16;
   2844           break;
   2845           }
   2846         }
   2847       break;
   2848 
   2849       /* Unicode property matching */
   2850 
   2851       case ESC_P:
   2852       case ESC_p:
   2853 #ifdef SUPPORT_UNICODE
   2854         {
   2855         BOOL negated;
   2856         uint16_t ptype = 0, pdata = 0;
   2857         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
   2858           goto ESCAPE_FAILED;
   2859         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
   2860         *parsed_pattern++ = META_ESCAPE + escape;
   2861         *parsed_pattern++ = (ptype << 16) | pdata;
   2862         okquantifier = TRUE;
   2863         }
   2864 #else
   2865       errorcode = ERR45;
   2866       goto ESCAPE_FAILED;
   2867 #endif
   2868       break;  /* End \P and \p */
   2869 
   2870       /* When \g is used with quotes or angle brackets as delimiters, it is a
   2871       numerical or named subroutine call, and control comes here. When used
   2872       with brace delimiters it is a numerical back reference and does not come
   2873       here because check_escape() returns it directly as a reference. \k is
   2874       always a named back reference. */
   2875 
   2876       case ESC_g:
   2877       case ESC_k:
   2878       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
   2879           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
   2880         {
   2881         errorcode = (escape == ESC_g)? ERR57 : ERR69;
   2882         goto ESCAPE_FAILED;
   2883         }
   2884       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
   2885         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
   2886         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
   2887 
   2888       /* For a non-braced \g, check for a numerical recursion. */
   2889 
   2890       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
   2891         {
   2892         PCRE2_SPTR p = ptr + 1;
   2893 
   2894         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
   2895             &errorcode))
   2896           {
   2897           if (p >= ptrend || *p != terminator)
   2898             {
   2899             errorcode = ERR57;
   2900             goto ESCAPE_FAILED;
   2901             }
   2902           ptr = p;
   2903           goto SET_RECURSION;
   2904           }
   2905         if (errorcode != 0) goto ESCAPE_FAILED;
   2906         }
   2907 
   2908       /* Not a numerical recursion */
   2909 
   2910       if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
   2911           &errorcode, cb)) goto ESCAPE_FAILED;
   2912 
   2913       /* \k and \g when used with braces are back references, whereas \g used
   2914       with quotes or angle brackets is a recursion */
   2915 
   2916       *parsed_pattern++ =
   2917         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
   2918           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
   2919       *parsed_pattern++ = namelen;
   2920 
   2921       PUTOFFSET(offset, parsed_pattern);
   2922       okquantifier = TRUE;
   2923       break;  /* End special escape processing */
   2924       }
   2925     break;    /* End escape sequence processing */
   2926 
   2927 
   2928     /* ---- Single-character special items ---- */
   2929 
   2930     case CHAR_CIRCUMFLEX_ACCENT:
   2931     *parsed_pattern++ = META_CIRCUMFLEX;
   2932     break;
   2933 
   2934     case CHAR_DOLLAR_SIGN:
   2935     *parsed_pattern++ = META_DOLLAR;
   2936     break;
   2937 
   2938     case CHAR_DOT:
   2939     *parsed_pattern++ = META_DOT;
   2940     okquantifier = TRUE;
   2941     break;
   2942 
   2943 
   2944     /* ---- Single-character quantifiers ---- */
   2945 
   2946     case CHAR_ASTERISK:
   2947     meta_quantifier = META_ASTERISK;
   2948     goto CHECK_QUANTIFIER;
   2949 
   2950     case CHAR_PLUS:
   2951     meta_quantifier = META_PLUS;
   2952     goto CHECK_QUANTIFIER;
   2953 
   2954     case CHAR_QUESTION_MARK:
   2955     meta_quantifier = META_QUERY;
   2956     goto CHECK_QUANTIFIER;
   2957 
   2958 
   2959     /* ---- Potential {n,m} quantifier ---- */
   2960 
   2961     case CHAR_LEFT_CURLY_BRACKET:
   2962     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
   2963         &errorcode))
   2964       {
   2965       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
   2966       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
   2967       break;                               /* No more quantifier processing */
   2968       }
   2969     meta_quantifier = META_MINMAX;
   2970     /* Fall through */
   2971 
   2972 
   2973     /* ---- Quantifier post-processing ---- */
   2974 
   2975     /* Check that a quantifier is allowed after the previous item. */
   2976 
   2977     CHECK_QUANTIFIER:
   2978     if (!prev_okquantifier)
   2979       {
   2980       errorcode = ERR9;
   2981       goto FAILED_BACK;
   2982       }
   2983 
   2984     /* Now we can put the quantifier into the parsed pattern vector. At this
   2985     stage, we have only the basic quantifier. The check for a following + or ?
   2986     modifier happens at the top of the loop, after any intervening comments
   2987     have been removed. */
   2988 
   2989     *parsed_pattern++ = meta_quantifier;
   2990     if (c == CHAR_LEFT_CURLY_BRACKET)
   2991       {
   2992       *parsed_pattern++ = min_repeat;
   2993       *parsed_pattern++ = max_repeat;
   2994       }
   2995     break;
   2996 
   2997 
   2998     /* ---- Character class ---- */
   2999 
   3000     case CHAR_LEFT_SQUARE_BRACKET:
   3001     okquantifier = TRUE;
   3002 
   3003     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
   3004     used for "start of word" and "end of word". As these are otherwise illegal
   3005     sequences, we don't break anything by recognizing them. They are replaced
   3006     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
   3007     erroneous and are handled by the normal code below. */
   3008 
   3009     if (ptrend - ptr >= 6 &&
   3010          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
   3011           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
   3012       {
   3013       *parsed_pattern++ = META_ESCAPE + ESC_b;
   3014 
   3015       if (ptr[2] == CHAR_LESS_THAN_SIGN)
   3016         {
   3017         *parsed_pattern++ = META_LOOKAHEAD;
   3018         }
   3019       else
   3020         {
   3021         *parsed_pattern++ = META_LOOKBEHIND;
   3022         *has_lookbehind = TRUE;
   3023 
   3024         /* The offset is used only for the "non-fixed length" error; this won't
   3025         occur here, so just store zero. */
   3026 
   3027         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
   3028         }
   3029 
   3030       if ((options & PCRE2_UCP) == 0)
   3031         *parsed_pattern++ = META_ESCAPE + ESC_w;
   3032       else
   3033         {
   3034         *parsed_pattern++ = META_ESCAPE + ESC_p;
   3035         *parsed_pattern++ = PT_WORD << 16;
   3036         }
   3037       *parsed_pattern++ = META_KET;
   3038       ptr += 6;
   3039       break;
   3040       }
   3041 
   3042     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
   3043     they are encountered at the top level, so we'll do that too. */
   3044 
   3045     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
   3046          *ptr == CHAR_EQUALS_SIGN) &&
   3047         check_posix_syntax(ptr, ptrend, &tempptr))
   3048       {
   3049       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
   3050       goto FAILED;
   3051       }
   3052 
   3053     /* Process a regular character class. If the first character is '^', set
   3054     the negation flag. If the first few characters (either before or after ^)
   3055     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
   3056     This makes for compatibility with Perl. */
   3057 
   3058     negate_class = FALSE;
   3059     while (ptr < ptrend)
   3060       {
   3061       GETCHARINCTEST(c, ptr);
   3062       if (c == CHAR_BACKSLASH)
   3063         {
   3064         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
   3065         else if (ptrend - ptr >= 3 &&
   3066              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
   3067           ptr += 3;
   3068         else
   3069           break;
   3070         }
   3071       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
   3072                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
   3073         continue;
   3074       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
   3075         negate_class = TRUE;
   3076       else break;
   3077       }
   3078 
   3079     /* Now the real contents of the class; c has the first "real" character.
   3080     Empty classes are permitted only if the option is set. */
   3081 
   3082     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
   3083         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
   3084       {
   3085       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
   3086       break;  /* End of class processing */
   3087       }
   3088 
   3089     /* Process a non-empty class. */
   3090 
   3091     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
   3092     class_range_state = RANGE_NO;
   3093 
   3094     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
   3095     because there are holes in the encoding, and simply using the range A-Z
   3096     (for example) would include the characters in the holes. This applies only
   3097     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
   3098     in this respect. In order to accommodate this, we keep track of whether
   3099     character values are literal or not, and a state variable for handling
   3100     ranges. */
   3101 
   3102     /* Loop for the contents of the class */
   3103 
   3104     for (;;)
   3105       {
   3106       BOOL char_is_literal = TRUE;
   3107 
   3108       /* Inside \Q...\E everything is literal except \E */
   3109 
   3110       if (inescq)
   3111         {
   3112         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
   3113           {
   3114           inescq = FALSE;                   /* Reset literal state */
   3115           ptr++;                            /* Skip the 'E' */
   3116           goto CLASS_CONTINUE;
   3117           }
   3118         goto CLASS_LITERAL;
   3119         }
   3120 
   3121       /* Skip over space and tab (only) in extended-more mode. */
   3122 
   3123       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
   3124           (c == CHAR_SPACE || c == CHAR_HT))
   3125         goto CLASS_CONTINUE;
   3126 
   3127       /* Handle POSIX class names. Perl allows a negation extension of the
   3128       form [:^name:]. A square bracket that doesn't match the syntax is
   3129       treated as a literal. We also recognize the POSIX constructions
   3130       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
   3131       5.6 and 5.8 do. */
   3132 
   3133       if (c == CHAR_LEFT_SQUARE_BRACKET &&
   3134           ptrend - ptr >= 3 &&
   3135           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
   3136            *ptr == CHAR_EQUALS_SIGN) &&
   3137           check_posix_syntax(ptr, ptrend, &tempptr))
   3138         {
   3139         BOOL posix_negate = FALSE;
   3140         int posix_class;
   3141 
   3142         /* Perl treats a hyphen before a POSIX class as a literal, not the
   3143         start of a range. However, it gives a warning in its warning mode. PCRE
   3144         does not have a warning mode, so we give an error, because this is
   3145         likely an error on the user's part. */
   3146 
   3147         if (class_range_state == RANGE_STARTED)
   3148           {
   3149           errorcode = ERR50;
   3150           goto FAILED;
   3151           }
   3152 
   3153         if (*ptr != CHAR_COLON)
   3154           {
   3155           errorcode = ERR13;
   3156           goto FAILED_BACK;
   3157           }
   3158 
   3159         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
   3160           {
   3161           posix_negate = TRUE;
   3162           ptr++;
   3163           }
   3164 
   3165         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
   3166         if (posix_class < 0)
   3167           {
   3168           errorcode = ERR30;
   3169           goto FAILED;
   3170           }
   3171         ptr = tempptr + 2;
   3172 
   3173         /* Perl treats a hyphen after a POSIX class as a literal, not the
   3174         start of a range. However, it gives a warning in its warning mode
   3175         unless the hyphen is the last character in the class. PCRE does not
   3176         have a warning mode, so we give an error, because this is likely an
   3177         error on the user's part. */
   3178 
   3179         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
   3180             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
   3181           {
   3182           errorcode = ERR50;
   3183           goto FAILED;
   3184           }
   3185 
   3186         /* Set "a hyphen is not the start of a range" for the -] case, and also
   3187         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
   3188         fuzzers do that kind of thing) and *then* a hyphen. This causes that
   3189         hyphen to be treated as a literal. I don't think it's worth setting up
   3190         special apparatus to do otherwise. */
   3191 
   3192         class_range_state = RANGE_NO;
   3193 
   3194         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
   3195         use Unicode properties \p or \P or, in one case, \h or \H. The
   3196         substitutes table has two values per class, containing the type and
   3197         value of a \p or \P item. The special cases are specified with a
   3198         negative type: a non-zero value causes \h or \H to be used, and a zero
   3199         value falls through to behave like a non-UCP POSIX class. */
   3200 
   3201 #ifdef SUPPORT_UNICODE
   3202         if ((options & PCRE2_UCP) != 0)
   3203           {
   3204           int ptype = posix_substitutes[2*posix_class];
   3205           int pvalue = posix_substitutes[2*posix_class + 1];
   3206           if (ptype >= 0)
   3207             {
   3208             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
   3209             *parsed_pattern++ = (ptype << 16) | pvalue;
   3210             goto CLASS_CONTINUE;
   3211             }
   3212 
   3213           if (pvalue != 0)
   3214             {
   3215             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
   3216             goto CLASS_CONTINUE;
   3217             }
   3218 
   3219           /* Fall through */
   3220           }
   3221 #endif  /* SUPPORT_UNICODE */
   3222 
   3223         /* Non-UCP POSIX class */
   3224 
   3225         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
   3226         *parsed_pattern++ = posix_class;
   3227         }
   3228 
   3229       /* Handle potential start of range */
   3230 
   3231       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
   3232         {
   3233         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
   3234           META_RANGE_LITERAL : META_RANGE_ESCAPED;
   3235         class_range_state = RANGE_STARTED;
   3236         }
   3237 
   3238       /* Handle a literal character */
   3239 
   3240       else if (c != CHAR_BACKSLASH)
   3241         {
   3242         CLASS_LITERAL:
   3243         if (class_range_state == RANGE_STARTED)
   3244           {
   3245           if (c == parsed_pattern[-2])       /* Optimize one-char range */
   3246             parsed_pattern--;
   3247           else if (parsed_pattern[-2] > c)   /* Check range is in order */
   3248             {
   3249             errorcode = ERR8;
   3250             goto FAILED_BACK;
   3251             }
   3252           else
   3253             {
   3254             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
   3255               parsed_pattern[-1] = META_RANGE_ESCAPED;
   3256             PARSED_LITERAL(c, parsed_pattern);
   3257             }
   3258           class_range_state = RANGE_NO;
   3259           }
   3260         else  /* Potential start of range */
   3261           {
   3262           class_range_state = char_is_literal?
   3263             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
   3264           PARSED_LITERAL(c, parsed_pattern);
   3265           }
   3266         }
   3267 
   3268       /* Handle escapes in a class */
   3269 
   3270       else
   3271         {
   3272         tempptr = ptr;
   3273         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode,
   3274           options, TRUE, cb);
   3275         if (errorcode != 0)
   3276           {
   3277           CLASS_ESCAPE_FAILED:
   3278           if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
   3279             goto FAILED;
   3280           ptr = tempptr;
   3281           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
   3282             {
   3283             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
   3284             }
   3285           escape = 0;                 /* Treat as literal character */
   3286           }
   3287 
   3288         if (escape == 0)  /* Escaped character code point is in c */
   3289           {
   3290           char_is_literal = FALSE;
   3291           goto CLASS_LITERAL;
   3292           }
   3293 
   3294         /* These three escapes do not alter the class range state. */
   3295 
   3296         if (escape == ESC_b)
   3297           {
   3298           c = CHAR_BS;   /* \b is backspace in a class */
   3299           char_is_literal = FALSE;
   3300           goto CLASS_LITERAL;
   3301           }
   3302 
   3303         else if (escape == ESC_Q)
   3304           {
   3305           inescq = TRUE;  /* Enter literal mode */
   3306           goto CLASS_CONTINUE;
   3307           }
   3308 
   3309         else if (escape == ESC_E)  /* Ignore orphan \E */
   3310           goto CLASS_CONTINUE;
   3311 
   3312         /* The second part of a range can be a single-character escape
   3313         sequence (detected above), but not any of the other escapes. Perl
   3314         treats a hyphen as a literal in such circumstances. However, in Perl's
   3315         warning mode, a warning is given, so PCRE now faults it, as it is
   3316         almost certainly a mistake on the user's part. */
   3317 
   3318         if (class_range_state == RANGE_STARTED)
   3319           {
   3320           errorcode = ERR50;
   3321           goto CLASS_ESCAPE_FAILED;
   3322           }
   3323 
   3324         /* Of the remaining escapes, only those that define characters are
   3325         allowed in a class. None may start a range. */
   3326 
   3327         class_range_state = RANGE_NO;
   3328         switch(escape)
   3329           {
   3330           case ESC_N:
   3331           errorcode = ERR71;  /* Not supported in a class */
   3332           goto CLASS_ESCAPE_FAILED;
   3333 
   3334           case ESC_H:
   3335           case ESC_h:
   3336           case ESC_V:
   3337           case ESC_v:
   3338           *parsed_pattern++ = META_ESCAPE + escape;
   3339           break;
   3340 
   3341           /* These escapes are converted to Unicode property tests when
   3342           PCRE2_UCP is set. */
   3343 
   3344           case ESC_d:
   3345           case ESC_D:
   3346           case ESC_s:
   3347           case ESC_S:
   3348           case ESC_w:
   3349           case ESC_W:
   3350           if ((options & PCRE2_UCP) == 0)
   3351             {
   3352             *parsed_pattern++ = META_ESCAPE + escape;
   3353             }
   3354           else
   3355             {
   3356             *parsed_pattern++ = META_ESCAPE +
   3357               ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
   3358                 ESC_p : ESC_P);
   3359             switch(escape)
   3360               {
   3361               case ESC_d:
   3362               case ESC_D:
   3363               *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
   3364               break;
   3365 
   3366               case ESC_s:
   3367               case ESC_S:
   3368               *parsed_pattern++ = PT_SPACE << 16;
   3369               break;
   3370 
   3371               case ESC_w:
   3372               case ESC_W:
   3373               *parsed_pattern++ = PT_WORD << 16;
   3374               break;
   3375               }
   3376             }
   3377           break;
   3378 
   3379           /* Explicit Unicode property matching */
   3380 
   3381           case ESC_P:
   3382           case ESC_p:
   3383 #ifdef SUPPORT_UNICODE
   3384             {
   3385             BOOL negated;
   3386             uint16_t ptype = 0, pdata = 0;
   3387             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
   3388               goto FAILED;
   3389             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
   3390             *parsed_pattern++ = META_ESCAPE + escape;
   3391             *parsed_pattern++ = (ptype << 16) | pdata;
   3392             }
   3393 #else
   3394           errorcode = ERR45;
   3395           goto CLASS_ESCAPE_FAILED;
   3396 #endif
   3397           break;  /* End \P and \p */
   3398 
   3399           default:    /* All others are not allowed in a class */
   3400           errorcode = ERR7;
   3401           ptr--;
   3402           goto CLASS_ESCAPE_FAILED;
   3403           }
   3404 
   3405         /* Perl gives a warning unless a following hyphen is the last character
   3406         in the class. PCRE throws an error. */
   3407 
   3408         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
   3409             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
   3410           {
   3411           errorcode = ERR50;
   3412           goto FAILED;
   3413           }
   3414         }
   3415 
   3416       /* Proceed to next thing in the class. */
   3417 
   3418       CLASS_CONTINUE:
   3419       if (ptr >= ptrend)
   3420         {
   3421         errorcode = ERR6;  /* Missing terminating ']' */
   3422         goto FAILED;
   3423         }
   3424       GETCHARINCTEST(c, ptr);
   3425       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
   3426       }     /* End of class-processing loop */
   3427 
   3428     if (class_range_state == RANGE_STARTED)
   3429       {
   3430       parsed_pattern[-1] = CHAR_MINUS;
   3431       class_range_state = RANGE_NO;
   3432       }
   3433 
   3434     *parsed_pattern++ = META_CLASS_END;
   3435     break;  /* End of character class */
   3436 
   3437 
   3438     /* ---- Opening parenthesis ---- */
   3439 
   3440     case CHAR_LEFT_PARENTHESIS:
   3441     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
   3442 
   3443     /* If ( is not followed by ? it is either a capture or a special verb. */
   3444 
   3445     if (*ptr != CHAR_QUESTION_MARK)
   3446       {
   3447       const char *vn;
   3448 
   3449       /* Handle capturing brackets (or non-capturing if auto-capture is turned
   3450       off). */
   3451 
   3452       if (*ptr != CHAR_ASTERISK)
   3453         {
   3454         nest_depth++;
   3455         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
   3456           {
   3457           cb->bracount++;
   3458           *parsed_pattern++ = META_CAPTURE | cb->bracount;
   3459           }
   3460         else *parsed_pattern++ = META_NOCAPTURE;
   3461         }
   3462 
   3463 
   3464       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
   3465 
   3466       /* Do nothing for (*) so it gives a "bad quantifier" error rather than
   3467       "(*MARK) must have an argument". */
   3468 
   3469       else if (ptrend - ptr > 1 && ptr[1] != CHAR_RIGHT_PARENTHESIS)
   3470         {
   3471         vn = verbnames;
   3472         if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode,
   3473           cb)) goto FAILED;
   3474         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
   3475                               *ptr != CHAR_RIGHT_PARENTHESIS))
   3476           {
   3477           errorcode = ERR60;  /* Malformed */
   3478           goto FAILED;
   3479           }
   3480 
   3481         /* Scan the table of verb names */
   3482 
   3483         for (i = 0; i < verbcount; i++)
   3484           {
   3485           if (namelen == verbs[i].len &&
   3486               PRIV(strncmp_c8)(name, vn, namelen) == 0)
   3487             break;
   3488           vn += verbs[i].len + 1;
   3489           }
   3490 
   3491         if (i >= verbcount)
   3492           {
   3493           errorcode = ERR60;  /* Verb not recognized */
   3494           goto FAILED;
   3495           }
   3496 
   3497         /* An empty argument is treated as no argument. */
   3498 
   3499         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
   3500              ptr[1] == CHAR_RIGHT_PARENTHESIS)
   3501           ptr++;    /* Advance to the closing parens */
   3502 
   3503         /* Check for mandatory non-empty argument; this is (*MARK) */
   3504 
   3505         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
   3506           {
   3507           errorcode = ERR66;
   3508           goto FAILED;
   3509           }
   3510 
   3511         /* It appears that Perl allows any characters whatsoever, other than a
   3512         closing parenthesis, to appear in arguments ("names"), so we no longer
   3513         insist on letters, digits, and underscores. Perl does not, however, do
   3514         any interpretation within arguments, and has no means of including a
   3515         closing parenthesis. PCRE supports escape processing but only when it
   3516         is requested by an option. We set inverbname TRUE here, and let the
   3517         main loop take care of this so that escape and \x processing is done by
   3518         the main code above. */
   3519 
   3520         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
   3521           {
   3522           /* Some optional arguments can be treated as a preceding (*MARK) */
   3523 
   3524           if (verbs[i].has_arg < 0)
   3525             {
   3526             add_after_mark = verbs[i].meta;
   3527             *parsed_pattern++ = META_MARK;
   3528             }
   3529 
   3530           /* The remaining verbs with arguments (except *MARK) need a different
   3531           opcode. */
   3532 
   3533           else
   3534             {
   3535             *parsed_pattern++ = verbs[i].meta +
   3536               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
   3537             }
   3538 
   3539           /* Set up for reading the name in the main loop. */
   3540 
   3541           verblengthptr = parsed_pattern++;
   3542           verbnamestart = ptr;
   3543           inverbname = TRUE;
   3544           }
   3545         else  /* No verb "name" argument */
   3546           {
   3547           *parsed_pattern++ = verbs[i].meta;
   3548           }
   3549         }     /* End of (*VERB) handling */
   3550       break;  /* Done with this parenthesis */
   3551       }       /* End of groups that don't start with (? */
   3552 
   3553 
   3554     /* ---- Items starting (? ---- */
   3555 
   3556     /* The type of item is determined by what follows (?. Handle (?| and option
   3557     changes under "default" because both need a new block on the nest stack.
   3558     Comments starting with (?# are handled above. Note that there is some
   3559     ambiguity about the sequence (?- because if a digit follows it's a relative
   3560     recursion or subroutine call whereas otherwise it's an option unsetting. */
   3561 
   3562     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
   3563 
   3564     switch(*ptr)
   3565       {
   3566       default:
   3567       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
   3568         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
   3569 
   3570       /* We now have either (?| or a (possibly empty) option setting,
   3571       optionally followed by a non-capturing group. */
   3572 
   3573       nest_depth++;
   3574       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
   3575       else if (++top_nest >= end_nests)
   3576         {
   3577         errorcode = ERR84;
   3578         goto FAILED;
   3579         }
   3580       top_nest->nest_depth = nest_depth;
   3581       top_nest->flags = 0;
   3582       top_nest->options = options & PARSE_TRACKED_OPTIONS;
   3583 
   3584       /* Start of non-capturing group that resets the capture count for each
   3585       branch. */
   3586 
   3587       if (*ptr == CHAR_VERTICAL_LINE)
   3588         {
   3589         top_nest->reset_group = (uint16_t)cb->bracount;
   3590         top_nest->max_group = (uint16_t)cb->bracount;
   3591         top_nest->flags |= NSF_RESET;
   3592         cb->external_flags |= PCRE2_DUPCAPUSED;
   3593         *parsed_pattern++ = META_NOCAPTURE;
   3594         ptr++;
   3595         }
   3596 
   3597       /* Scan for options imnsxJU to be set or unset. */
   3598 
   3599       else
   3600         {
   3601         BOOL hyphenok = TRUE;
   3602         uint32_t oldoptions = options;
   3603 
   3604         top_nest->reset_group = 0;
   3605         top_nest->max_group = 0;
   3606         set = unset = 0;
   3607         optset = &set;
   3608 
   3609         /* ^ at the start unsets imnsx and disables the subsequent use of - */
   3610 
   3611         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
   3612           {
   3613           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
   3614                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
   3615           hyphenok = FALSE;
   3616           ptr++;
   3617           }
   3618 
   3619         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
   3620                                *ptr != CHAR_COLON)
   3621           {
   3622           switch (*ptr++)
   3623             {
   3624             case CHAR_MINUS:
   3625             if (!hyphenok)
   3626               {
   3627               errorcode = ERR94;
   3628               ptr--;  /* Correct the offset */
   3629               goto FAILED;
   3630               }
   3631             optset = &unset;
   3632             hyphenok = FALSE;
   3633             break;
   3634 
   3635             case CHAR_J:  /* Record that it changed in the external options */
   3636             *optset |= PCRE2_DUPNAMES;
   3637             cb->external_flags |= PCRE2_JCHANGED;
   3638             break;
   3639 
   3640             case CHAR_i: *optset |= PCRE2_CASELESS; break;
   3641             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
   3642             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
   3643             case CHAR_s: *optset |= PCRE2_DOTALL; break;
   3644             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
   3645 
   3646             /* If x appears twice it sets the extended extended option. */
   3647 
   3648             case CHAR_x:
   3649             *optset |= PCRE2_EXTENDED;
   3650             if (ptr < ptrend && *ptr == CHAR_x)
   3651               {
   3652               *optset |= PCRE2_EXTENDED_MORE;
   3653               ptr++;
   3654               }
   3655             break;
   3656 
   3657             default:
   3658             errorcode = ERR11;
   3659             ptr--;    /* Correct the offset */
   3660             goto FAILED;
   3661             }
   3662           }
   3663 
   3664         /* If we are setting extended without extended-more, ensure that any
   3665         existing extended-more gets unset. Also, unsetting extended must also
   3666         unset extended-more. */
   3667 
   3668         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
   3669             (unset & PCRE2_EXTENDED) != 0)
   3670           unset |= PCRE2_EXTENDED_MORE;
   3671 
   3672         options = (options | set) & (~unset);
   3673 
   3674         /* If the options ended with ')' this is not the start of a nested
   3675         group with option changes, so the options change at this level.
   3676         In this case, if the previous level set up a nest block, discard the
   3677         one we have just created. Otherwise adjust it for the previous level.
   3678         If the options ended with ':' we are starting a non-capturing group,
   3679         possibly with an options setting. */
   3680 
   3681         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
   3682         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
   3683           {
   3684           nest_depth--;  /* This is not a nested group after all. */
   3685           if (top_nest > (nest_save *)(cb->start_workspace) &&
   3686               (top_nest-1)->nest_depth == nest_depth) top_nest--;
   3687           else top_nest->nest_depth = nest_depth;
   3688           }
   3689         else *parsed_pattern++ = META_NOCAPTURE;
   3690 
   3691         /* If nothing changed, no need to record. */
   3692 
   3693         if (options != oldoptions)
   3694           {
   3695           *parsed_pattern++ = META_OPTIONS;
   3696           *parsed_pattern++ = options;
   3697           }
   3698         }     /* End options processing */
   3699       break;  /* End default case after (? */
   3700 
   3701 
   3702       /* ---- Python syntax support ---- */
   3703 
   3704       case CHAR_P:
   3705       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
   3706 
   3707       /* (?P<name> is the same as (?<name>, which defines a named group. */
   3708 
   3709       if (*ptr == CHAR_LESS_THAN_SIGN)
   3710         {
   3711         terminator = CHAR_GREATER_THAN_SIGN;
   3712         goto DEFINE_NAME;
   3713         }
   3714 
   3715       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
   3716       call. */
   3717 
   3718       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
   3719 
   3720       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
   3721       else after (?P is an error. */
   3722 
   3723       if (*ptr != CHAR_EQUALS_SIGN)
   3724         {
   3725         errorcode = ERR41;
   3726         goto FAILED;
   3727         }
   3728       if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
   3729           &namelen, &errorcode, cb)) goto FAILED;
   3730       *parsed_pattern++ = META_BACKREF_BYNAME;
   3731       *parsed_pattern++ = namelen;
   3732       PUTOFFSET(offset, parsed_pattern);
   3733       okquantifier = TRUE;
   3734       break;   /* End of (?P processing */
   3735 
   3736 
   3737       /* ---- Recursion/subroutine calls by number ---- */
   3738 
   3739       case CHAR_R:
   3740       i = 0;         /* (?R) == (?R0) */
   3741       ptr++;
   3742       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
   3743         {
   3744         errorcode = ERR58;
   3745         goto FAILED;
   3746         }
   3747       goto SET_RECURSION;
   3748 
   3749       /* An item starting (?- followed by a digit comes here via the "default"
   3750       case because (?- followed by a non-digit is an options setting. */
   3751 
   3752       case CHAR_PLUS:
   3753       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
   3754         {
   3755         errorcode = ERR29;   /* Missing number */
   3756         goto FAILED;
   3757         }
   3758       /* Fall through */
   3759 
   3760       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
   3761       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
   3762       RECURSION_BYNUMBER:
   3763       if (!read_number(&ptr, ptrend,
   3764           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
   3765           MAX_GROUP_NUMBER, ERR61,
   3766           &i, &errorcode)) goto FAILED;
   3767       if (i < 0)  /* NB (?0) is permitted */
   3768         {
   3769         errorcode = ERR15;   /* Unknown group */
   3770         goto FAILED_BACK;
   3771         }
   3772       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
   3773         goto UNCLOSED_PARENTHESIS;
   3774 
   3775       SET_RECURSION:
   3776       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
   3777       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
   3778       ptr++;
   3779       PUTOFFSET(offset, parsed_pattern);
   3780       okquantifier = TRUE;
   3781       break;  /* End of recursive call by number handling */
   3782 
   3783 
   3784       /* ---- Recursion/subroutine calls by name ---- */
   3785 
   3786       case CHAR_AMPERSAND:
   3787       RECURSE_BY_NAME:
   3788       if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
   3789           &namelen, &errorcode, cb)) goto FAILED;
   3790       *parsed_pattern++ = META_RECURSE_BYNAME;
   3791       *parsed_pattern++ = namelen;
   3792       PUTOFFSET(offset, parsed_pattern);
   3793       okquantifier = TRUE;
   3794       break;
   3795 
   3796       /* ---- Callout with numerical or string argument ---- */
   3797 
   3798       case CHAR_C:
   3799       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
   3800 
   3801       /* If the previous item was a condition starting (?(? an assertion,
   3802       optionally preceded by a callout, is expected. This is checked later on,
   3803       during actual compilation. However we need to identify this kind of
   3804       assertion in this pass because it must not be qualified. The value of
   3805       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
   3806       for a callout - still leaving a positive value that identifies the
   3807       assertion. Multiple callouts or any other items will make it zero or
   3808       less, which doesn't matter because they will cause an error later. */
   3809 
   3810       expect_cond_assert = prev_expect_cond_assert - 1;
   3811 
   3812       /* If previous_callout is not NULL, it means this follows a previous
   3813       callout. If it was a manual callout, do nothing; this means its "length
   3814       of next pattern item" field will remain zero. If it was an automatic
   3815       callout, abolish it. */
   3816 
   3817       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
   3818           previous_callout == parsed_pattern - 4 &&
   3819           parsed_pattern[-1] == 255)
   3820         parsed_pattern = previous_callout;
   3821 
   3822       /* Save for updating next pattern item length, and skip one item before
   3823       completing. */
   3824 
   3825       previous_callout = parsed_pattern;
   3826       after_manual_callout = 1;
   3827 
   3828       /* Handle a string argument; specific delimiter is required. */
   3829 
   3830       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
   3831         {
   3832         PCRE2_SIZE calloutlength;
   3833         PCRE2_SPTR startptr = ptr;
   3834 
   3835         delimiter = 0;
   3836         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
   3837           {
   3838           if (*ptr == PRIV(callout_start_delims)[i])
   3839             {
   3840             delimiter = PRIV(callout_end_delims)[i];
   3841             break;
   3842             }
   3843           }
   3844         if (delimiter == 0)
   3845           {
   3846           errorcode = ERR82;
   3847           goto FAILED;
   3848           }
   3849 
   3850         *parsed_pattern = META_CALLOUT_STRING;
   3851         parsed_pattern += 3;   /* Skip pattern info */
   3852 
   3853         for (;;)
   3854           {
   3855           if (++ptr >= ptrend)
   3856             {
   3857             errorcode = ERR81;
   3858             ptr = startptr;   /* To give a more useful message */
   3859             goto FAILED;
   3860             }
   3861           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
   3862             break;
   3863           }
   3864 
   3865         calloutlength = (PCRE2_SIZE)(ptr - startptr);
   3866         if (calloutlength > UINT32_MAX)
   3867           {
   3868           errorcode = ERR72;
   3869           goto FAILED;
   3870           }
   3871         *parsed_pattern++ = (uint32_t)calloutlength;
   3872         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
   3873         PUTOFFSET(offset, parsed_pattern);
   3874         }
   3875 
   3876       /* Handle a callout with an optional numerical argument, which must be
   3877       less than or equal to 255. A missing argument gives 0. */
   3878 
   3879       else
   3880         {
   3881         int n = 0;
   3882         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
   3883         parsed_pattern += 3;                       /* Skip pattern info */
   3884         while (ptr < ptrend && IS_DIGIT(*ptr))
   3885           {
   3886           n = n * 10 + *ptr++ - CHAR_0;
   3887           if (n > 255)
   3888             {
   3889             errorcode = ERR38;
   3890             goto FAILED;
   3891             }
   3892           }
   3893         *parsed_pattern++ = n;
   3894         }
   3895 
   3896       /* Both formats must have a closing parenthesis */
   3897 
   3898       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
   3899         {
   3900         errorcode = ERR39;
   3901         goto FAILED;
   3902         }
   3903       ptr++;
   3904 
   3905       /* Remember the offset to the next item in the pattern, and set a default
   3906       length. This should get updated after the next item is read. */
   3907 
   3908       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
   3909       previous_callout[2] = 0;
   3910       break;                  /* End callout */
   3911 
   3912 
   3913       /* ---- Conditional group ---- */
   3914 
   3915       /* A condition can be an assertion, a number (referring to a numbered
   3916       group's having been set), a name (referring to a named group), or 'R',
   3917       referring to overall recursion. R<digits> and R&name are also permitted
   3918       for recursion state tests. Numbers may be preceded by + or - to specify a
   3919       relative group number.
   3920 
   3921       There are several syntaxes for testing a named group: (?(name)) is used
   3922       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
   3923 
   3924       There are two unfortunate ambiguities. 'R' can be the recursive thing or
   3925       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
   3926       the Perl DEFINE feature or the Python named test. We look for a name
   3927       first; if not found, we try the other case.
   3928 
   3929       For compatibility with auto-callouts, we allow a callout to be specified
   3930       before a condition that is an assertion. */
   3931 
   3932       case CHAR_LEFT_PARENTHESIS:
   3933       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
   3934       nest_depth++;
   3935 
   3936       /* If the next character is ? there must be an assertion next (optionally
   3937       preceded by a callout). We do not check this here, but instead we set
   3938       expect_cond_assert to 2. If this is still greater than zero (callouts
   3939       decrement it) when the next assertion is read, it will be marked as a
   3940       condition that must not be repeated. A value greater than zero also
   3941       causes checking that an assertion (possibly with callout) follows. */
   3942 
   3943       if (*ptr == CHAR_QUESTION_MARK)
   3944         {
   3945         *parsed_pattern++ = META_COND_ASSERT;
   3946         ptr--;   /* Pull pointer back to the opening parenthesis. */
   3947         expect_cond_assert = 2;
   3948         break;  /* End of conditional */
   3949         }
   3950 
   3951       /* Handle (?([+-]number)... */
   3952 
   3953       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
   3954           &errorcode))
   3955         {
   3956         if (i <= 0)
   3957           {
   3958           errorcode = ERR15;
   3959           goto FAILED;
   3960           }
   3961         *parsed_pattern++ = META_COND_NUMBER;
   3962         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
   3963         PUTOFFSET(offset, parsed_pattern);
   3964         *parsed_pattern++ = i;
   3965         }
   3966       else if (errorcode != 0) goto FAILED;   /* Number too big */
   3967 
   3968       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
   3969 
   3970       else if (ptrend - ptr >= 10 &&
   3971                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
   3972                ptr[7] != CHAR_RIGHT_PARENTHESIS)
   3973         {
   3974         uint32_t ge = 0;
   3975         int major = 0;
   3976         int minor = 0;
   3977 
   3978         ptr += 7;
   3979         if (*ptr == CHAR_GREATER_THAN_SIGN)
   3980           {
   3981           ge = 1;
   3982           ptr++;
   3983           }
   3984 
   3985         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
   3986         references its argument twice. */
   3987 
   3988         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
   3989           goto BAD_VERSION_CONDITION;
   3990 
   3991         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
   3992           goto FAILED;
   3993 
   3994         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
   3995         if (*ptr == CHAR_DOT)
   3996           {
   3997           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
   3998           minor = (*ptr++ - CHAR_0) * 10;
   3999           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
   4000           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
   4001             goto BAD_VERSION_CONDITION;
   4002           }
   4003 
   4004         *parsed_pattern++ = META_COND_VERSION;
   4005         *parsed_pattern++ = ge;
   4006         *parsed_pattern++ = major;
   4007         *parsed_pattern++ = minor;
   4008         }
   4009 
   4010       /* All the remaining cases now require us to read a name. We cannot at
   4011       this stage distinguish ambiguous cases such as (?(R12) which might be a
   4012       recursion test by number or a name, because the named groups have not yet
   4013       all been identified. Those cases are treated as names, but given a
   4014       different META code. */
   4015 
   4016       else
   4017         {
   4018         BOOL was_r_ampersand = FALSE;
   4019 
   4020         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
   4021           {
   4022           terminator = CHAR_RIGHT_PARENTHESIS;
   4023           was_r_ampersand = TRUE;
   4024           ptr++;
   4025           }
   4026         else if (*ptr == CHAR_LESS_THAN_SIGN)
   4027           terminator = CHAR_GREATER_THAN_SIGN;
   4028         else if (*ptr == CHAR_APOSTROPHE)
   4029           terminator = CHAR_APOSTROPHE;
   4030         else
   4031           {
   4032           terminator = CHAR_RIGHT_PARENTHESIS;
   4033           ptr--;   /* Point to char before name */
   4034           }
   4035         if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
   4036             &errorcode, cb)) goto FAILED;
   4037 
   4038         /* Handle (?(R&name) */
   4039 
   4040         if (was_r_ampersand)
   4041           {
   4042           *parsed_pattern = META_COND_RNAME;
   4043           ptr--;   /* Back to closing parens */
   4044           }
   4045 
   4046         /* Handle (?(name). If the name is "DEFINE" we identify it with a
   4047         special code. Likewise if the name consists of R followed only by
   4048         digits. Otherwise, handle it like a quoted name. */
   4049 
   4050         else if (terminator == CHAR_RIGHT_PARENTHESIS)
   4051           {
   4052           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
   4053             *parsed_pattern = META_COND_DEFINE;
   4054           else
   4055             {
   4056             for (i = 1; i < (int)namelen; i++)
   4057               if (!IS_DIGIT(name[i])) break;
   4058             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
   4059               META_COND_RNUMBER : META_COND_NAME;
   4060             }
   4061           ptr--;   /* Back to closing parens */
   4062           }
   4063 
   4064         /* Handle (?('name') or (?(<name>) */
   4065 
   4066         else *parsed_pattern = META_COND_NAME;
   4067 
   4068         /* All these cases except DEFINE end with the name length and offset;
   4069         DEFINE just has an offset (for the "too many branches" error). */
   4070 
   4071         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
   4072         PUTOFFSET(offset, parsed_pattern);
   4073         }  /* End cases that read a name */
   4074 
   4075       /* Check the closing parenthesis of the condition */
   4076 
   4077       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
   4078         {
   4079         errorcode = ERR24;
   4080         goto FAILED;
   4081         }
   4082       ptr++;
   4083       break;  /* End of condition processing */
   4084 
   4085 
   4086       /* ---- Atomic group ---- */
   4087 
   4088       case CHAR_GREATER_THAN_SIGN:
   4089       *parsed_pattern++ = META_ATOMIC;
   4090       nest_depth++;
   4091       ptr++;
   4092       break;
   4093 
   4094 
   4095       /* ---- Lookahead assertions ---- */
   4096 
   4097       case CHAR_EQUALS_SIGN:
   4098       *parsed_pattern++ = META_LOOKAHEAD;
   4099       ptr++;
   4100       goto POST_ASSERTION;
   4101 
   4102       case CHAR_EXCLAMATION_MARK:
   4103       *parsed_pattern++ = META_LOOKAHEADNOT;
   4104       ptr++;
   4105       goto POST_ASSERTION;
   4106 
   4107 
   4108       /* ---- Lookbehind assertions ---- */
   4109 
   4110       /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
   4111       start of the name of a capturing group. */
   4112 
   4113       case CHAR_LESS_THAN_SIGN:
   4114       if (ptrend - ptr <= 1 ||
   4115          (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
   4116         {
   4117         terminator = CHAR_GREATER_THAN_SIGN;
   4118         goto DEFINE_NAME;
   4119         }
   4120       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
   4121         META_LOOKBEHIND : META_LOOKBEHINDNOT;
   4122       *has_lookbehind = TRUE;
   4123       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
   4124       PUTOFFSET(offset, parsed_pattern);
   4125       ptr += 2;
   4126       /* Fall through */
   4127 
   4128       /* If the previous item was a condition starting (?(? an assertion,
   4129       optionally preceded by a callout, is expected. This is checked later on,
   4130       during actual compilation. However we need to identify this kind of
   4131       assertion in this pass because it must not be qualified. The value of
   4132       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
   4133       for a callout - still leaving a positive value that identifies the
   4134       assertion. Multiple callouts or any other items will make it zero or
   4135       less, which doesn't matter because they will cause an error later. */
   4136 
   4137       POST_ASSERTION:
   4138       nest_depth++;
   4139       if (prev_expect_cond_assert > 0)
   4140         {
   4141         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
   4142         else if (++top_nest >= end_nests)
   4143           {
   4144           errorcode = ERR84;
   4145           goto FAILED;
   4146           }
   4147         top_nest->nest_depth = nest_depth;
   4148         top_nest->flags = NSF_CONDASSERT;
   4149         top_nest->options = options & PARSE_TRACKED_OPTIONS;
   4150         }
   4151       break;
   4152 
   4153 
   4154       /* ---- Define a named group ---- */
   4155 
   4156       /* A named group may be defined as (?'name') or (?<name>). In the latter
   4157       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
   4158       terminator set to '>'. */
   4159 
   4160       case CHAR_APOSTROPHE:
   4161       terminator = CHAR_APOSTROPHE;    /* Terminator */
   4162 
   4163       DEFINE_NAME:
   4164       if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
   4165           &errorcode, cb)) goto FAILED;
   4166 
   4167       /* We have a name for this capturing group. It is also assigned a number,
   4168       which is its primary means of identification. */
   4169 
   4170       cb->bracount++;
   4171       *parsed_pattern++ = META_CAPTURE | cb->bracount;
   4172       nest_depth++;
   4173 
   4174       /* Check not too many names */
   4175 
   4176       if (cb->names_found >= MAX_NAME_COUNT)
   4177         {
   4178         errorcode = ERR49;
   4179         goto FAILED;
   4180         }
   4181 
   4182       /* Adjust the entry size to accommodate the longest name found. */
   4183 
   4184       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
   4185         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
   4186 
   4187       /* Scan the list to check for duplicates. For duplicate names, if the
   4188       number is the same, break the loop, which causes the name to be
   4189       discarded; otherwise, if DUPNAMES is not set, give an error.
   4190       If it is set, allow the name with a different number, but continue
   4191       scanning in case this is a duplicate with the same number. For
   4192       non-duplicate names, give an error if the number is duplicated. */
   4193 
   4194       isdupname = FALSE;
   4195       ng = cb->named_groups;
   4196       for (i = 0; i < cb->names_found; i++, ng++)
   4197         {
   4198         if (namelen == ng->length &&
   4199             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
   4200           {
   4201           if (ng->number == cb->bracount) break;
   4202           if ((options & PCRE2_DUPNAMES) == 0)
   4203             {
   4204             errorcode = ERR43;
   4205             goto FAILED;
   4206             }
   4207           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
   4208           cb->dupnames = TRUE;              /* Duplicate names exist */
   4209           }
   4210         else if (ng->number == cb->bracount)
   4211           {
   4212           errorcode = ERR65;
   4213           goto FAILED;
   4214           }
   4215         }
   4216 
   4217       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
   4218 
   4219       /* Increase the list size if necessary */
   4220 
   4221       if (cb->names_found >= cb->named_group_list_size)
   4222         {
   4223         uint32_t newsize = cb->named_group_list_size * 2;
   4224         named_group *newspace =
   4225           cb->cx->memctl.malloc(newsize * sizeof(named_group),
   4226           cb->cx->memctl.memory_data);
   4227         if (newspace == NULL)
   4228           {
   4229           errorcode = ERR21;
   4230           goto FAILED;
   4231           }
   4232 
   4233         memcpy(newspace, cb->named_groups,
   4234           cb->named_group_list_size * sizeof(named_group));
   4235         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
   4236           cb->cx->memctl.free((void *)cb->named_groups,
   4237           cb->cx->memctl.memory_data);
   4238         cb->named_groups = newspace;
   4239         cb->named_group_list_size = newsize;
   4240         }
   4241 
   4242       /* Add this name to the list */
   4243 
   4244       cb->named_groups[cb->names_found].name = name;
   4245       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
   4246       cb->named_groups[cb->names_found].number = cb->bracount;
   4247       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
   4248       cb->names_found++;
   4249       break;
   4250       }        /* End of (? switch */
   4251     break;     /* End of ( handling */
   4252 
   4253 
   4254     /* ---- Branch terminators ---- */
   4255 
   4256     /* Alternation: reset the capture count if we are in a (?| group. */
   4257 
   4258     case CHAR_VERTICAL_LINE:
   4259     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
   4260         (top_nest->flags & NSF_RESET) != 0)
   4261       {
   4262       if (cb->bracount > top_nest->max_group)
   4263         top_nest->max_group = (uint16_t)cb->bracount;
   4264       cb->bracount = top_nest->reset_group;
   4265       }
   4266     *parsed_pattern++ = META_ALT;
   4267     break;
   4268 
   4269     /* End of group; reset the capture count to the maximum if we are in a (?|
   4270     group and/or reset the options that are tracked during parsing. Disallow
   4271     quantifier for a condition that is an assertion. */
   4272 
   4273     case CHAR_RIGHT_PARENTHESIS:
   4274     okquantifier = TRUE;
   4275     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
   4276       {
   4277       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
   4278       if ((top_nest->flags & NSF_RESET) != 0 &&
   4279           top_nest->max_group > cb->bracount)
   4280         cb->bracount = top_nest->max_group;
   4281       if ((top_nest->flags & NSF_CONDASSERT) != 0)
   4282         okquantifier = FALSE;
   4283       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
   4284         else top_nest--;
   4285       }
   4286     if (nest_depth == 0)    /* Unmatched closing parenthesis */
   4287       {
   4288       errorcode = ERR22;
   4289       goto FAILED_BACK;
   4290       }
   4291     nest_depth--;
   4292     *parsed_pattern++ = META_KET;
   4293     break;
   4294     }  /* End of switch on pattern character */
   4295   }    /* End of main character scan loop */
   4296 
   4297 /* End of pattern reached. Check for missing ) at the end of a verb name. */
   4298 
   4299 if (inverbname && ptr >= ptrend)
   4300   {
   4301   errorcode = ERR60;
   4302   goto FAILED;
   4303   }
   4304 
   4305 /* Manage callout for the final item */
   4306 
   4307 PARSED_END:
   4308 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
   4309   parsed_pattern, cb);
   4310 
   4311 /* Insert trailing items for word and line matching (features provided for the
   4312 benefit of pcre2grep). */
   4313 
   4314 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
   4315   {
   4316   *parsed_pattern++ = META_KET;
   4317   *parsed_pattern++ = META_DOLLAR;
   4318   }
   4319 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
   4320   {
   4321   *parsed_pattern++ = META_KET;
   4322   *parsed_pattern++ = META_ESCAPE + ESC_b;
   4323   }
   4324 
   4325 /* Terminate the parsed pattern, then return success if all groups are closed.
   4326 Otherwise we have unclosed parentheses. */
   4327 
   4328 if (parsed_pattern >= parsed_pattern_end)
   4329   {
   4330   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
   4331   goto FAILED;
   4332   }
   4333 
   4334 *parsed_pattern = META_END;
   4335 if (nest_depth == 0) return 0;
   4336 
   4337 UNCLOSED_PARENTHESIS:
   4338 errorcode = ERR14;
   4339 
   4340 /* Come here for all failures. */
   4341 
   4342 FAILED:
   4343 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
   4344 return errorcode;
   4345 
   4346 /* Some errors need to indicate the previous character. */
   4347 
   4348 FAILED_BACK:
   4349 ptr--;
   4350 goto FAILED;
   4351 
   4352 /* This failure happens several times. */
   4353 
   4354 BAD_VERSION_CONDITION:
   4355 errorcode = ERR79;
   4356 goto FAILED;
   4357 }
   4358 
   4359 
   4360 
   4361 /*************************************************
   4362 *       Find first significant opcode            *
   4363 *************************************************/
   4364 
   4365 /* This is called by several functions that scan a compiled expression looking
   4366 for a fixed first character, or an anchoring opcode etc. It skips over things
   4367 that do not influence this. For some calls, it makes sense to skip negative
   4368 forward and all backward assertions, and also the \b assertion; for others it
   4369 does not.
   4370 
   4371 Arguments:
   4372   code         pointer to the start of the group
   4373   skipassert   TRUE if certain assertions are to be skipped
   4374 
   4375 Returns:       pointer to the first significant opcode
   4376 */
   4377 
   4378 static const PCRE2_UCHAR*
   4379 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
   4380 {
   4381 for (;;)
   4382   {
   4383   switch ((int)*code)
   4384     {
   4385     case OP_ASSERT_NOT:
   4386     case OP_ASSERTBACK:
   4387     case OP_ASSERTBACK_NOT:
   4388     if (!skipassert) return code;
   4389     do code += GET(code, 1); while (*code == OP_ALT);
   4390     code += PRIV(OP_lengths)[*code];
   4391     break;
   4392 
   4393     case OP_WORD_BOUNDARY:
   4394     case OP_NOT_WORD_BOUNDARY:
   4395     if (!skipassert) return code;
   4396     /* Fall through */
   4397 
   4398     case OP_CALLOUT:
   4399     case OP_CREF:
   4400     case OP_DNCREF:
   4401     case OP_RREF:
   4402     case OP_DNRREF:
   4403     case OP_FALSE:
   4404     case OP_TRUE:
   4405     code += PRIV(OP_lengths)[*code];
   4406     break;
   4407 
   4408     case OP_CALLOUT_STR:
   4409     code += GET(code, 1 + 2*LINK_SIZE);
   4410     break;
   4411 
   4412     case OP_SKIPZERO:
   4413     code += 2 + GET(code, 2) + LINK_SIZE;
   4414     break;
   4415 
   4416     case OP_COND:
   4417     case OP_SCOND:
   4418     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
   4419         code[GET(code, 1)] != OP_KET)      /* More than one branch */
   4420       return code;
   4421     code += GET(code, 1) + 1 + LINK_SIZE;
   4422     break;
   4423 
   4424     default:
   4425     return code;
   4426     }
   4427   }
   4428 /* Control never reaches here */
   4429 }
   4430 
   4431 
   4432 
   4433 #ifdef SUPPORT_UNICODE
   4434 /*************************************************
   4435 *           Get othercase range                  *
   4436 *************************************************/
   4437 
   4438 /* This function is passed the start and end of a class range in UCP mode. It
   4439 searches up the characters, looking for ranges of characters in the "other"
   4440 case. Each call returns the next one, updating the start address. A character
   4441 with multiple other cases is returned on its own with a special return value.
   4442 
   4443 Arguments:
   4444   cptr        points to starting character value; updated
   4445   d           end value
   4446   ocptr       where to put start of othercase range
   4447   odptr       where to put end of othercase range
   4448 
   4449 Yield:        -1 when no more
   4450                0 when a range is returned
   4451               >0 the CASESET offset for char with multiple other cases
   4452                 in this case, ocptr contains the original
   4453 */
   4454 
   static int
   get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
     uint32_t *odptr)
   {
   uint32_t ch = *cptr;       /* Current position in the input range */
   uint32_t oc_first = 0;     /* Other case of the first cased character */
   uint32_t oc_expect;        /* Other case the next character must have */

   /* Phase 1: walk forward to the first character that either belongs to a
   caseless set or has an other case different from itself. A character with a
   caseless set is reported on its own straight away: the caller receives the
   character in *ocptr, the input position is moved past it, and the set offset
   is the return value. */

   while (ch <= d)
     {
     unsigned int set_offset = UCD_CASESET(ch);

     if (set_offset != 0)
       {
       *ocptr = ch;
       *cptr = ch + 1;
       return (int)set_offset;
       }

     oc_first = UCD_OTHERCASE(ch);
     if (oc_first != ch) break;
     ch++;
     }

   /* Running off the end of the input range means there is nothing to report. */

   if (ch > d) return -1;

   /* Phase 2: ch has a single other case. Keep going while each following
   character has no caseless set and its other case is exactly one more than
   the previous character's other case, so that the other cases form a
   contiguous run. */

   *ocptr = oc_first;
   oc_expect = oc_first + 1;
   ch++;

   while (ch <= d)
     {
     if (UCD_CASESET(ch) != 0) break;
     if (UCD_OTHERCASE(ch) != oc_expect) break;
     oc_expect++;
     ch++;
     }

   *odptr = oc_expect - 1;    /* Last value in the othercase run */
   *cptr = ch;                /* Where the next call should resume */
   return 0;
   }
   4495 #endif  /* SUPPORT_UNICODE */
   4496 
   4497 
   4498 
   4499 /*************************************************
   4500 * Add a character or range to a class (internal) *
   4501 *************************************************/
   4502 
   4503 /* This function packages up the logic of adding a character or range of
   4504 characters to a class. The character values in the arguments will be within the
   4505 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
   4506 called only from within the "add to class" group of functions, some of which
   4507 are recursive and mutually recursive. The external entry point is
   4508 add_to_class().
   4509 
   4510 Arguments:
   4511   classbits     the bit map for characters < 256
   4512   uchardptr     points to the pointer for extra data
   4513   options       the options word
   4514   cb            compile data
   4515   start         start of range character
   4516   end           end of range character
   4517 
   4518 Returns:        the number of < 256 characters added
   4519                 the pointer to extra data is updated
   4520 */
   4521 
   4522 static unsigned int
   4523 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
   4524   uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
   4525 {
   4526 uint32_t c;
   4527 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
   4528 unsigned int n8 = 0;
   4529 
   4530 /* If caseless matching is required, scan the range and process alternate
   4531 cases. In Unicode, there are 8-bit characters that have alternate cases that
   4532 are greater than 255 and vice-versa. Sometimes we can just extend the original
   4533 range. */
   4534 
   4535 if ((options & PCRE2_CASELESS) != 0)
   4536   {
   4537 #ifdef SUPPORT_UNICODE
   4538   if ((options & PCRE2_UTF) != 0)
   4539     {
   4540     int rc;
   4541     uint32_t oc, od;
   4542 
   4543     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
   4544     c = start;
   4545 
   4546     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
   4547       {
   4548       /* Handle a single character that has more than one other case. */
   4549 
   4550       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
   4551         PRIV(ucd_caseless_sets) + rc, oc);
   4552 
   4553       /* Do nothing if the other case range is within the original range. */
   4554 
   4555       else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
   4556 
   4557       /* Extend the original range if there is overlap, noting that if oc < c, we
   4558       can't have od > end because a subrange is always shorter than the basic
   4559       range. Otherwise, use a recursive call to add the additional range. */
   4560 
   4561       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
   4562       else if (od > end && oc <= end + 1)
   4563         {
   4564         end = od;       /* Extend upwards */
   4565         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
   4566         }
   4567       else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
   4568       }
   4569     }
   4570   else
   4571 #endif  /* SUPPORT_UNICODE */
   4572 
   4573   /* Not UTF mode */
   4574 
   4575   for (c = start; c <= classbits_end; c++)
   4576     {
   4577     SETBIT(classbits, cb->fcc[c]);
   4578     n8++;
   4579     }
   4580   }
   4581 
   4582 /* Now handle the originally supplied range. Adjust the final value according
   4583 to the bit length - this means that the same lists of (e.g.) horizontal spaces
   4584 can be used in all cases. */
   4585 
   4586 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
   4587   end = MAX_NON_UTF_CHAR;
   4588 
   4589 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
   4590 
   4591 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
   4592 
   4593 for (c = start; c <= classbits_end; c++)
   4594   {
   4595   /* Regardless of start, c will always be <= 255. */
   4596   SETBIT(classbits, c);
   4597   n8++;
   4598   }
   4599 
   4600 #ifdef SUPPORT_WIDE_CHARS
   4601 if (start <= 0xff) start = 0xff + 1;
   4602 
   4603 if (end >= start)
   4604   {
   4605   PCRE2_UCHAR *uchardata = *uchardptr;
   4606 
   4607 #ifdef SUPPORT_UNICODE
   4608   if ((options & PCRE2_UTF) != 0)
   4609     {
   4610     if (start < end)
   4611       {
   4612       *uchardata++ = XCL_RANGE;
   4613       uchardata += PRIV(ord2utf)(start, uchardata);
   4614       uchardata += PRIV(ord2utf)(end, uchardata);
   4615       }
   4616     else if (start == end)
   4617       {
   4618       *uchardata++ = XCL_SINGLE;
   4619       uchardata += PRIV(ord2utf)(start, uchardata);
   4620       }
   4621     }
   4622   else
   4623 #endif  /* SUPPORT_UNICODE */
   4624 
   4625   /* Without UTF support, character values are constrained by the bit length,
   4626   and can only be > 256 for 16-bit and 32-bit libraries. */
   4627 
   4628 #if PCRE2_CODE_UNIT_WIDTH == 8
   4629     {}
   4630 #else
   4631   if (start < end)
   4632     {
   4633     *uchardata++ = XCL_RANGE;
   4634     *uchardata++ = start;
   4635     *uchardata++ = end;
   4636     }
   4637   else if (start == end)
   4638     {
   4639     *uchardata++ = XCL_SINGLE;
   4640     *uchardata++ = start;
   4641     }
   4642 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
   4643   *uchardptr = uchardata;   /* Updata extra data pointer */
   4644   }
   4645 #else  /* SUPPORT_WIDE_CHARS */
   4646   (void)uchardptr;          /* Avoid compiler warning */
   4647 #endif /* SUPPORT_WIDE_CHARS */
   4648 
   4649 return n8;    /* Number of 8-bit characters */
   4650 }
   4651 
   4652 
   4653 
   #ifdef SUPPORT_UNICODE
   /*************************************************
   * Add a list of characters to a class (internal) *
   *************************************************/

   /* Add a list of case-equivalent characters to a class when in UTF mode. The
   only caller is add_to_class_internal(), which this function calls in turn, so
   the two are mutually recursive.

   The list is scanned as a sequence of runs of consecutive values, and each run
   is handed to add_to_class_internal() as a single range. An entry equal to
   "except" is dropped when it is found at the start of a run; only that one
   entry is skipped, and scanning resumes with the entry that follows it.

   Arguments:
     classbits     the bit map for characters < 256
     uchardptr     points to the pointer for extra data
     options       the options word
     cb            contains pointers to tables etc.
     p             points to row of 32-bit values, terminated by NOTACHAR
     except        character to omit; this is used when adding lists of
                     case-equivalent characters to avoid including the one we
                     already know about

   Returns:        the number of < 256 characters added
                   the pointer to extra data is updated
   */

   static unsigned int
   add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
     uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
   {
   unsigned int count8 = 0;
   const uint32_t *run_first = p;

   while (*run_first < NOTACHAR)
     {
     const uint32_t *run_last = run_first;

     if (*run_first != except)
       {
       /* Stretch the run for as long as each value is one more than the value
       before it. */

       while (run_last[1] == run_last[0] + 1) run_last++;
       count8 += add_to_class_internal(classbits, uchardptr, options, cb,
         *run_first, *run_last);
       }

     run_first = run_last + 1;   /* Step past the run or the omitted entry */
     }

   return count8;
   }
   #endif  /* SUPPORT_UNICODE */
   4695 
   4696 
   4697 
   4698 /*************************************************
   4699 *   External entry point for add range to class  *
   4700 *************************************************/
   4701 
   4702 /* This function sets the overall range so that the internal functions can try
   4703 to avoid duplication when handling case-independence.
   4704 
   4705 Arguments:
   4706   classbits     the bit map for characters < 256
   4707   uchardptr     points to the pointer for extra data
   4708   options       the options word
   4709   cb            compile data
   4710   start         start of range character
   4711   end           end of range character
   4712 
   4713 Returns:        the number of < 256 characters added
   4714                 the pointer to extra data is updated
   4715 */
   4716 
   4717 static unsigned int
   4718 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
   4719   compile_block *cb, uint32_t start, uint32_t end)
   4720 {
   4721 cb->class_range_start = start;
   4722 cb->class_range_end = end;
   4723 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
   4724 }
   4725 
   4726 
   4727 /*************************************************
   4728 *   External entry point for add list to class   *
   4729 *************************************************/
   4730 
   4731 /* This function is used for adding a list of horizontal or vertical whitespace
   4732 characters to a class. The list must be in order so that ranges of characters
   4733 can be detected and handled appropriately. This function sets the overall range
   4734 so that the internal functions can try to avoid duplication when handling
   4735 case-independence.
   4736 
   4737 Arguments:
   4738   classbits     the bit map for characters < 256
   4739   uchardptr     points to the pointer for extra data
   4740   options       the options word
   4741   cb            contains pointers to tables etc.
   4742   p             points to row of 32-bit values, terminated by NOTACHAR
   4743   except        character to omit; this is used when adding lists of
   4744                   case-equivalent characters to avoid including the one we
   4745                   already know about
   4746 
   4747 Returns:        the number of < 256 characters added
   4748                 the pointer to extra data is updated
   4749 */
   4750 
   4751 static unsigned int
   4752 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
   4753   compile_block *cb, const uint32_t *p, unsigned int except)
   4754 {
   4755 unsigned int n8 = 0;
   4756 while (p[0] < NOTACHAR)
   4757   {
   4758   unsigned int n = 0;
   4759   if (p[0] != except)
   4760     {
   4761     while(p[n+1] == p[0] + n + 1) n++;
   4762     cb->class_range_start = p[0];
   4763     cb->class_range_end = p[n];
   4764     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
   4765     }
   4766   p += n + 1;
   4767   }
   4768 return n8;
   4769 }
   4770 
   4771 
   4772 
   4773 /*************************************************
   4774 *    Add characters not in a list to a class     *
   4775 *************************************************/
   4776 
   4777 /* This function is used for adding the complement of a list of horizontal or
   4778 vertical whitespace to a class. The list must be in order.
   4779 
   4780 Arguments:
   4781   classbits     the bit map for characters < 256
   4782   uchardptr     points to the pointer for extra data
   4783   options       the options word
   4784   cb            contains pointers to tables etc.
   4785   p             points to row of 32-bit values, terminated by NOTACHAR
   4786 
   4787 Returns:        the number of < 256 characters added
   4788                 the pointer to extra data is updated
   4789 */
   4790 
   4791 static unsigned int
   4792 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
   4793   uint32_t options, compile_block *cb, const uint32_t *p)
   4794 {
   4795 BOOL utf = (options & PCRE2_UTF) != 0;
   4796 unsigned int n8 = 0;
   4797 if (p[0] > 0)
   4798   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
   4799 while (p[0] < NOTACHAR)
   4800   {
   4801   while (p[1] == p[0] + 1) p++;
   4802   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
   4803     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
   4804   p++;
   4805   }
   4806 return n8;
   4807 }
   4808 
   4809 
   4810 
   4811 /*************************************************
   4812 *    Find details of duplicate group names       *
   4813 *************************************************/
   4814 
   4815 /* This is called from compile_branch() when it needs to know the index and
   4816 count of duplicates in the names table when processing named backreferences,
   4817 either directly, or as conditions.
   4818 
   4819 Arguments:
   4820   name          points to the name
   4821   length        the length of the name
   4822   indexptr      where to put the index
   4823   countptr      where to put the count of duplicates
   4824   errorcodeptr  where to put an error code
   4825   cb            the compile block
   4826 
   4827 Returns:        TRUE if OK, FALSE if not, error code set
   4828 */
   4829 
   4830 static BOOL
   4831 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
   4832   int *countptr, int *errorcodeptr, compile_block *cb)
   4833 {
   4834 uint32_t i, groupnumber;
   4835 int count;
   4836 PCRE2_UCHAR *slot = cb->name_table;
   4837 
   4838 /* Find the first entry in the table */
   4839 
   4840 for (i = 0; i < cb->names_found; i++)
   4841   {
   4842   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
   4843       slot[IMM2_SIZE+length] == 0) break;
   4844   slot += cb->name_entry_size;
   4845   }
   4846 
   4847 /* This should not occur, because this function is called only when we know we
   4848 have duplicate names. Give an internal error. */
   4849 
   4850 if (i >= cb->names_found)
   4851   {
   4852   *errorcodeptr = ERR53;
   4853   cb->erroroffset = name - cb->start_pattern;
   4854   return FALSE;
   4855   }
   4856 
   4857 /* Record the index and then see how many duplicates there are, updating the
   4858 backref map and maximum back reference as we do. */
   4859 
   4860 *indexptr = i;
   4861 count = 0;
   4862 
   4863 for (;;)
   4864   {
   4865   count++;
   4866   groupnumber = GET2(slot,0);
   4867   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
   4868   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
   4869   if (++i >= cb->names_found) break;
   4870   slot += cb->name_entry_size;
   4871   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
   4872     (slot+IMM2_SIZE)[length] != 0) break;
   4873   }
   4874 
   4875 *countptr = count;
   4876 return TRUE;
   4877 }
   4878 
   4879 
   4880 
   4881 /*************************************************
   4882 *           Compile one branch                   *
   4883 *************************************************/
   4884 
   4885 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
   4886 the options are changed during the branch, the pointer is used to change the
   4887 external options bits. This function is used during the pre-compile phase when
   4888 we are trying to find out the amount of memory needed, as well as during the
   4889 real compile phase. The value of lengthptr distinguishes the two phases.
   4890 
   4891 Arguments:
   4892   optionsptr        pointer to the option bits
   4893   codeptr           points to the pointer to the current code point
   4894   pptrptr           points to the current parsed pattern pointer
   4895   errorcodeptr      points to error code variable
   4896   firstcuptr        place to put the first required code unit
   4897   firstcuflagsptr   place to put the first code unit flags, or a negative number
   4898   reqcuptr          place to put the last required code unit
   4899   reqcuflagsptr     place to put the last required code unit flags, or a negative number
   4900   bcptr             points to current branch chain
   4901   cb                contains pointers to tables etc.
   4902   lengthptr         NULL during the real compile phase
   4903                     points to length accumulator during pre-compile phase
   4904 
   4905 Returns:            0 There's been an error, *errorcodeptr is non-zero
   4906                    +1 Success, this branch must match at least one character
   4907                    -1 Success, this branch may match an empty string
   4908 */
   4909 
   4910 static int
   4911 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
   4912   int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
   4913   uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
   4914   compile_block *cb, PCRE2_SIZE *lengthptr)
   4915 {
   4916 int bravalue = 0;
   4917 int okreturn = -1;
   4918 int group_return = 0;
   4919 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
   4920 uint32_t greedy_default, greedy_non_default;
   4921 uint32_t repeat_type, op_type;
   4922 uint32_t options = *optionsptr;               /* May change dynamically */
   4923 uint32_t firstcu, reqcu;
   4924 uint32_t zeroreqcu, zerofirstcu;
   4925 uint32_t escape;
   4926 uint32_t *pptr = *pptrptr;
   4927 uint32_t meta, meta_arg;
   4928 int32_t firstcuflags, reqcuflags;
   4929 int32_t zeroreqcuflags, zerofirstcuflags;
   4930 int32_t req_caseopt, reqvary, tempreqvary;
   4931 PCRE2_SIZE offset = 0;
   4932 PCRE2_SIZE length_prevgroup = 0;
   4933 PCRE2_UCHAR *code = *codeptr;
   4934 PCRE2_UCHAR *last_code = code;
   4935 PCRE2_UCHAR *orig_code = code;
   4936 PCRE2_UCHAR *tempcode;
   4937 PCRE2_UCHAR *previous = NULL;
   4938 PCRE2_UCHAR op_previous;
   4939 BOOL groupsetfirstcu = FALSE;
   4940 BOOL matched_char = FALSE;
   4941 BOOL previous_matched_char = FALSE;
   4942 const uint8_t *cbits = cb->cbits;
   4943 uint8_t classbits[32];
   4944 
   4945 /* We can fish out the UTF setting once and for all into a BOOL, but we must
   4946 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
   4947 dynamically as we process the pattern. */
   4948 
   4949 #ifdef SUPPORT_UNICODE
   4950 BOOL utf = (options & PCRE2_UTF) != 0;
   4951 #else  /* No UTF support */
   4952 BOOL utf = FALSE;
   4953 #endif
   4954 
   4955 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
   4956 class_uchardata always so that it can be passed to add_to_class() always,
   4957 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
   4958 alternative calls for the different cases. */
   4959 
   4960 PCRE2_UCHAR *class_uchardata;
   4961 #ifdef SUPPORT_WIDE_CHARS
   4962 BOOL xclass;
   4963 PCRE2_UCHAR *class_uchardata_base;
   4964 #endif
   4965 
   4966 /* Set up the default and non-default settings for greediness */
   4967 
   4968 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
   4969 greedy_non_default = greedy_default ^ 1;
   4970 
   4971 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
   4972 matching encountered yet". It gets changed to REQ_NONE if we hit something that
   4973 matches a non-fixed first unit; reqcu just remains unset if we never find one.
   4974 
   4975 When we hit a repeat whose minimum is zero, we may have to adjust these values
   4976 to take the zero repeat into account. This is implemented by setting them to
   4977 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
   4978 item types that can be repeated set these backoff variables appropriately. */
   4979 
   4980 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
   4981 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
   4982 
   4983 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
   4984 according to the current setting of the caseless flag. The REQ_CASELESS value
   4985 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
   4986 to record the case status of the value. This is used only for ASCII characters.
   4987 */
   4988 
   4989 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
   4990 
   4991 /* Switch on next META item until the end of the branch */
   4992 
   4993 for (;; pptr++)
   4994   {
   4995 #ifdef SUPPORT_WIDE_CHARS
   4996   BOOL xclass_has_prop;
   4997 #endif
   4998   BOOL negate_class;
   4999   BOOL should_flip_negation;
   5000   BOOL match_all_or_no_wide_chars;
   5001   BOOL possessive_quantifier;
   5002   BOOL note_group_empty;
   5003   int class_has_8bitchar;
   5004   int i;
   5005   uint32_t mclength;
   5006   uint32_t skipunits;
   5007   uint32_t subreqcu, subfirstcu;
   5008   uint32_t groupnumber;
   5009   uint32_t verbarglen, verbculen;
   5010   int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
   5011   open_capitem *oc;
   5012   PCRE2_UCHAR mcbuffer[8];
   5013 
   5014   /* Get next META item in the pattern and its potential argument. */
   5015 
   5016   meta = META_CODE(*pptr);
   5017   meta_arg = META_DATA(*pptr);
   5018 
   5019   /* If we are in the pre-compile phase, accumulate the length used for the
   5020   previous cycle of this loop, unless the next item is a quantifier. */
   5021 
   5022   if (lengthptr != NULL)
   5023     {
   5024     if (code > cb->start_workspace + cb->workspace_size -
   5025         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
   5026       {
   5027       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
   5028         ERR52 : ERR86;
   5029       return 0;
   5030       }
   5031 
   5032     /* There is at least one situation where code goes backwards: this is the
   5033     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
   5034     is processed, the whole class is eliminated. However, it is created first,
   5035     so we have to allow memory for it. Therefore, don't ever reduce the length
   5036     at this point. */
   5037 
   5038     if (code < last_code) code = last_code;
   5039 
   5040     /* If the next thing is not a quantifier, we add the length of the previous
   5041     item into the total, and reset the code pointer to the start of the
   5042     workspace. Otherwise leave the previous item available to be quantified. */
   5043 
   5044     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
   5045       {
   5046       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
   5047         {
   5048         *errorcodeptr = ERR20;   /* Integer overflow */
   5049         return 0;
   5050         }
   5051       *lengthptr += (PCRE2_SIZE)(code - orig_code);
   5052       if (*lengthptr > MAX_PATTERN_SIZE)
   5053         {
   5054         *errorcodeptr = ERR20;   /* Pattern is too large */
   5055         return 0;
   5056         }
   5057       code = orig_code;
   5058       }
   5059 
   5060     /* Remember where this code item starts so we can catch the "backwards"
   5061     case above next time round. */
   5062 
   5063     last_code = code;
   5064     }
   5065 
   5066   /* Process the next parsed pattern item. If it is not a quantifier, remember
   5067   where it starts so that it can be quantified when a quantifier follows.
   5068   Checking for the legality of quantifiers happens in parse_regex(), except for
   5069   a quantifier after an assertion that is a condition. */
   5070 
   5071   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
   5072     {
   5073     previous = code;
   5074     if (matched_char) okreturn = 1;
   5075     }
   5076 
   5077   previous_matched_char = matched_char;
   5078   matched_char = FALSE;
   5079   note_group_empty = FALSE;
   5080   skipunits = 0;         /* Default value for most subgroups */
   5081 
   5082   switch(meta)
   5083     {
   5084     /* ===================================================================*/
   5085     /* The branch terminates at pattern end or | or ) */
   5086 
   5087     case META_END:
   5088     case META_ALT:
   5089     case META_KET:
   5090     *firstcuptr = firstcu;
   5091     *firstcuflagsptr = firstcuflags;
   5092     *reqcuptr = reqcu;
   5093     *reqcuflagsptr = reqcuflags;
   5094     *codeptr = code;
   5095     *pptrptr = pptr;
   5096     return okreturn;
   5097 
   5098 
   5099     /* ===================================================================*/
   5100     /* Handle single-character metacharacters. In multiline mode, ^ disables
   5101     the setting of any following char as a first character. */
   5102 
   5103     case META_CIRCUMFLEX:
   5104     if ((options & PCRE2_MULTILINE) != 0)
   5105       {
   5106       if (firstcuflags == REQ_UNSET)
   5107         zerofirstcuflags = firstcuflags = REQ_NONE;
   5108       *code++ = OP_CIRCM;
   5109       }
   5110     else *code++ = OP_CIRC;
   5111     break;
   5112 
   5113     case META_DOLLAR:
   5114     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
   5115     break;
   5116 
   5117     /* There can never be a first char if '.' is first, whatever happens about
   5118     repeats. The value of reqcu doesn't change either. */
   5119 
   5120     case META_DOT:
   5121     matched_char = TRUE;
   5122     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   5123     zerofirstcu = firstcu;
   5124     zerofirstcuflags = firstcuflags;
   5125     zeroreqcu = reqcu;
   5126     zeroreqcuflags = reqcuflags;
   5127     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
   5128     break;
   5129 
   5130 
   5131     /* ===================================================================*/
   5132     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
   5133     Otherwise, an initial ']' is taken as a data character. When empty classes
   5134     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
   5135     match any character, so generate OP_ALLANY. */
   5136 
   5137     case META_CLASS_EMPTY:
   5138     case META_CLASS_EMPTY_NOT:
   5139     matched_char = TRUE;
   5140     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
   5141     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   5142     zerofirstcu = firstcu;
   5143     zerofirstcuflags = firstcuflags;
   5144     break;
   5145 
   5146 
   5147     /* ===================================================================*/
   5148     /* Non-empty character class. If the included characters are all < 256, we
   5149     build a 32-byte bitmap of the permitted characters, except in the special
   5150     case where there is only one such character. For negated classes, we build
   5151     the map as usual, then invert it at the end. However, we use a different
   5152     opcode so that data characters > 255 can be handled correctly.
   5153 
   5154     If the class contains characters outside the 0-255 range, a different
   5155     opcode is compiled. It may optionally have a bit map for characters < 256,
   5156     but those above are explicitly listed afterwards. A flag code unit
   5157     tells whether the bitmap is present, and whether this is a negated class or
   5158     not. */
   5159 
   5160     case META_CLASS_NOT:
   5161     case META_CLASS:
   5162     matched_char = TRUE;
   5163     negate_class = meta == META_CLASS_NOT;
   5164 
   5165     /* We can optimize the case of a single character in a class by generating
   5166     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
   5167     negative. In the negative case there can be no first char if this item is
   5168     first, whatever repeat count may follow. In the case of reqcu, save the
   5169     previous value for reinstating. */
   5170 
   5171     /* NOTE: at present this optimization is not effective if the only
   5172     character in a class in 32-bit, non-UCP mode has its top bit set. */
   5173 
   5174     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
   5175       {
   5176 #ifdef SUPPORT_UNICODE
   5177       uint32_t d;
   5178 #endif
   5179       uint32_t c = pptr[1];
   5180 
   5181       pptr += 2;                 /* Move on to class end */
   5182       if (meta == META_CLASS)    /* A positive one-char class can be */
   5183         {                        /* handled as a normal literal character. */
   5184         meta = c;                /* Set up the character */
   5185         goto NORMAL_CHAR_SET;
   5186         }
   5187 
   5188       /* Handle a negative one-character class */
   5189 
   5190       zeroreqcu = reqcu;
   5191       zeroreqcuflags = reqcuflags;
   5192       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   5193       zerofirstcu = firstcu;
   5194       zerofirstcuflags = firstcuflags;
   5195 
   5196       /* For caseless UTF mode, check whether this character has more than
   5197       one other case. If so, generate a special OP_NOTPROP item instead of
   5198       OP_NOTI. */
   5199 
   5200 #ifdef SUPPORT_UNICODE
   5201       if (utf && (options & PCRE2_CASELESS) != 0 &&
   5202           (d = UCD_CASESET(c)) != 0)
   5203         {
   5204         *code++ = OP_NOTPROP;
   5205         *code++ = PT_CLIST;
   5206         *code++ = d;
   5207         break;   /* We are finished with this class */
   5208         }
   5209 #endif
   5210       /* Char has only one other case, or UCP not available */
   5211 
   5212       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
   5213       code += PUTCHAR(c, code);
   5214       break;   /* We are finished with this class */
   5215       }        /* End of 1-char optimization */
   5216 
   5217     /* Handle character classes that contain more than just one literal
   5218     character. */
   5219 
   5220     /* If a non-extended class contains a negative special such as \S, we need
   5221     to flip the negation flag at the end, so that support for characters > 255
   5222     works correctly (they are all included in the class). An extended class may
   5223     need to insert specific matching or non-matching code for wide characters.
   5224     */
   5225 
   5226     should_flip_negation = match_all_or_no_wide_chars = FALSE;
   5227 
   5228     /* Extended class (xclass) will be used when characters > 255
   5229     might match. */
   5230 
   5231 #ifdef SUPPORT_WIDE_CHARS
   5232     xclass = FALSE;
   5233     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
   5234     class_uchardata_base = class_uchardata;   /* Save the start */
   5235 #endif
   5236 
   5237     /* For optimization purposes, we track some properties of the class:
   5238     class_has_8bitchar will be non-zero if the class contains at least one
   5239     character with a code point less than 256; xclass_has_prop will be TRUE if
   5240     Unicode property checks are present in the class. */
   5241 
   5242     class_has_8bitchar = 0;
   5243 #ifdef SUPPORT_WIDE_CHARS
   5244     xclass_has_prop = FALSE;
   5245 #endif
   5246 
   5247     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
   5248     in a temporary bit of memory, in case the class contains fewer than two
   5249     8-bit characters because in that case the compiled code doesn't use the bit
   5250     map. */
   5251 
   5252     memset(classbits, 0, 32 * sizeof(uint8_t));
   5253 
   5254     /* Process items until META_CLASS_END is reached. */
   5255 
   5256     while ((meta = *(++pptr)) != META_CLASS_END)
   5257       {
   5258       /* Handle POSIX classes such as [:alpha:] etc. */
   5259 
   5260       if (meta == META_POSIX || meta == META_POSIX_NEG)
   5261         {
   5262         BOOL local_negate = (meta == META_POSIX_NEG);
   5263         int posix_class = *(++pptr);
   5264         int taboffset, tabopt;
   5265         uint8_t pbits[32];
   5266 
   5267         should_flip_negation = local_negate;  /* Note negative special */
   5268 
   5269         /* If matching is caseless, upper and lower are converted to alpha.
   5270         This relies on the fact that the class table starts with alpha,
   5271         lower, upper as the first 3 entries. */
   5272 
   5273         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
   5274           posix_class = 0;
   5275 
   5276         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
   5277         different escape sequences that use Unicode properties \p or \P.
   5278         Others that are not available via \p or \P have to generate
   5279         XCL_PROP/XCL_NOTPROP directly, which is done here. */
   5280 
   5281 #ifdef SUPPORT_UNICODE
   5282         if ((options & PCRE2_UCP) != 0) switch(posix_class)
   5283           {
   5284           case PC_GRAPH:
   5285           case PC_PRINT:
   5286           case PC_PUNCT:
   5287           *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
   5288           *class_uchardata++ = (PCRE2_UCHAR)
   5289             ((posix_class == PC_GRAPH)? PT_PXGRAPH :
   5290              (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
   5291           *class_uchardata++ = 0;
   5292           xclass_has_prop = TRUE;
   5293           goto CONTINUE_CLASS;
   5294 
   5295           /* For the other POSIX classes (ascii, xdigit) we are going to
   5296           fall through to the non-UCP case and build a bit map for
   5297           characters with code points less than 256. However, if we are in
   5298           a negated POSIX class, characters with code points greater than
   5299           255 must either all match or all not match, depending on whether
   5300           the whole class is not or is negated. For example, for
   5301           [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
   5302           they must not.
   5303 
   5304           In the special case where there are no xclass items, this is
   5305           automatically handled by the use of OP_CLASS or OP_NCLASS, but an
   5306           explicit range is needed for OP_XCLASS. Setting a flag here
   5307           causes the range to be generated later when it is known that
   5308           OP_XCLASS is required. In the 8-bit library this is relevant only in
   5309           utf mode, since no wide characters can exist otherwise. */
   5310 
   5311           default:
   5312 #if PCRE2_CODE_UNIT_WIDTH == 8
   5313           if (utf)
   5314 #endif
   5315           match_all_or_no_wide_chars |= local_negate;
   5316           break;
   5317           }
   5318 #endif  /* SUPPORT_UNICODE */
   5319 
   5320         /* In the non-UCP case, or when UCP makes no difference, we build the
   5321         bit map for the POSIX class in a chunk of local store because we may
   5322         be adding and subtracting from it, and we don't want to subtract bits
   5323         that may be in the main map already. At the end we or the result into
   5324         the bit map that is being built. */
   5325 
   5326         posix_class *= 3;
   5327 
   5328         /* Copy in the first table (always present) */
   5329 
   5330         memcpy(pbits, cbits + posix_class_maps[posix_class],
   5331           32 * sizeof(uint8_t));
   5332 
   5333         /* If there is a second table, add or remove it as required. */
   5334 
   5335         taboffset = posix_class_maps[posix_class + 1];
   5336         tabopt = posix_class_maps[posix_class + 2];
   5337 
   5338         if (taboffset >= 0)
   5339           {
   5340           if (tabopt >= 0)
   5341             for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
   5342           else
   5343             for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
   5344           }
   5345 
   5346         /* Now see if we need to remove any special characters. An option
   5347         value of 1 removes vertical space and 2 removes underscore. */
   5348 
   5349         if (tabopt < 0) tabopt = -tabopt;
   5350         if (tabopt == 1) pbits[1] &= ~0x3c;
   5351           else if (tabopt == 2) pbits[11] &= 0x7f;
   5352 
   5353         /* Add the POSIX table or its complement into the main table that is
   5354         being built and we are done. */
   5355 
   5356         if (local_negate)
   5357           for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
   5358         else
   5359           for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
   5360 
   5361         /* Every class contains at least one < 256 character. */
   5362 
   5363         class_has_8bitchar = 1;
   5364         goto CONTINUE_CLASS;    /* End of POSIX handling */
   5365         }
   5366 
   5367       /* Other than POSIX classes, the only items we should encounter are
   5368       \d-type escapes and literal characters (possibly as ranges). */
   5369 
   5370       if (meta == META_BIGVALUE)
   5371         {
   5372         meta = *(++pptr);
   5373         goto CLASS_LITERAL;
   5374         }
   5375 
   5376       /* Any other non-literal must be an escape */
   5377 
   5378       if (meta >= META_END)
   5379         {
   5380         if (META_CODE(meta) != META_ESCAPE)
   5381           {
   5382 #ifdef DEBUG_SHOW_PARSED
   5383           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
   5384                           "in character class\n", meta);
   5385 #endif
   5386           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
   5387           return 0;
   5388           }
   5389         escape = META_DATA(meta);
   5390 
   5391         /* Every class contains at least one < 256 character. */
   5392 
   5393         class_has_8bitchar++;
   5394 
   5395         switch(escape)
   5396           {
   5397           case ESC_d:
   5398           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
   5399           break;
   5400 
   5401           case ESC_D:
   5402           should_flip_negation = TRUE;
   5403           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
   5404           break;
   5405 
   5406           case ESC_w:
   5407           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
   5408           break;
   5409 
   5410           case ESC_W:
   5411           should_flip_negation = TRUE;
   5412           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
   5413           break;
   5414 
   5415           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
   5416           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
   5417           previously set by something earlier in the character class.
   5418           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
   5419           we could just adjust the appropriate bit. From PCRE 8.34 we no
   5420           longer treat \s and \S specially. */
   5421 
   5422           case ESC_s:
   5423           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
   5424           break;
   5425 
   5426           case ESC_S:
   5427           should_flip_negation = TRUE;
   5428           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
   5429           break;
   5430 
   5431           /* When adding the horizontal or vertical space lists to a class, or
   5432           their complements, disable PCRE2_CASELESS, because it just wastes
   5433           time, and in the "not-x" UTF cases can create unwanted duplicates in
   5434           the XCLASS list (provoked by characters that have more than one other
   5435           case and by both cases being in the same "not-x" sublist). */
   5436 
   5437           case ESC_h:
   5438           (void)add_list_to_class(classbits, &class_uchardata,
   5439             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
   5440           break;
   5441 
   5442           case ESC_H:
   5443           (void)add_not_list_to_class(classbits, &class_uchardata,
   5444             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
   5445           break;
   5446 
   5447           case ESC_v:
   5448           (void)add_list_to_class(classbits, &class_uchardata,
   5449             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
   5450           break;
   5451 
   5452           case ESC_V:
   5453           (void)add_not_list_to_class(classbits, &class_uchardata,
   5454             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
   5455           break;
   5456 
   5457           /* If Unicode is not supported, \P and \p are not allowed and are
   5458           faulted at parse time, so will never appear here. */
   5459 
   5460 #ifdef SUPPORT_UNICODE
   5461           case ESC_p:
   5462           case ESC_P:
   5463             {
   5464             uint32_t ptype = *(++pptr) >> 16;
   5465             uint32_t pdata = *pptr & 0xffff;
   5466             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
   5467             *class_uchardata++ = ptype;
   5468             *class_uchardata++ = pdata;
   5469             xclass_has_prop = TRUE;
   5470             class_has_8bitchar--;                /* Undo! */
   5471             }
   5472           break;
   5473 #endif
   5474           }
   5475 
   5476         goto CONTINUE_CLASS;
   5477         }  /* End handling \d-type escapes */
   5478 
   5479       /* A literal character may be followed by a range meta. At parse time
   5480       there are checks for out-of-order characters, for ranges where the two
   5481       characters are equal, and for hyphens that cannot indicate a range. At
   5482       this point, therefore, no checking is needed. */
   5483 
   5484       else
   5485         {
   5486         uint32_t c, d;
   5487 
   5488         CLASS_LITERAL:
   5489         c = d = meta;
   5490 
   5491         /* Remember if \r or \n were explicitly used */
   5492 
   5493         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
   5494 
   5495         /* Process a character range */
   5496 
   5497         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
   5498           {
   5499 #ifdef EBCDIC
   5500           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
   5501 #endif
   5502           pptr += 2;
   5503           d = *pptr;
   5504           if (d == META_BIGVALUE) d = *(++pptr);
   5505 
   5506           /* Remember an explicit \r or \n, and add the range to the class. */
   5507 
   5508           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
   5509 
   5510           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
   5511           because there are holes in the encoding, and simply using the range
   5512           A-Z (for example) would include the characters in the holes. This
   5513           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
   5514 
   5515 #ifdef EBCDIC
   5516           if (range_is_literal &&
   5517                (cb->ctypes[c] & ctype_letter) != 0 &&
   5518                (cb->ctypes[d] & ctype_letter) != 0 &&
   5519                (d <= CHAR_z) == (d <= CHAR_z))
   5520             {
   5521             uint32_t uc = (d <= CHAR_z)? 0 : 64;
   5522             uint32_t C = d - uc;
   5523             uint32_t D = d - uc;
   5524 
   5525             if (C <= CHAR_i)
   5526               {
   5527               class_has_8bitchar +=
   5528                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
   5529                   ((D < CHAR_i)? D : CHAR_i) + uc);
   5530               C = CHAR_j;
   5531               }
   5532 
   5533             if (C <= D && C <= CHAR_r)
   5534               {
   5535               class_has_8bitchar +=
   5536                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
   5537                   ((D < CHAR_r)? D : CHAR_r) + uc);
   5538               C = CHAR_s;
   5539               }
   5540 
   5541             if (C <= D)
   5542               {
   5543               class_has_8bitchar +=
   5544                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
   5545                   D + uc);
   5546               }
   5547             }
   5548           else
   5549 #endif
   5550           /* Not an EBCDIC special range */
   5551 
   5552           class_has_8bitchar +=
   5553             add_to_class(classbits, &class_uchardata, options, cb, c, d);
   5554           goto CONTINUE_CLASS;   /* Go get the next char in the class */
   5555           }  /* End of range handling */
   5556 
   5557 
   5558         /* Handle a single character. */
   5559 
   5560         class_has_8bitchar +=
   5561           add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
   5562         }
   5563 
   5564       /* Continue to the next item in the class. */
   5565 
   5566       CONTINUE_CLASS:
   5567 
   5568 #ifdef SUPPORT_WIDE_CHARS
   5569       /* If any wide characters or Unicode properties have been encountered,
   5570       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
   5571       of the extra data and reset the pointer. This is so that very large
   5572       classes that contain a zillion wide characters or Unicode property tests
   5573       do not overwrite the workspace (which is on the stack). */
   5574 
   5575       if (class_uchardata > class_uchardata_base)
   5576         {
   5577         xclass = TRUE;
   5578         if (lengthptr != NULL)
   5579           {
   5580           *lengthptr += class_uchardata - class_uchardata_base;
   5581           class_uchardata = class_uchardata_base;
   5582           }
   5583         }
   5584 #endif
   5585 
   5586       continue;  /* Needed to avoid error when not supporting wide chars */
   5587       }   /* End of main class-processing loop */
   5588 
   5589     /* If this class is the first thing in the branch, there can be no first
   5590     char setting, whatever the repeat count. Any reqcu setting must remain
   5591     unchanged after any kind of repeat. */
   5592 
   5593     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   5594     zerofirstcu = firstcu;
   5595     zerofirstcuflags = firstcuflags;
   5596     zeroreqcu = reqcu;
   5597     zeroreqcuflags = reqcuflags;
   5598 
   5599     /* If there are characters with values > 255, or Unicode property settings
   5600     (\p or \P), we have to compile an extended class, with its own opcode,
   5601     unless there were no property settings and there was a negated special such
   5602     as \S in the class, and PCRE2_UCP is not set, because in that case all
   5603     characters > 255 are in or not in the class, so any that were explicitly
   5604     given as well can be ignored.
   5605 
   5606     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
   5607     [^:xdigit:]) were present in a class, we either have to match or not match
   5608     all wide characters (depending on whether the whole class is or is not
   5609     negated). This requirement is indicated by match_all_or_no_wide_chars being
   5610     true. We do this by including an explicit range, which works in both cases.
   5611     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
   5612     cannot be any wide characters in 8-bit non-UTF mode.
   5613 
   5614     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
   5615     class where \S etc is present without PCRE2_UCP, causing an extended class
   5616     to be compiled, we make sure that all characters > 255 are included by
   5617     forcing match_all_or_no_wide_chars to be true.
   5618 
   5619     If, when generating an xclass, there are no characters < 256, we can omit
   5620     the bitmap in the actual compiled code. */
   5621 
   5622 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
   5623     if (xclass && (
   5624 #ifdef SUPPORT_UNICODE
   5625         (options & PCRE2_UCP) != 0 ||
   5626 #endif
   5627         xclass_has_prop || !should_flip_negation))
   5628       {
   5629       if (match_all_or_no_wide_chars || (
   5630 #if PCRE2_CODE_UNIT_WIDTH == 8
   5631            utf &&
   5632 #endif
   5633            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
   5634         {
   5635         *class_uchardata++ = XCL_RANGE;
   5636         if (utf)   /* Will always be utf in the 8-bit library */
   5637           {
   5638           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
   5639           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
   5640           }
   5641         else       /* Can only happen for the 16-bit & 32-bit libraries */
   5642           {
   5643 #if PCRE2_CODE_UNIT_WIDTH == 16
   5644           *class_uchardata++ = 0x100;
   5645           *class_uchardata++ = 0xffffu;
   5646 #elif PCRE2_CODE_UNIT_WIDTH == 32
   5647           *class_uchardata++ = 0x100;
   5648           *class_uchardata++ = 0xffffffffu;
   5649 #endif
   5650           }
   5651         }
   5652       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
   5653       *code++ = OP_XCLASS;
   5654       code += LINK_SIZE;
   5655       *code = negate_class? XCL_NOT:0;
   5656       if (xclass_has_prop) *code |= XCL_HASPROP;
   5657 
   5658       /* If the map is required, move up the extra data to make room for it;
   5659       otherwise just move the code pointer to the end of the extra data. */
   5660 
   5661       if (class_has_8bitchar > 0)
   5662         {
   5663         *code++ |= XCL_MAP;
   5664         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
   5665           CU2BYTES(class_uchardata - code));
   5666         if (negate_class && !xclass_has_prop)
   5667           for (i = 0; i < 32; i++) classbits[i] = ~classbits[i];
   5668         memcpy(code, classbits, 32);
   5669         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
   5670         }
   5671       else code = class_uchardata;
   5672 
   5673       /* Now fill in the complete length of the item */
   5674 
   5675       PUT(previous, 1, (int)(code - previous));
   5676       break;   /* End of class handling */
   5677       }
   5678 #endif  /* SUPPORT_WIDE_CHARS */
   5679 
   5680     /* If there are no characters > 255, or they are all to be included or
   5681     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
   5682     whole class was negated and whether there were negative specials such as \S
   5683     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
   5684     negating it if necessary. */
   5685 
   5686     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
   5687     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
   5688       {
   5689       if (negate_class)
   5690         for (i = 0; i < 32; i++) classbits[i] = ~classbits[i];
   5691       memcpy(code, classbits, 32);
   5692       }
   5693     code += 32 / sizeof(PCRE2_UCHAR);
   5694     break;  /* End of class processing */
   5695 
   5696 
   5697     /* ===================================================================*/
   5698     /* Deal with (*VERB)s. */
   5699 
   5700     /* Check for open captures before ACCEPT and close those that are within
   5701     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
   5702     assertion. In the first pass, just accumulate the length required;
   5703     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
   5704     workspace overflow. Do not set firstcu after *ACCEPT. */
   5705 
   5706     case META_ACCEPT:
   5707     cb->had_accept = TRUE;
   5708     for (oc = cb->open_caps;
   5709          oc != NULL && oc->assert_depth >= cb->assert_depth;
   5710          oc = oc->next)
   5711       {
   5712       if (lengthptr != NULL)
   5713         {
   5714         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
   5715         }
   5716       else
   5717         {
   5718         *code++ = OP_CLOSE;
   5719         PUT2INC(code, 0, oc->number);
   5720         }
   5721       }
   5722     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
   5723     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   5724     break;
   5725 
   5726     case META_PRUNE:
   5727     case META_SKIP:
   5728     cb->had_pruneorskip = TRUE;
   5729     /* Fall through */
   5730     case META_COMMIT:
   5731     case META_FAIL:
   5732     *code++ = verbops[(meta - META_MARK) >> 16];
   5733     break;
   5734 
   5735     case META_THEN:
   5736     cb->external_flags |= PCRE2_HASTHEN;
   5737     *code++ = OP_THEN;
   5738     break;
   5739 
   5740     /* Handle verbs with arguments. Arguments can be very long, especially in
   5741     16- and 32-bit modes, and can overflow the workspace in the first pass.
   5742     However, the argument length is constrained to be small enough to fit in
   5743     one code unit. This check happens in parse_regex(). In the first pass,
   5744     instead of putting the argument into memory, we just update the length
   5745     counter and set up an empty argument. */
   5746 
   5747     case META_THEN_ARG:
   5748     cb->external_flags |= PCRE2_HASTHEN;
   5749     goto VERB_ARG;
   5750 
   5751     case META_PRUNE_ARG:
   5752     case META_SKIP_ARG:
   5753     cb->had_pruneorskip = TRUE;
   5754     /* Fall through */
   5755     case META_MARK:
   5756     case META_COMMIT_ARG:
   5757     VERB_ARG:
   5758     *code++ = verbops[(meta - META_MARK) >> 16];
   5759     /* The length is in characters. */
   5760     verbarglen = *(++pptr);
   5761     verbculen = 0;
   5762     tempcode = code++;
   5763     for (i = 0; i < (int)verbarglen; i++)
   5764       {
   5765       meta = *(++pptr);
   5766 #ifdef SUPPORT_UNICODE
   5767       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
   5768 #endif
   5769         {
   5770         mclength = 1;
   5771         mcbuffer[0] = meta;
   5772         }
   5773       if (lengthptr != NULL) *lengthptr += mclength; else
   5774         {
   5775         memcpy(code, mcbuffer, CU2BYTES(mclength));
   5776         code += mclength;
   5777         verbculen += mclength;
   5778         }
   5779       }
   5780 
   5781     *tempcode = verbculen;   /* Fill in the code unit length */
   5782     *code++ = 0;             /* Terminating zero */
   5783     break;
   5784 
   5785 
   5786     /* ===================================================================*/
   5787     /* Handle options change. The new setting must be passed back for use in
   5788     subsequent branches. Reset the greedy defaults and the case value for
   5789     firstcu and reqcu. */
   5790 
   5791     case META_OPTIONS:
   5792     *optionsptr = options = *(++pptr);
   5793     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
   5794     greedy_non_default = greedy_default ^ 1;
   5795     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
   5796     break;
   5797 
   5798 
   5799     /* ===================================================================*/
   5800     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
   5801     because it could be a numerical check on recursion, or a name check on a
   5802     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
   5803     we can handle it either way. We first try for a name; if not found, process
   5804     the number. */
   5805 
   5806     case META_COND_RNUMBER:   /* (?(Rdigits) */
   5807     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
   5808     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
   5809     bravalue = OP_COND;
   5810       {
   5811       int count, index;
   5812       PCRE2_SPTR name;
   5813       named_group *ng = cb->named_groups;
   5814       uint32_t length = *(++pptr);
   5815 
   5816       GETPLUSOFFSET(offset, pptr);
   5817       name = cb->start_pattern + offset;
   5818 
   5819       /* In the first pass, the names generated in the pre-pass are available,
   5820       but the main name table has not yet been created. Scan the list of names
   5821       generated in the pre-pass in order to get a number and whether or not
   5822       this name is duplicated. If it is not duplicated, we can handle it as a
   5823       numerical group. */
   5824 
   5825       for (i = 0; i < cb->names_found; i++, ng++)
   5826         {
   5827         if (length == ng->length &&
   5828             PRIV(strncmp)(name, ng->name, length) == 0)
   5829           {
   5830           if (!ng->isdup)
   5831             {
   5832             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
   5833             PUT2(code, 2+LINK_SIZE, ng->number);
   5834             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
   5835             skipunits = 1+IMM2_SIZE;
   5836             goto GROUP_PROCESS_NOTE_EMPTY;
   5837             }
   5838           break;  /* Found a duplicated name */
   5839           }
   5840         }
   5841 
   5842       /* If the name was not found we have a bad reference, unless we are
   5843       dealing with R<digits>, which is treated as a recursion test by number.
   5844       */
   5845 
   5846       if (i >= cb->names_found)
   5847         {
   5848         groupnumber = 0;
   5849         if (meta == META_COND_RNUMBER)
   5850           {
   5851           for (i = 1; i < (int)length; i++)
   5852             {
   5853             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
   5854             if (groupnumber > MAX_GROUP_NUMBER)
   5855               {
   5856               *errorcodeptr = ERR61;
   5857               cb->erroroffset = offset + i;
   5858               return 0;
   5859               }
   5860             }
   5861           }
   5862 
   5863         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
   5864           {
   5865           *errorcodeptr = ERR15;
   5866           cb->erroroffset = offset;
   5867           return 0;
   5868           }
   5869 
   5870         /* (?Rdigits) treated as a recursion reference by number. A value of
   5871         zero (which is the result of both (?R) and (?R0)) means "any", and is
   5872         translated into RREF_ANY (which is 0xffff). */
   5873 
   5874         if (groupnumber == 0) groupnumber = RREF_ANY;
   5875         code[1+LINK_SIZE] = OP_RREF;
   5876         PUT2(code, 2+LINK_SIZE, groupnumber);
   5877         skipunits = 1+IMM2_SIZE;
   5878         goto GROUP_PROCESS_NOTE_EMPTY;
   5879         }
   5880 
   5881       /* A duplicated name was found. Note that if an R<digits> name is found
   5882       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
   5883 
   5884       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
   5885 
   5886       /* We have a duplicated name. In the compile pass we have to search the
   5887       main table in order to get the index and count values. */
   5888 
   5889       count = 0;  /* Values for first pass (avoids compiler warning) */
   5890       index = 0;
   5891       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
   5892             &count, errorcodeptr, cb)) return 0;
   5893 
   5894       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
   5895       insert appropriate data values. */
   5896 
   5897       code[1+LINK_SIZE]++;
   5898       skipunits = 1+2*IMM2_SIZE;
   5899       PUT2(code, 2+LINK_SIZE, index);
   5900       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
   5901       }
   5902     goto GROUP_PROCESS_NOTE_EMPTY;
   5903 
   5904     /* The DEFINE condition is always false. Its internal groups may never
   5905     be called, so matched_char must remain false, hence the jump to
   5906     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
   5907 
   5908     case META_COND_DEFINE:
   5909     bravalue = OP_COND;
   5910     GETPLUSOFFSET(offset, pptr);
   5911     code[1+LINK_SIZE] = OP_DEFINE;
   5912     skipunits = 1;
   5913     goto GROUP_PROCESS;
   5914 
   5915     /* Conditional test of a group's being set. */
   5916 
   5917     case META_COND_NUMBER:
   5918     bravalue = OP_COND;
   5919     GETPLUSOFFSET(offset, pptr);
   5920     groupnumber = *(++pptr);
   5921     if (groupnumber > cb->bracount)
   5922       {
   5923       *errorcodeptr = ERR15;
   5924       cb->erroroffset = offset;
   5925       return 0;
   5926       }
   5927     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
   5928     offset -= 2;   /* Point at initial ( for too many branches error */
   5929     code[1+LINK_SIZE] = OP_CREF;
   5930     skipunits = 1+IMM2_SIZE;
   5931     PUT2(code, 2+LINK_SIZE, groupnumber);
   5932     goto GROUP_PROCESS_NOTE_EMPTY;
   5933 
   5934     /* Test for the PCRE2 version. */
   5935 
   5936     case META_COND_VERSION:
   5937     bravalue = OP_COND;
   5938     if (pptr[1] > 0)
   5939       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
   5940         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
   5941           OP_TRUE : OP_FALSE;
   5942     else
   5943       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
   5944         OP_TRUE : OP_FALSE;
   5945     skipunits = 1;
   5946     pptr += 3;
   5947     goto GROUP_PROCESS_NOTE_EMPTY;
   5948 
   5949     /* The condition is an assertion, possibly preceded by a callout. */
   5950 
   5951     case META_COND_ASSERT:
   5952     bravalue = OP_COND;
   5953     goto GROUP_PROCESS_NOTE_EMPTY;
   5954 
   5955 
   5956     /* ===================================================================*/
   5957     /* Handle all kinds of nested bracketed groups. The non-capturing,
   5958     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
   5959 
   5960     case META_LOOKAHEAD:
   5961     bravalue = OP_ASSERT;
   5962     cb->assert_depth += 1;
   5963     goto GROUP_PROCESS;
   5964 
   5965     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
   5966     thing to do, but Perl allows all assertions to be quantified, and when
   5967     they contain capturing parentheses there may be a potential use for
   5968     this feature. Not that that applies to a quantified (?!) but we allow
   5969     it for uniformity. */
   5970 
   5971     case META_LOOKAHEADNOT:
   5972     if (pptr[1] == META_KET &&
   5973          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
   5974       {
   5975       *code++ = OP_FAIL;
   5976       pptr++;
   5977       }
   5978     else
   5979       {
   5980       bravalue = OP_ASSERT_NOT;
   5981       cb->assert_depth += 1;
   5982       goto GROUP_PROCESS;
   5983       }
   5984     break;
   5985 
   5986     case META_LOOKBEHIND:
   5987     bravalue = OP_ASSERTBACK;
   5988     cb->assert_depth += 1;
   5989     goto GROUP_PROCESS;
   5990 
   5991     case META_LOOKBEHINDNOT:
   5992     bravalue = OP_ASSERTBACK_NOT;
   5993     cb->assert_depth += 1;
   5994     goto GROUP_PROCESS;
   5995 
   5996     case META_ATOMIC:
   5997     bravalue = OP_ONCE;
   5998     goto GROUP_PROCESS_NOTE_EMPTY;
   5999 
   6000     case META_NOCAPTURE:
   6001     bravalue = OP_BRA;
   6002     /* Fall through */
   6003 
   6004     /* Process nested bracketed regex. The nesting depth is maintained for the
   6005     benefit of the stackguard function. The test for too deep nesting is now
   6006     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
   6007     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
   6008     note of whether or not they may match an empty string. */
   6009 
   6010     GROUP_PROCESS_NOTE_EMPTY:
   6011     note_group_empty = TRUE;
   6012 
   6013     GROUP_PROCESS:
   6014     cb->parens_depth += 1;
   6015     *code = bravalue;
   6016     pptr++;
   6017     tempcode = code;
   6018     tempreqvary = cb->req_varyopt;        /* Save value before group */
   6019     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
   6020 
   6021     if ((group_return =
   6022          compile_regex(
   6023          options,                         /* The option state */
   6024          &tempcode,                       /* Where to put code (updated) */
   6025          &pptr,                           /* Input pointer (updated) */
   6026          errorcodeptr,                    /* Where to put an error message */
   6027          skipunits,                       /* Skip over bracket number */
   6028          &subfirstcu,                     /* For possible first char */
   6029          &subfirstcuflags,
   6030          &subreqcu,                       /* For possible last char */
   6031          &subreqcuflags,
   6032          bcptr,                           /* Current branch chain */
   6033          cb,                              /* Compile data block */
   6034          (lengthptr == NULL)? NULL :      /* Actual compile phase */
   6035            &length_prevgroup              /* Pre-compile phase */
   6036          )) == 0)
   6037       return 0;  /* Error */
   6038 
   6039     cb->parens_depth -= 1;
   6040 
   6041     /* If that was a non-conditional significant group (not an assertion, not a
   6042     DEFINE) that matches at least one character, then the current item matches
   6043     a character. Conditionals are handled below. */
   6044 
   6045     if (note_group_empty && bravalue != OP_COND && group_return > 0)
   6046       matched_char = TRUE;
   6047 
   6048     /* If we've just compiled an assertion, pop the assert depth. */
   6049 
   6050     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
   6051       cb->assert_depth -= 1;
   6052 
   6053     /* At the end of compiling, code is still pointing to the start of the
   6054     group, while tempcode has been updated to point past the end of the group.
   6055     The parsed pattern pointer (pptr) is on the closing META_KET.
   6056 
   6057     If this is a conditional bracket, check that there are no more than
   6058     two branches in the group, or just one if it's a DEFINE group. We do this
   6059     in the real compile phase, not in the pre-pass, where the whole group may
   6060     not be available. */
   6061 
   6062     if (bravalue == OP_COND && lengthptr == NULL)
   6063       {
   6064       PCRE2_UCHAR *tc = code;
   6065       int condcount = 0;
   6066 
   6067       do {
   6068          condcount++;
   6069          tc += GET(tc,1);
   6070          }
   6071       while (*tc != OP_KET);
   6072 
   6073       /* A DEFINE group is never obeyed inline (the "condition" is always
   6074       false). It must have only one branch. Having checked this, change the
   6075       opcode to OP_FALSE. */
   6076 
   6077       if (code[LINK_SIZE+1] == OP_DEFINE)
   6078         {
   6079         if (condcount > 1)
   6080           {
   6081           cb->erroroffset = offset;
   6082           *errorcodeptr = ERR54;
   6083           return 0;
   6084           }
   6085         code[LINK_SIZE+1] = OP_FALSE;
   6086         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
   6087         }
   6088 
   6089       /* A "normal" conditional group. If there is just one branch, we must not
   6090       make use of its firstcu or reqcu, because this is equivalent to an
   6091       empty second branch. Also, it may match an empty string. If there are two
   6092       branches, this item must match a character if the group must. */
   6093 
   6094       else
   6095         {
   6096         if (condcount > 2)
   6097           {
   6098           cb->erroroffset = offset;
   6099           *errorcodeptr = ERR27;
   6100           return 0;
   6101           }
   6102         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
   6103           else if (group_return > 0) matched_char = TRUE;
   6104         }
   6105       }
   6106 
   6107     /* In the pre-compile phase, update the length by the length of the group,
   6108     less the brackets at either end. Then reduce the compiled code to just a
   6109     set of non-capturing brackets so that it doesn't use much memory if it is
   6110     duplicated by a quantifier.*/
   6111 
   6112     if (lengthptr != NULL)
   6113       {
   6114       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
   6115         {
   6116         *errorcodeptr = ERR20;
   6117         return 0;
   6118         }
   6119       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
   6120       code++;   /* This already contains bravalue */
   6121       PUTINC(code, 0, 1 + LINK_SIZE);
   6122       *code++ = OP_KET;
   6123       PUTINC(code, 0, 1 + LINK_SIZE);
   6124       break;    /* No need to waste time with special character handling */
   6125       }
   6126 
   6127     /* Otherwise update the main code pointer to the end of the group. */
   6128 
   6129     code = tempcode;
   6130 
   6131     /* For a DEFINE group, required and first character settings are not
   6132     relevant. */
   6133 
   6134     if (bravalue == OP_DEFINE) break;
   6135 
   6136     /* Handle updating of the required and first code units for other types of
   6137     group. Update for normal brackets of all kinds, and conditions with two
   6138     branches (see code above). If the bracket is followed by a quantifier with
   6139     zero repeat, we have to back off. Hence the definition of zeroreqcu and
   6140     zerofirstcu outside the main loop so that they can be accessed for the back
   6141     off. */
   6142 
   6143     zeroreqcu = reqcu;
   6144     zeroreqcuflags = reqcuflags;
   6145     zerofirstcu = firstcu;
   6146     zerofirstcuflags = firstcuflags;
   6147     groupsetfirstcu = FALSE;
   6148 
   6149     if (bravalue >= OP_ONCE)  /* Not an assertion */
   6150       {
   6151       /* If we have not yet set a firstcu in this branch, take it from the
   6152       subpattern, remembering that it was set here so that a repeat of more
   6153       than one can replicate it as reqcu if necessary. If the subpattern has
   6154       no firstcu, set "none" for the whole branch. In both cases, a zero
   6155       repeat forces firstcu to "none". */
   6156 
   6157       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
   6158         {
   6159         if (subfirstcuflags >= 0)
   6160           {
   6161           firstcu = subfirstcu;
   6162           firstcuflags = subfirstcuflags;
   6163           groupsetfirstcu = TRUE;
   6164           }
   6165         else firstcuflags = REQ_NONE;
   6166         zerofirstcuflags = REQ_NONE;
   6167         }
   6168 
   6169       /* If firstcu was previously set, convert the subpattern's firstcu
   6170       into reqcu if there wasn't one, using the vary flag that was in
   6171       existence beforehand. */
   6172 
   6173       else if (subfirstcuflags >= 0 && subreqcuflags < 0)
   6174         {
   6175         subreqcu = subfirstcu;
   6176         subreqcuflags = subfirstcuflags | tempreqvary;
   6177         }
   6178 
   6179       /* If the subpattern set a required code unit (or set a first code unit
   6180       that isn't really the first code unit - see above), set it. */
   6181 
   6182       if (subreqcuflags >= 0)
   6183         {
   6184         reqcu = subreqcu;
   6185         reqcuflags = subreqcuflags;
   6186         }
   6187       }
   6188 
   6189     /* For a forward assertion, we take the reqcu, if set, provided that the
   6190     group has also set a firstcu. This can be helpful if the pattern that
   6191     follows the assertion doesn't set a different char. For example, it's
   6192     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
   6193     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
   6194     the "real" "a" would then become a reqcu instead of a firstcu. This is
   6195     overcome by a scan at the end if there's no firstcu, looking for an
   6196     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
   6197     we must only take the reqcu when the group also set a firstcu. Otherwise,
   6198     in that example, 'X' ends up set for both. */
   6199 
   6200     else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
   6201              subfirstcuflags >= 0)
   6202       {
   6203       reqcu = subreqcu;
   6204       reqcuflags = subreqcuflags;
   6205       }
   6206 
   6207     break;  /* End of nested group handling */
   6208 
   6209 
   6210     /* ===================================================================*/
   6211     /* Handle named backreferences and recursions. */
   6212 
   6213     case META_BACKREF_BYNAME:
   6214     case META_RECURSE_BYNAME:
   6215       {
   6216       int count, index;
   6217       PCRE2_SPTR name;
   6218       BOOL is_dupname = FALSE;
   6219       named_group *ng = cb->named_groups;
   6220       uint32_t length = *(++pptr);
   6221 
   6222       GETPLUSOFFSET(offset, pptr);
   6223       name = cb->start_pattern + offset;
   6224 
   6225       /* In the first pass, the names generated in the pre-pass are available,
   6226       but the main name table has not yet been created. Scan the list of names
   6227       generated in the pre-pass in order to get a number and whether or not
   6228       this name is duplicated. */
   6229 
   6230       groupnumber = 0;
   6231       for (i = 0; i < cb->names_found; i++, ng++)
   6232         {
   6233         if (length == ng->length &&
   6234             PRIV(strncmp)(name, ng->name, length) == 0)
   6235           {
   6236           is_dupname = ng->isdup;
   6237           groupnumber = ng->number;
   6238 
   6239           /* For a recursion, that's all that is needed. We can now go to
   6240           the code above that handles numerical recursion, applying it to
   6241           the first group with the given name. */
   6242 
   6243           if (meta == META_RECURSE_BYNAME)
   6244             {
   6245             meta_arg = groupnumber;
   6246             goto HANDLE_NUMERICAL_RECURSION;
   6247             }
   6248 
   6249           /* For a back reference, update the back reference map and the
   6250           maximum back reference. Then, for each group, we must check to
   6251           see if it is recursive, that is, it is inside the group that it
   6252           references. A flag is set so that the group can be made atomic.
   6253           */
   6254 
   6255           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
   6256           if (groupnumber > cb->top_backref)
   6257             cb->top_backref = groupnumber;
   6258 
   6259           for (oc = cb->open_caps; oc != NULL; oc = oc->next)
   6260             {
   6261             if (oc->number == groupnumber)
   6262               {
   6263               oc->flag = TRUE;
   6264               break;
   6265               }
   6266             }
   6267           }
   6268         }
   6269 
   6270       /* If the name was not found we have a bad reference. */
   6271 
   6272       if (groupnumber == 0)
   6273         {
   6274         *errorcodeptr = ERR15;
   6275         cb->erroroffset = offset;
   6276         return 0;
   6277         }
   6278 
   6279       /* If a back reference name is not duplicated, we can handle it as
   6280       a numerical reference. */
   6281 
   6282       if (!is_dupname)
   6283         {
   6284         meta_arg = groupnumber;
   6285         goto HANDLE_SINGLE_REFERENCE;
   6286         }
   6287 
   6288       /* If a back reference name is duplicated, we generate a different
   6289       opcode to a numerical back reference. In the second pass we must
   6290       search for the index and count in the final name table. */
   6291 
   6292       count = 0;  /* Values for first pass (avoids compiler warning) */
   6293       index = 0;
   6294       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
   6295             &count, errorcodeptr, cb)) return 0;
   6296 
   6297       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   6298       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
   6299       PUT2INC(code, 0, index);
   6300       PUT2INC(code, 0, count);
   6301       }
   6302     break;
   6303 
   6304 
   6305     /* ===================================================================*/
   6306     /* Handle a numerical callout. */
   6307 
   6308     case META_CALLOUT_NUMBER:
   6309     code[0] = OP_CALLOUT;
   6310     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
   6311     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
   6312     code[1 + 2*LINK_SIZE] = pptr[3];
   6313     pptr += 3;
   6314     code += PRIV(OP_lengths)[OP_CALLOUT];
   6315     break;
   6316 
   6317 
   6318     /* ===================================================================*/
   6319     /* Handle a callout with a string argument. In the pre-pass we just compute
   6320     the length without generating anything. The length in pptr[3] includes both
   6321     delimiters; in the actual compile only the first one is copied, but a
   6322     terminating zero is added. Any doubled delimiters within the string make
   6323     this an overestimate, but it is not worth bothering about. */
   6324 
   6325     case META_CALLOUT_STRING:
   6326     if (lengthptr != NULL)
   6327       {
   6328       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
   6329       pptr += 3;
   6330       SKIPOFFSET(pptr);
   6331       }
   6332 
   6333     /* In the real compile we can copy the string. The starting delimiter is
   6334      included so that the client can discover it if they want. We also pass the
   6335      start offset to help a script language give better error messages. */
   6336 
   6337     else
   6338       {
   6339       PCRE2_SPTR pp;
   6340       uint32_t delimiter;
   6341       uint32_t length = pptr[3];
   6342       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
   6343 
   6344       code[0] = OP_CALLOUT_STR;
   6345       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
   6346       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
   6347 
   6348       pptr += 3;
   6349       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
   6350       pp = cb->start_pattern + offset;
   6351       delimiter = *callout_string++ = *pp++;
   6352       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
   6353         delimiter = CHAR_RIGHT_CURLY_BRACKET;
   6354       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
   6355 
   6356       /* The syntax of the pattern was checked in the parsing scan. The length
   6357       includes both delimiters, but we have passed the opening one just above,
   6358       so we reduce length before testing it. The test is for > 1 because we do
   6359       not want to copy the final delimiter. This also ensures that pp[1] is
   6360       accessible. */
   6361 
   6362       while (--length > 1)
   6363         {
   6364         if (*pp == delimiter && pp[1] == delimiter)
   6365           {
   6366           *callout_string++ = delimiter;
   6367           pp += 2;
   6368           length--;
   6369           }
   6370         else *callout_string++ = *pp++;
   6371         }
   6372       *callout_string++ = CHAR_NUL;
   6373 
   6374       /* Set the length of the entire item, then advance to its end. */
   6375 
   6376       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
   6377       code = callout_string;
   6378       }
   6379     break;
   6380 
   6381 
   6382     /* ===================================================================*/
   6383     /* Handle repetition. The different types are all sorted out in the parsing
   6384     pass. */
   6385 
   6386     case META_MINMAX_PLUS:
   6387     case META_MINMAX_QUERY:
   6388     case META_MINMAX:
   6389     repeat_min = *(++pptr);
   6390     repeat_max = *(++pptr);
   6391     goto REPEAT;
   6392 
   6393     case META_ASTERISK:
   6394     case META_ASTERISK_PLUS:
   6395     case META_ASTERISK_QUERY:
   6396     repeat_min = 0;
   6397     repeat_max = REPEAT_UNLIMITED;
   6398     goto REPEAT;
   6399 
   6400     case META_PLUS:
   6401     case META_PLUS_PLUS:
   6402     case META_PLUS_QUERY:
   6403     repeat_min = 1;
   6404     repeat_max = REPEAT_UNLIMITED;
   6405     goto REPEAT;
   6406 
   6407     case META_QUERY:
   6408     case META_QUERY_PLUS:
   6409     case META_QUERY_QUERY:
   6410     repeat_min = 0;
   6411     repeat_max = 1;
   6412 
   6413     REPEAT:
   6414     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
   6415 
   6416     /* Remember whether this is a variable length repeat, and default to
   6417     single-char opcodes. */
   6418 
   6419     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
   6420     op_type = 0;
   6421 
   6422     /* If the repeat is {1} we can ignore it. */
   6423 
   6424     if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
   6425 
   6426     /* Adjust first and required code units for a zero repeat. */
   6427 
   6428     if (repeat_min == 0)
   6429       {
   6430       firstcu = zerofirstcu;
   6431       firstcuflags = zerofirstcuflags;
   6432       reqcu = zeroreqcu;
   6433       reqcuflags = zeroreqcuflags;
   6434       }
   6435 
   6436     /* Note the greediness and possessiveness. */
   6437 
   6438     switch (meta)
   6439       {
   6440       case META_MINMAX_PLUS:
   6441       case META_ASTERISK_PLUS:
   6442       case META_PLUS_PLUS:
   6443       case META_QUERY_PLUS:
   6444       repeat_type = 0;                  /* Force greedy */
   6445       possessive_quantifier = TRUE;
   6446       break;
   6447 
   6448       case META_MINMAX_QUERY:
   6449       case META_ASTERISK_QUERY:
   6450       case META_PLUS_QUERY:
   6451       case META_QUERY_QUERY:
   6452       repeat_type = greedy_non_default;
   6453       possessive_quantifier = FALSE;
   6454       break;
   6455 
   6456       default:
   6457       repeat_type = greedy_default;
   6458       possessive_quantifier = FALSE;
   6459       break;
   6460       }
   6461 
   6462     /* Save start of previous item, in case we have to move it up in order to
   6463     insert something before it, and remember what it was. */
   6464 
   6465     tempcode = previous;
   6466     op_previous = *previous;
   6467 
   6468     /* Now handle repetition for the different types of item. */
   6469 
   6470     switch (op_previous)
   6471       {
   6472       /* If previous was a character or negated character match, abolish the
   6473       item and generate a repeat item instead. If a char item has a minimum of
   6474       more than one, ensure that it is set in reqcu - it might not be if a
   6475       sequence such as x{3} is the first thing in a branch because the x will
   6476       have gone into firstcu instead.  */
   6477 
   6478       case OP_CHAR:
   6479       case OP_CHARI:
   6480       case OP_NOT:
   6481       case OP_NOTI:
   6482       op_type = chartypeoffset[op_previous - OP_CHAR];
   6483 
   6484       /* Deal with UTF characters that take up more than one code unit. */
   6485 
   6486 #ifdef MAYBE_UTF_MULTI
   6487       if (utf && NOT_FIRSTCU(code[-1]))
   6488         {
   6489         PCRE2_UCHAR *lastchar = code - 1;
   6490         BACKCHAR(lastchar);
   6491         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
   6492         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
   6493         }
   6494       else
   6495 #endif  /* MAYBE_UTF_MULTI */
   6496 
   6497       /* Handle the case of a single code unit - either with no UTF support, or
   6498       with UTF disabled, or for a single-code-unit UTF character. */
   6499         {
   6500         mcbuffer[0] = code[-1];
   6501         mclength = 1;
   6502         if (op_previous <= OP_CHARI && repeat_min > 1)
   6503           {
   6504           reqcu = mcbuffer[0];
   6505           reqcuflags = req_caseopt | cb->req_varyopt;
   6506           }
   6507         }
   6508       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
   6509 
   6510       /* If previous was a character class or a back reference, we put the
   6511       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
   6512 
   6513 #ifdef SUPPORT_WIDE_CHARS
   6514       case OP_XCLASS:
   6515 #endif
   6516       case OP_CLASS:
   6517       case OP_NCLASS:
   6518       case OP_REF:
   6519       case OP_REFI:
   6520       case OP_DNREF:
   6521       case OP_DNREFI:
   6522 
   6523       if (repeat_max == 0)
   6524         {
   6525         code = previous;
   6526         goto END_REPEAT;
   6527         }
   6528 
   6529       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
   6530         *code++ = OP_CRSTAR + repeat_type;
   6531       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
   6532         *code++ = OP_CRPLUS + repeat_type;
   6533       else if (repeat_min == 0 && repeat_max == 1)
   6534         *code++ = OP_CRQUERY + repeat_type;
   6535       else
   6536         {
   6537         *code++ = OP_CRRANGE + repeat_type;
   6538         PUT2INC(code, 0, repeat_min);
   6539         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
   6540         PUT2INC(code, 0, repeat_max);
   6541         }
   6542       break;
   6543 
   6544       /* If previous is OP_FAIL, it was generated by an empty class []
   6545       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
   6546       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
   6547       time. We can just ignore this repeat. */
   6548 
   6549       case OP_FAIL:
   6550       goto END_REPEAT;
   6551 
   6552       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
   6553       because pcre2_match() could not handle backtracking into recursively
   6554       called groups. Now that this backtracking is available, we no longer need
   6555       to do this. However, we still need to replicate recursions as we do for
   6556       groups so as to have independent backtracking points. We can replicate
   6557       for the minimum number of repeats directly. For optional repeats we now
   6558       wrap the recursion in OP_BRA brackets and make use of the bracket
   6559       repetition. */
   6560 
   6561       case OP_RECURSE:
   6562 
   6563       /* Generate unwrapped repeats for a non-zero minimum, except when the
   6564       minimum is 1 and the maximum unlimited, because that can be handled with
   6565       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
   6566       minimum, we just need to generate the appropriate additional copies.
   6567       Otherwise we need to generate one more, to simulate the situation when
   6568       the minimum is zero. */
   6569 
   6570       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
   6571         {
   6572         int replicate = repeat_min;
   6573         if (repeat_min == repeat_max) replicate--;
   6574 
   6575         /* In the pre-compile phase, we don't actually do the replication. We
   6576         just adjust the length as if we had. Do some paranoid checks for
   6577         potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
   6578         integer type when available, otherwise double. */
   6579 
   6580         if (lengthptr != NULL)
   6581           {
   6582           PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
   6583           if ((INT64_OR_DOUBLE)replicate*
   6584                 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
   6585                   (INT64_OR_DOUBLE)INT_MAX ||
   6586               OFLOW_MAX - *lengthptr < delta)
   6587             {
   6588             *errorcodeptr = ERR20;
   6589             return 0;
   6590             }
   6591           *lengthptr += delta;
   6592           }
   6593 
   6594         else for (i = 0; i < replicate; i++)
   6595           {
   6596           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
   6597           previous = code;
   6598           code += 1 + LINK_SIZE;
   6599           }
   6600 
   6601         /* If the number of repeats is fixed, we are done. Otherwise, adjust
   6602         the counts and fall through. */
   6603 
   6604         if (repeat_min == repeat_max) break;
   6605         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
   6606         repeat_min = 0;
   6607         }
   6608 
   6609       /* Wrap the recursion call in OP_BRA brackets. */
   6610 
   6611       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
   6612       op_previous = *previous = OP_BRA;
   6613       PUT(previous, 1, 2 + 2*LINK_SIZE);
   6614       previous[2 + 2*LINK_SIZE] = OP_KET;
   6615       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
   6616       code += 2 + 2 * LINK_SIZE;
   6617       length_prevgroup = 3 + 3*LINK_SIZE;
   6618       group_return = -1;  /* Set "may match empty string" */
   6619 
   6620       /* Now treat as a repeated OP_BRA. */
   6621       /* Fall through */
   6622 
   6623       /* If previous was a bracket group, we may have to replicate it in
   6624       certain cases. Note that at this point we can encounter only the "basic"
   6625       bracket opcodes such as BRA and CBRA, as this is the place where they get
   6626       converted into the more special varieties such as BRAPOS and SBRA.
   6627       Originally, PCRE did not allow repetition of assertions, but now it does,
   6628       for Perl compatibility. */
   6629 
   6630       case OP_ASSERT:
   6631       case OP_ASSERT_NOT:
   6632       case OP_ASSERTBACK:
   6633       case OP_ASSERTBACK_NOT:
   6634       case OP_ONCE:
   6635       case OP_BRA:
   6636       case OP_CBRA:
   6637       case OP_COND:
   6638         {
   6639         int len = (int)(code - previous);
   6640         PCRE2_UCHAR *bralink = NULL;
   6641         PCRE2_UCHAR *brazeroptr = NULL;
   6642 
   6643         /* Repeating a DEFINE group (or any group where the condition is always
   6644         FALSE and there is only one branch) is pointless, but Perl allows the
   6645         syntax, so we just ignore the repeat. */
   6646 
   6647         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
   6648             previous[GET(previous, 1)] != OP_ALT)
   6649           goto END_REPEAT;
   6650 
   6651         /* There is no sense in actually repeating assertions. The only
   6652         potential use of repetition is in cases when the assertion is optional.
   6653         Therefore, if the minimum is greater than zero, just ignore the repeat.
   6654         If the maximum is not zero or one, set it to 1. */
   6655 
   6656         if (op_previous < OP_ONCE)    /* Assertion */
   6657           {
   6658           if (repeat_min > 0) goto END_REPEAT;
   6659           if (repeat_max > 1) repeat_max = 1;
   6660           }
   6661 
   6662         /* The case of a zero minimum is special because of the need to stick
   6663         OP_BRAZERO in front of it, and because the group appears once in the
   6664         data, whereas in other cases it appears the minimum number of times. For
   6665         this reason, it is simplest to treat this case separately, as otherwise
   6666         the code gets far too messy. There are several special subcases when the
   6667         minimum is zero. */
   6668 
   6669         if (repeat_min == 0)
   6670           {
   6671           /* If the maximum is also zero, we used to just omit the group from
   6672           the output altogether, like this:
   6673 
   6674           ** if (repeat_max == 0)
   6675           **   {
   6676           **   code = previous;
   6677           **   goto END_REPEAT;
   6678           **   }
   6679 
   6680           However, that fails when a group or a subgroup within it is
   6681           referenced as a subroutine from elsewhere in the pattern, so now we
   6682           stick in OP_SKIPZERO in front of it so that it is skipped on
   6683           execution. As we don't have a list of which groups are referenced, we
   6684           cannot do this selectively.
   6685 
   6686           If the maximum is 1 or unlimited, we just have to stick in the
   6687           BRAZERO and do no more at this point. */
   6688 
   6689           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
   6690             {
   6691             (void)memmove(previous + 1, previous, CU2BYTES(len));
   6692             code++;
   6693             if (repeat_max == 0)
   6694               {
   6695               *previous++ = OP_SKIPZERO;
   6696               goto END_REPEAT;
   6697               }
   6698             brazeroptr = previous;    /* Save for possessive optimizing */
   6699             *previous++ = OP_BRAZERO + repeat_type;
   6700             }
   6701 
   6702           /* If the maximum is greater than 1 and limited, we have to replicate
   6703           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
   6704           The first one has to be handled carefully because it's the original
   6705           copy, which has to be moved up. The remainder can be handled by code
   6706           that is common with the non-zero minimum case below. We have to
   6707           adjust the value of repeat_max, since one less copy is required. */
   6708 
   6709           else
   6710             {
   6711             int linkoffset;
   6712             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
   6713             code += 2 + LINK_SIZE;
   6714             *previous++ = OP_BRAZERO + repeat_type;
   6715             *previous++ = OP_BRA;
   6716 
   6717             /* We chain together the bracket link offset fields that have to be
   6718             filled in later when the ends of the brackets are reached. */
   6719 
   6720             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
   6721             bralink = previous;
   6722             PUTINC(previous, 0, linkoffset);
   6723             }
   6724 
   6725           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
   6726           }
   6727 
   6728         /* If the minimum is greater than zero, replicate the group as many
   6729         times as necessary, and adjust the maximum to the number of subsequent
   6730         copies that we need. */
   6731 
   6732         else
   6733           {
   6734           if (repeat_min > 1)
   6735             {
   6736             /* In the pre-compile phase, we don't actually do the replication.
   6737             We just adjust the length as if we had. Do some paranoid checks for
   6738             potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
   6739             integer type when available, otherwise double. */
   6740 
   6741             if (lengthptr != NULL)
   6742               {
   6743               PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
   6744               if ((INT64_OR_DOUBLE)(repeat_min - 1)*
   6745                     (INT64_OR_DOUBLE)length_prevgroup >
   6746                       (INT64_OR_DOUBLE)INT_MAX ||
   6747                   OFLOW_MAX - *lengthptr < delta)
   6748                 {
   6749                 *errorcodeptr = ERR20;
   6750                 return 0;
   6751                 }
   6752               *lengthptr += delta;
   6753               }
   6754 
   6755             /* This is compiling for real. If there is a set first code unit
   6756             for the group, and we have not yet set a "required code unit", set
   6757             it. */
   6758 
   6759             else
   6760               {
   6761               if (groupsetfirstcu && reqcuflags < 0)
   6762                 {
   6763                 reqcu = firstcu;
   6764                 reqcuflags = firstcuflags;
   6765                 }
   6766               for (i = 1; (uint32_t)i < repeat_min; i++)
   6767                 {
   6768                 memcpy(code, previous, CU2BYTES(len));
   6769                 code += len;
   6770                 }
   6771               }
   6772             }
   6773 
   6774           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
   6775           }
   6776 
   6777         /* This code is common to both the zero and non-zero minimum cases. If
   6778         the maximum is limited, it replicates the group in a nested fashion,
   6779         remembering the bracket starts on a stack. In the case of a zero
   6780         minimum, the first one was set up above. In all cases the repeat_max
   6781         now specifies the number of additional copies needed. Again, we must
   6782         remember to replicate entries on the forward reference list. */
   6783 
   6784         if (repeat_max != REPEAT_UNLIMITED)
   6785           {
   6786           /* In the pre-compile phase, we don't actually do the replication. We
   6787           just adjust the length as if we had. For each repetition we must add
   6788           1 to the length for BRAZERO and for all but the last repetition we
   6789           must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
   6790           paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
   6791           is a 64-bit integer type when available, otherwise double. */
   6792 
   6793           if (lengthptr != NULL && repeat_max > 0)
   6794             {
   6795             PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
   6796                         2 - 2*LINK_SIZE;   /* Last one doesn't nest */
   6797             if ((INT64_OR_DOUBLE)repeat_max *
   6798                   (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
   6799                     > (INT64_OR_DOUBLE)INT_MAX ||
   6800                 OFLOW_MAX - *lengthptr < delta)
   6801               {
   6802               *errorcodeptr = ERR20;
   6803               return 0;
   6804               }
   6805             *lengthptr += delta;
   6806             }
   6807 
   6808           /* This is compiling for real */
   6809 
   6810           else for (i = repeat_max - 1; i >= 0; i--)
   6811             {
   6812             *code++ = OP_BRAZERO + repeat_type;
   6813 
   6814             /* All but the final copy start a new nesting, maintaining the
   6815             chain of brackets outstanding. */
   6816 
   6817             if (i != 0)
   6818               {
   6819               int linkoffset;
   6820               *code++ = OP_BRA;
   6821               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
   6822               bralink = code;
   6823               PUTINC(code, 0, linkoffset);
   6824               }
   6825 
   6826             memcpy(code, previous, CU2BYTES(len));
   6827             code += len;
   6828             }
   6829 
   6830           /* Now chain through the pending brackets, and fill in their length
   6831           fields (which are holding the chain links pro tem). */
   6832 
   6833           while (bralink != NULL)
   6834             {
   6835             int oldlinkoffset;
   6836             int linkoffset = (int)(code - bralink + 1);
   6837             PCRE2_UCHAR *bra = code - linkoffset;
   6838             oldlinkoffset = GET(bra, 1);
   6839             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
   6840             *code++ = OP_KET;
   6841             PUTINC(code, 0, linkoffset);
   6842             PUT(bra, 1, linkoffset);
   6843             }
   6844           }
   6845 
   6846         /* If the maximum is unlimited, set a repeater in the final copy. For
   6847         ONCE brackets, that's all we need to do. However, possessively repeated
   6848         ONCE brackets can be converted into non-capturing brackets, as the
   6849         behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
   6850         deal with possessive ONCEs specially.
   6851 
   6852         Otherwise, when we are doing the actual compile phase, check to see
   6853         whether this group is one that could match an empty string. If so,
   6854         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
   6855         that runtime checking can be done. [This check is also applied to ONCE
   6856         groups at runtime, but in a different way.]
   6857 
   6858         Then, if the quantifier was possessive and the bracket is not a
   6859         conditional, we convert the BRA code to the POS form, and the KET code to
   6860         KETRPOS. (It turns out to be convenient at runtime to detect this kind of
   6861         subpattern at both the start and at the end.) The use of special opcodes
   6862         makes it possible to reduce greatly the stack usage in pcre2_match(). If
   6863         the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
   6864 
   6865         Then, if the minimum number of matches is 1 or 0, cancel the possessive
   6866         flag so that the default action below, of wrapping everything inside
   6867         atomic brackets, does not happen. When the minimum is greater than 1,
   6868         there will be earlier copies of the group, and so we still have to wrap
   6869         the whole thing. */
   6870 
   6871         else
   6872           {
   6873           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
   6874           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
   6875 
   6876           /* Convert possessive ONCE brackets to non-capturing */
   6877 
   6878           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
   6879 
   6880           /* For non-possessive ONCE brackets, all we need to do is to
   6881           set the KET. */
   6882 
   6883           if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type;
   6884 
   6885           /* Handle non-ONCE brackets and possessive ONCEs (which have been
   6886           converted to non-capturing above). */
   6887 
   6888           else
   6889             {
   6890             /* In the compile phase, adjust the opcode if the group can match
   6891             an empty string. For a conditional group with only one branch, the
   6892             value of group_return will not show "could be empty", so we must
   6893             check that separately. */
   6894 
   6895             if (lengthptr == NULL)
   6896               {
   6897               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
   6898               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
   6899                 *bracode = OP_SCOND;
   6900               }
   6901 
   6902             /* Handle possessive quantifiers. */
   6903 
   6904             if (possessive_quantifier)
   6905               {
   6906               /* For COND brackets, we wrap the whole thing in a possessively
   6907               repeated non-capturing bracket, because we have not invented POS
   6908               versions of the COND opcodes. */
   6909 
   6910               if (*bracode == OP_COND || *bracode == OP_SCOND)
   6911                 {
   6912                 int nlen = (int)(code - bracode);
   6913                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
   6914                 code += 1 + LINK_SIZE;
   6915                 nlen += 1 + LINK_SIZE;
   6916                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
   6917                 *code++ = OP_KETRPOS;
   6918                 PUTINC(code, 0, nlen);
   6919                 PUT(bracode, 1, nlen);
   6920                 }
   6921 
   6922               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
   6923 
   6924               else
   6925                 {
   6926                 *bracode += 1;              /* Switch to xxxPOS opcodes */
   6927                 *ketcode = OP_KETRPOS;
   6928                 }
   6929 
   6930               /* If the minimum is zero, mark it as possessive, then unset the
   6931               possessive flag when the minimum is 0 or 1. */
   6932 
   6933               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
   6934               if (repeat_min < 2) possessive_quantifier = FALSE;
   6935               }
   6936 
   6937             /* Non-possessive quantifier */
   6938 
   6939             else *ketcode = OP_KETRMAX + repeat_type;
   6940             }
   6941           }
   6942         }
   6943       break;
   6944 
   6945       /* If previous was a character type match (\d or similar), abolish it and
   6946       create a suitable repeat item. The code is shared with single-character
   6947       repeats by setting op_type to add a suitable offset into repeat_type.
   6948       Note that the Unicode property types will be present only when
   6949       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
   6950       here because it just makes it horribly messy. */
   6951 
   6952       default:
   6953       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
   6954         {
   6955         *errorcodeptr = ERR10;
   6956         return 0;
   6957         }
   6958       else
   6959         {
   6960         int prop_type, prop_value;
   6961         PCRE2_UCHAR *oldcode;
   6962 
   6963         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
   6964         mclength = 0;                         /* Not a character */
   6965 
   6966         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
   6967           {
   6968           prop_type = previous[1];
   6969           prop_value = previous[2];
   6970           }
   6971         else
   6972           {
   6973           /* Come here from just above with a character in mcbuffer/mclength. */
   6974           OUTPUT_SINGLE_REPEAT:
   6975           prop_type = prop_value = -1;
   6976           }
   6977 
   6978         /* At this point, if prop_type == prop_value == -1 we either have a
   6979         character in mcbuffer when mclength is greater than zero, or we have
   6980         mclength zero, in which case there is a non-property character type in
   6981         op_previous. If prop_type/value are not negative, we have a property
   6982         character type in op_previous. */
   6983 
   6984         oldcode = code;                   /* Save where we were */
   6985         code = previous;                  /* Usually overwrite previous item */
   6986 
   6987         /* If the maximum is zero then the minimum must also be zero; Perl allows
   6988         this case, so we do too - by simply omitting the item altogether. */
   6989 
   6990         if (repeat_max == 0) goto END_REPEAT;
   6991 
   6992         /* Combine the op_type with the repeat_type */
   6993 
   6994         repeat_type += op_type;
   6995 
   6996         /* A minimum of zero is handled either as the special case * or ?, or as
   6997         an UPTO, with the maximum given. */
   6998 
   6999         if (repeat_min == 0)
   7000           {
   7001           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
   7002             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
   7003           else
   7004             {
   7005             *code++ = OP_UPTO + repeat_type;
   7006             PUT2INC(code, 0, repeat_max);
   7007             }
   7008           }
   7009 
   7010         /* A repeat minimum of 1 is optimized into some special cases. If the
   7011         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
   7012         left in place and, if the maximum is greater than 1, we use OP_UPTO with
   7013         one less than the maximum. */
   7014 
   7015         else if (repeat_min == 1)
   7016           {
   7017           if (repeat_max == REPEAT_UNLIMITED)
   7018             *code++ = OP_PLUS + repeat_type;
   7019           else
   7020             {
   7021             code = oldcode;  /* Leave previous item in place */
   7022             if (repeat_max == 1) goto END_REPEAT;
   7023             *code++ = OP_UPTO + repeat_type;
   7024             PUT2INC(code, 0, repeat_max - 1);
   7025             }
   7026           }
   7027 
   7028         /* The case {n,n} is just an EXACT, while the general case {n,m} is
   7029         handled as an EXACT followed by an UPTO or STAR or QUERY. */
   7030 
   7031         else
   7032           {
   7033           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
   7034           PUT2INC(code, 0, repeat_min);
   7035 
   7036           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
   7037           and then generate the second opcode. For a repeated Unicode property
   7038           match, there are two extra values that define the required property,
   7039           and mclength is set zero to indicate this. */
   7040 
   7041           if (repeat_max != repeat_min)
   7042             {
   7043             if (mclength > 0)
   7044               {
   7045               memcpy(code, mcbuffer, CU2BYTES(mclength));
   7046               code += mclength;
   7047               }
   7048             else
   7049               {
   7050               *code++ = op_previous;
   7051               if (prop_type >= 0)
   7052                 {
   7053                 *code++ = prop_type;
   7054                 *code++ = prop_value;
   7055                 }
   7056               }
   7057 
   7058             /* Now set up the following opcode */
   7059 
   7060             if (repeat_max == REPEAT_UNLIMITED)
   7061               *code++ = OP_STAR + repeat_type;
   7062             else
   7063               {
   7064               repeat_max -= repeat_min;
   7065               if (repeat_max == 1)
   7066                 {
   7067                 *code++ = OP_QUERY + repeat_type;
   7068                 }
   7069               else
   7070                 {
   7071                 *code++ = OP_UPTO + repeat_type;
   7072                 PUT2INC(code, 0, repeat_max);
   7073                 }
   7074               }
   7075             }
   7076           }
   7077 
   7078         /* Fill in the character or character type for the final opcode. */
   7079 
   7080         if (mclength > 0)
   7081           {
   7082           memcpy(code, mcbuffer, CU2BYTES(mclength));
   7083           code += mclength;
   7084           }
   7085         else
   7086           {
   7087           *code++ = op_previous;
   7088           if (prop_type >= 0)
   7089             {
   7090             *code++ = prop_type;
   7091             *code++ = prop_value;
   7092             }
   7093           }
   7094         }
   7095       break;
   7096       }  /* End of switch on different op_previous values */
   7097 
   7098 
   7099     /* If the character following a repeat is '+', possessive_quantifier is
   7100     TRUE. For some opcodes, there are special alternative opcodes for this
   7101     case. For anything else, we wrap the entire repeated item inside OP_ONCE
   7102     brackets. Logically, the '+' notation is just syntactic sugar, taken from
   7103     Sun's Java package, but the special opcodes can optimize it.
   7104 
   7105     Some (but not all) possessively repeated subpatterns have already been
   7106     completely handled in the code just above. For them, possessive_quantifier
   7107     is always FALSE at this stage. Note that the repeated item starts at
   7108     tempcode, not at previous, which might be the first part of a string whose
   7109     (former) last char we repeated. */
   7110 
   7111     if (possessive_quantifier)
   7112       {
   7113       int len;
   7114 
   7115       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
   7116       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
   7117       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
   7118       remains is greater than zero, there's a further opcode that can be
   7119       handled. If not, do nothing, leaving the EXACT alone. */
   7120 
   7121       switch(*tempcode)
   7122         {
   7123         case OP_TYPEEXACT:
   7124         tempcode += PRIV(OP_lengths)[*tempcode] +
   7125           ((tempcode[1 + IMM2_SIZE] == OP_PROP
   7126           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
   7127         break;
   7128 
   7129         /* CHAR opcodes are used for exacts whose count is 1. */
   7130 
   7131         case OP_CHAR:
   7132         case OP_CHARI:
   7133         case OP_NOT:
   7134         case OP_NOTI:
   7135         case OP_EXACT:
   7136         case OP_EXACTI:
   7137         case OP_NOTEXACT:
   7138         case OP_NOTEXACTI:
   7139         tempcode += PRIV(OP_lengths)[*tempcode];
   7140 #ifdef SUPPORT_UNICODE
   7141         if (utf && HAS_EXTRALEN(tempcode[-1]))
   7142           tempcode += GET_EXTRALEN(tempcode[-1]);
   7143 #endif
   7144         break;
   7145 
   7146         /* For the class opcodes, the repeat operator appears at the end;
   7147         adjust tempcode to point to it. */
   7148 
   7149         case OP_CLASS:
   7150         case OP_NCLASS:
   7151         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
   7152         break;
   7153 
   7154 #ifdef SUPPORT_WIDE_CHARS
   7155         case OP_XCLASS:
   7156         tempcode += GET(tempcode, 1);
   7157         break;
   7158 #endif
   7159         }
   7160 
   7161       /* If tempcode is equal to code (which points to the end of the repeated
   7162       item), it means we have skipped an EXACT item but there is no following
   7163       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
   7164       all other cases, tempcode will be pointing to the repeat opcode, and will
   7165       be less than code, so the value of len will be greater than 0. */
   7166 
   7167       len = (int)(code - tempcode);
   7168       if (len > 0)
   7169         {
   7170         unsigned int repcode = *tempcode;
   7171 
   7172         /* There is a table for possessifying opcodes, all of which are less
   7173         than OP_CALLOUT. A zero entry means there is no possessified version.
   7174         */
   7175 
   7176         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
   7177           *tempcode = opcode_possessify[repcode];
   7178 
   7179         /* For opcodes without a special possessified version, wrap the item in
   7180         ONCE brackets. */
   7181 
   7182         else
   7183           {
   7184           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
   7185           code += 1 + LINK_SIZE;
   7186           len += 1 + LINK_SIZE;
   7187           tempcode[0] = OP_ONCE;
   7188           *code++ = OP_KET;
   7189           PUTINC(code, 0, len);
   7190           PUT(tempcode, 1, len);
   7191           }
   7192         }
   7193       }
   7194 
   7195     /* We set the "follows varying string" flag for subsequently encountered
   7196     reqcus if it isn't already set and we have just passed a varying length
   7197     item. */
   7198 
   7199     END_REPEAT:
   7200     cb->req_varyopt |= reqvary;
   7201     break;
   7202 
   7203 
   7204     /* ===================================================================*/
   7205     /* Handle a 32-bit data character with a value greater than META_END. */
   7206 
   7207     case META_BIGVALUE:
   7208     pptr++;
   7209     goto NORMAL_CHAR;
   7210 
   7211 
   7212     /* ===============================================================*/
   7213     /* Handle a back reference by number, which is the meta argument. The
   7214     pattern offsets for back references to group numbers less than 10 are held
   7215     in a special vector, to avoid using more than two parsed pattern elements
   7216     in 64-bit environments. We only need the offset to the first occurrence,
   7217     because if that doesn't fail, subsequent ones will also be OK. */
   7218 
   7219     case META_BACKREF:
   7220     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
   7221       else GETPLUSOFFSET(offset, pptr);
   7222 
   7223     if (meta_arg > cb->bracount)
   7224       {
   7225       cb->erroroffset = offset;
   7226       *errorcodeptr = ERR15;  /* Non-existent subpattern */
   7227       return 0;
   7228       }
   7229 
   7230     /* Come here from named backref handling when the reference is to a
   7231     single group (that is, not to a duplicated name). The back reference
   7232     data will have already been updated. We must disable firstcu if not
   7233     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
   7234     later. */
   7235 
   7236     HANDLE_SINGLE_REFERENCE:
   7237     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
   7238     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
   7239     PUT2INC(code, 0, meta_arg);
   7240 
   7241     /* Update the map of back references, and keep the highest one. We
   7242     could do this in parse_regex() for numerical back references, but not
   7243     for named back references, because we don't know the numbers to which
   7244     named back references refer. So we do it all in this function. */
   7245 
   7246     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
   7247     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
   7248 
   7249     /* Check to see if this back reference is recursive, that is, it
   7250     is inside the group that it references. A flag is set so that the
   7251     group can be made atomic. */
   7252 
   7253     for (oc = cb->open_caps; oc != NULL; oc = oc->next)
   7254       {
   7255       if (oc->number == meta_arg)
   7256         {
   7257         oc->flag = TRUE;
   7258         break;
   7259         }
   7260       }
   7261     break;
   7262 
   7263 
   7264     /* ===============================================================*/
   7265     /* Handle recursion by inserting the number of the called group (which is
   7266     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
   7267     scanned and these numbers are replaced by offsets within the pattern. It is
   7268     done like this to avoid problems with forward references and adjusting
   7269     offsets when groups are duplicated and moved (as discovered in previous
   7270     implementations). Note that a recursion does not have a set first character
   7271     (relevant if it is repeated, because it will then be wrapped with ONCE
   7272     brackets). */
   7273 
   7274     case META_RECURSE:
   7275     GETPLUSOFFSET(offset, pptr);
   7276     if (meta_arg > cb->bracount)
   7277       {
   7278       cb->erroroffset = offset;
   7279       *errorcodeptr = ERR15;  /* Non-existent subpattern */
   7280       return 0;
   7281       }
   7282     HANDLE_NUMERICAL_RECURSION:
   7283     *code = OP_RECURSE;
   7284     PUT(code, 1, meta_arg);
   7285     code += 1 + LINK_SIZE;
   7286     groupsetfirstcu = FALSE;
   7287     cb->had_recurse = TRUE;
   7288     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   7289     break;
   7290 
   7291 
   7292     /* ===============================================================*/
   7293     /* Handle capturing parentheses; the number is the meta argument. */
   7294 
   7295     case META_CAPTURE:
   7296     bravalue = OP_CBRA;
   7297     skipunits = IMM2_SIZE;
   7298     PUT2(code, 1+LINK_SIZE, meta_arg);
   7299     cb->lastcapture = meta_arg;
   7300     goto GROUP_PROCESS_NOTE_EMPTY;
   7301 
   7302 
   7303     /* ===============================================================*/
   7304     /* Handle escape sequence items. For ones like \d, the ESC_values are
   7305     arranged to be the same as the corresponding OP_values in the default case
   7306     when PCRE2_UCP is not set (which is the only case in which they will appear
   7307     here).
   7308 
   7309     Note: \Q and \E are never seen here, as they were dealt with in
   7310     parse_regex(). Neither are numerical back references or recursions, which
   7311     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
   7312     \g, when followed by names, are turned into META_BACKREF_BYNAME or
   7313     META_RECURSE_BYNAME. */
   7314 
   7315     case META_ESCAPE:
   7316 
   7317     /* We can test for escape sequences that consume a character because their
   7318     values lie between ESC_b and ESC_Z; this may have to change if any new ones
   7319     are ever created. For these sequences, we disable the setting of a first
   7320     character if it hasn't already been set. */
   7321 
   7322     if (meta_arg > ESC_b && meta_arg < ESC_Z)
   7323       {
   7324       matched_char = TRUE;
   7325       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
   7326       }
   7327 
   7328     /* Set values to reset to if this is followed by a zero repeat. */
   7329 
   7330     zerofirstcu = firstcu;
   7331     zerofirstcuflags = firstcuflags;
   7332     zeroreqcu = reqcu;
   7333     zeroreqcuflags = reqcuflags;
   7334 
   7335     /* If Unicode is not supported, \P and \p are not allowed and are
   7336     faulted at parse time, so will never appear here. */
   7337 
   7338 #ifdef SUPPORT_UNICODE
   7339     if (meta_arg == ESC_P || meta_arg == ESC_p)
   7340       {
   7341       uint32_t ptype = *(++pptr) >> 16;
   7342       uint32_t pdata = *pptr & 0xffff;
   7343       *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
   7344       *code++ = ptype;
   7345       *code++ = pdata;
   7346       break;  /* End META_ESCAPE */
   7347       }
   7348 #endif
   7349 
   7350     /* For the rest (including \X when Unicode is supported - if not it's
   7351     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
   7352     not set; if it is set, these escapes do not show up here because they are
   7353     converted into Unicode property tests in parse_regex(). Note that \b and \B
   7354     do a one-character lookbehind, and \A also behaves as if it does. */
   7355 
   7356     if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
   7357     if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
   7358          cb->max_lookbehind == 0)
   7359       cb->max_lookbehind = 1;
   7360 
   7361     /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
   7362     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
   7363 
   7364 #if PCRE2_CODE_UNIT_WIDTH == 32
   7365     *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
   7366 #else
   7367     *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
   7368 #endif
   7369     break;  /* End META_ESCAPE */
   7370 
   7371 
   7372     /* ===================================================================*/
   7373     /* Handle an unrecognized meta value. A parsed pattern value less than
   7374     META_END is a literal. Otherwise we have a problem. */
   7375 
   7376     default:
   7377     if (meta >= META_END)
   7378       {
   7379 #ifdef DEBUG_SHOW_PARSED
   7380       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
   7381 #endif
   7382       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
   7383       return 0;
   7384       }
   7385 
   7386     /* Handle a literal character. We come here by goto in the case of a
   7387     32-bit, non-UTF character whose value is greater than META_END. */
   7388 
   7389     NORMAL_CHAR:
   7390     meta = *pptr;     /* Get the full 32 bits */
   7391     NORMAL_CHAR_SET:  /* Character is already in meta */
   7392     matched_char = TRUE;
   7393 
   7394     /* For caseless UTF mode, check whether this character has more than one
   7395     other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
   7396 
   7397 #ifdef SUPPORT_UNICODE
   7398     if (utf && (options & PCRE2_CASELESS) != 0)
   7399       {
   7400       uint32_t caseset = UCD_CASESET(meta);
   7401       if (caseset != 0)
   7402         {
   7403         *code++ = OP_PROP;
   7404         *code++ = PT_CLIST;
   7405         *code++ = caseset;
   7406         if (firstcuflags == REQ_UNSET)
   7407           firstcuflags = zerofirstcuflags = REQ_NONE;
   7408         break;  /* End handling this meta item */
   7409         }
   7410       }
   7411 #endif
   7412 
   7413     /* Caseful matches, or not one of the multicase characters. Get the
   7414     character's code units into mcbuffer, with the length in mclength. When not
   7415     in UTF mode, the length is always 1. */
   7416 
   7417 #ifdef SUPPORT_UNICODE
   7418     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
   7419 #endif
   7420       {
   7421       mclength = 1;
   7422       mcbuffer[0] = meta;
   7423       }
   7424 
   7425     /* Generate the appropriate code */
   7426 
   7427     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
   7428     memcpy(code, mcbuffer, CU2BYTES(mclength));
   7429     code += mclength;
   7430 
   7431     /* Remember if \r or \n were seen */
   7432 
   7433     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
   7434       cb->external_flags |= PCRE2_HASCRORLF;
   7435 
   7436     /* Set the first and required code units appropriately. If no previous
   7437     first code unit, set it from this character, but revert to none on a zero
   7438     repeat. Otherwise, leave the firstcu value alone, and don't change it on
   7439     a zero repeat. */
   7440 
   7441     if (firstcuflags == REQ_UNSET)
   7442       {
   7443       zerofirstcuflags = REQ_NONE;
   7444       zeroreqcu = reqcu;
   7445       zeroreqcuflags = reqcuflags;
   7446 
   7447       /* If the character is more than one code unit long, we can set firstcu
   7448       only if it is not to be matched caselessly. */
   7449 
   7450       if (mclength == 1 || req_caseopt == 0)
   7451         {
   7452         firstcu = mcbuffer[0];
   7453         firstcuflags = req_caseopt;
   7454         if (mclength != 1)
   7455           {
   7456           reqcu = code[-1];
   7457           reqcuflags = cb->req_varyopt;
   7458           }
   7459         }
   7460       else firstcuflags = reqcuflags = REQ_NONE;
   7461       }
   7462 
   7463     /* firstcu was previously set; we can set reqcu only if the length is
   7464     1 or the matching is caseful. */
   7465 
   7466     else
   7467       {
   7468       zerofirstcu = firstcu;
   7469       zerofirstcuflags = firstcuflags;
   7470       zeroreqcu = reqcu;
   7471       zeroreqcuflags = reqcuflags;
   7472       if (mclength == 1 || req_caseopt == 0)
   7473         {
   7474         reqcu = code[-1];
   7475         reqcuflags = req_caseopt | cb->req_varyopt;
   7476         }
   7477       }
   7478     break;    /* End default meta handling */
   7479     }         /* End of big switch */
   7480   }           /* End of big loop */
   7481 
   7482 /* Control never reaches here. */
   7483 }
   7484 
   7485 
   7486 
   7487 /*************************************************
   7488 *   Compile regex: a sequence of alternatives    *
   7489 *************************************************/
   7490 
   7491 /* On entry, pptr is pointing past the bracket meta, but on return it points to
   7492 the closing bracket or META_END. The code variable is pointing at the code unit
   7493 into which the BRA operator has been stored. This function is used during the
   7494 pre-compile phase when we are trying to find out the amount of memory needed,
   7495 as well as during the real compile phase. The value of lengthptr distinguishes
   7496 the two phases.
   7497 
   7498 Arguments:
   7499   options           option bits, including any changes for this subpattern
   7500   codeptr           -> the address of the current code pointer
   7501   pptrptr           -> the address of the current parsed pattern pointer
   7502   errorcodeptr      -> pointer to error code variable
   7503   skipunits         skip this many code units at start (for brackets and OP_COND)
   7504   firstcuptr        place to put the first required code unit
   7505   firstcuflagsptr   place to put the first code unit flags, or a negative number
   7506   reqcuptr          place to put the last required code unit
   7507   reqcuflagsptr     place to put the last required code unit flags, or a negative number
   7508   bcptr             pointer to the chain of currently open branches
   7509   cb                points to the data block with tables pointers etc.
   7510   lengthptr         NULL during the real compile phase
   7511                     points to length accumulator during pre-compile phase
   7512 
   7513 Returns:            0 There has been an error
   7514                    +1 Success, this group must match at least one character
   7515                    -1 Success, this group may match an empty string
   7516 */
   7517 
   7518 static int
   7519 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
   7520   int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
   7521   int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
   7522   branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
   7523 {
   7524 PCRE2_UCHAR *code = *codeptr;
   7525 PCRE2_UCHAR *last_branch = code;
   7526 PCRE2_UCHAR *start_bracket = code;
   7527 BOOL lookbehind;
   7528 open_capitem capitem;
   7529 int capnumber = 0;
   7530 int okreturn = 1;
   7531 uint32_t *pptr = *pptrptr;
   7532 uint32_t firstcu, reqcu;
   7533 uint32_t lookbehindlength;
   7534 int32_t firstcuflags, reqcuflags;
   7535 uint32_t branchfirstcu, branchreqcu;
   7536 int32_t branchfirstcuflags, branchreqcuflags;
   7537 PCRE2_SIZE length;
   7538 branch_chain bc;
   7539 
   7540 /* If set, call the external function that checks for stack availability. */
   7541 
   7542 if (cb->cx->stack_guard != NULL &&
   7543     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
   7544   {
   7545   *errorcodeptr= ERR33;
   7546   return 0;
   7547   }
   7548 
   7549 /* Miscellaneous initialization */
   7550 
   7551 bc.outer = bcptr;
   7552 bc.current_branch = code;
   7553 
   7554 firstcu = reqcu = 0;
   7555 firstcuflags = reqcuflags = REQ_UNSET;
   7556 
   7557 /* Accumulate the length for use in the pre-compile phase. Start with the
   7558 length of the BRA and KET and any extra code units that are required at the
   7559 beginning. We accumulate in a local variable to save frequent testing of
   7560 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
   7561 start and end of each alternative, because compiled items are discarded during
   7562 the pre-compile phase so that the workspace is not exceeded. */
   7563 
   7564 length = 2 + 2*LINK_SIZE + skipunits;
   7565 
   7566 /* Remember if this is a lookbehind assertion, and if it is, save its length
   7567 and skip over the pattern offset. */
   7568 
   7569 lookbehind = *code == OP_ASSERTBACK || *code == OP_ASSERTBACK_NOT;
   7570 if (lookbehind)
   7571   {
   7572   lookbehindlength = META_DATA(pptr[-1]);
   7573   pptr += SIZEOFFSET;
   7574   }
   7575 else lookbehindlength = 0;
   7576 
   7577 /* If this is a capturing subpattern, add to the chain of open capturing items
   7578 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
   7579 need be tested here; changing this opcode to one of its variants, e.g.
   7580 OP_SCBRAPOS, happens later, after the group has been compiled. */
   7581 
   7582 if (*code == OP_CBRA)
   7583   {
   7584   capnumber = GET2(code, 1 + LINK_SIZE);
   7585   capitem.number = capnumber;
   7586   capitem.next = cb->open_caps;
   7587   capitem.flag = FALSE;
   7588   capitem.assert_depth = cb->assert_depth;
   7589   cb->open_caps = &capitem;
   7590   }
   7591 
   7592 /* Offset is set zero to mark that this bracket is still open */
   7593 
   7594 PUT(code, 1, 0);
   7595 code += 1 + LINK_SIZE + skipunits;
   7596 
   7597 /* Loop for each alternative branch */
   7598 
   7599 for (;;)
   7600   {
   7601   int branch_return;
   7602 
   7603   /* Insert OP_REVERSE if this is as lookbehind assertion. */
   7604 
   7605   if (lookbehind && lookbehindlength > 0)
   7606     {
   7607     *code++ = OP_REVERSE;
   7608     PUTINC(code, 0, lookbehindlength);
   7609     length += 1 + LINK_SIZE;
   7610     }
   7611 
   7612   /* Now compile the branch; in the pre-compile phase its length gets added
   7613   into the length. */
   7614 
   7615   if ((branch_return =
   7616         compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
   7617           &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
   7618           cb, (lengthptr == NULL)? NULL : &length)) == 0)
   7619     return 0;
   7620 
   7621   /* If a branch can match an empty string, so can the whole group. */
   7622 
   7623   if (branch_return < 0) okreturn = -1;
   7624 
   7625   /* In the real compile phase, there is some post-processing to be done. */
   7626 
   7627   if (lengthptr == NULL)
   7628     {
   7629     /* If this is the first branch, the firstcu and reqcu values for the
   7630     branch become the values for the regex. */
   7631 
   7632     if (*last_branch != OP_ALT)
   7633       {
   7634       firstcu = branchfirstcu;
   7635       firstcuflags = branchfirstcuflags;
   7636       reqcu = branchreqcu;
   7637       reqcuflags = branchreqcuflags;
   7638       }
   7639 
   7640     /* If this is not the first branch, the first char and reqcu have to
   7641     match the values from all the previous branches, except that if the
   7642     previous value for reqcu didn't have REQ_VARY set, it can still match,
   7643     and we set REQ_VARY for the regex. */
   7644 
   7645     else
   7646       {
   7647       /* If we previously had a firstcu, but it doesn't match the new branch,
   7648       we have to abandon the firstcu for the regex, but if there was
   7649       previously no reqcu, it takes on the value of the old firstcu. */
   7650 
   7651       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
   7652         {
   7653         if (firstcuflags >= 0)
   7654           {
   7655           if (reqcuflags < 0)
   7656             {
   7657             reqcu = firstcu;
   7658             reqcuflags = firstcuflags;
   7659             }
   7660           }
   7661         firstcuflags = REQ_NONE;
   7662         }
   7663 
   7664       /* If we (now or from before) have no firstcu, a firstcu from the
   7665       branch becomes a reqcu if there isn't a branch reqcu. */
   7666 
   7667       if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
   7668           branchreqcuflags < 0)
   7669         {
   7670         branchreqcu = branchfirstcu;
   7671         branchreqcuflags = branchfirstcuflags;
   7672         }
   7673 
   7674       /* Now ensure that the reqcus match */
   7675 
   7676       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
   7677           reqcu != branchreqcu)
   7678         reqcuflags = REQ_NONE;
   7679       else
   7680         {
   7681         reqcu = branchreqcu;
   7682         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
   7683         }
   7684       }
   7685     }
   7686 
   7687   /* Handle reaching the end of the expression, either ')' or end of pattern.
   7688   In the real compile phase, go back through the alternative branches and
   7689   reverse the chain of offsets, with the field in the BRA item now becoming an
   7690   offset to the first alternative. If there are no alternatives, it points to
   7691   the end of the group. The length in the terminating ket is always the length
   7692   of the whole bracketed item. Return leaving the pointer at the terminating
   7693   char. */
   7694 
   7695   if (META_CODE(*pptr) != META_ALT)
   7696     {
   7697     if (lengthptr == NULL)
   7698       {
   7699       PCRE2_SIZE branch_length = code - last_branch;
   7700       do
   7701         {
   7702         PCRE2_SIZE prev_length = GET(last_branch, 1);
   7703         PUT(last_branch, 1, branch_length);
   7704         branch_length = prev_length;
   7705         last_branch -= branch_length;
   7706         }
   7707       while (branch_length > 0);
   7708       }
   7709 
   7710     /* Fill in the ket */
   7711 
   7712     *code = OP_KET;
   7713     PUT(code, 1, (int)(code - start_bracket));
   7714     code += 1 + LINK_SIZE;
   7715 
   7716     /* If it was a capturing subpattern, check to see if it contained any
   7717     recursive back references. If so, we must wrap it in atomic brackets. In
   7718     any event, remove the block from the chain. */
   7719 
   7720     if (capnumber > 0)
   7721       {
   7722       if (cb->open_caps->flag)
   7723         {
   7724         (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
   7725           CU2BYTES(code - start_bracket));
   7726         *start_bracket = OP_ONCE;
   7727         code += 1 + LINK_SIZE;
   7728         PUT(start_bracket, 1, (int)(code - start_bracket));
   7729         *code = OP_KET;
   7730         PUT(code, 1, (int)(code - start_bracket));
   7731         code += 1 + LINK_SIZE;
   7732         length += 2 + 2*LINK_SIZE;
   7733         }
   7734       cb->open_caps = cb->open_caps->next;
   7735       }
   7736 
   7737     /* Set values to pass back */
   7738 
   7739     *codeptr = code;
   7740     *pptrptr = pptr;
   7741     *firstcuptr = firstcu;
   7742     *firstcuflagsptr = firstcuflags;
   7743     *reqcuptr = reqcu;
   7744     *reqcuflagsptr = reqcuflags;
   7745     if (lengthptr != NULL)
   7746       {
   7747       if (OFLOW_MAX - *lengthptr < length)
   7748         {
   7749         *errorcodeptr = ERR20;
   7750         return 0;
   7751         }
   7752       *lengthptr += length;
   7753       }
   7754     return okreturn;
   7755     }
   7756 
   7757   /* Another branch follows. In the pre-compile phase, we can move the code
   7758   pointer back to where it was for the start of the first branch. (That is,
   7759   pretend that each branch is the only one.)
   7760 
   7761   In the real compile phase, insert an ALT node. Its length field points back
   7762   to the previous branch while the bracket remains open. At the end the chain
   7763   is reversed. It's done like this so that the start of the bracket has a
   7764   zero offset until it is closed, making it possible to detect recursion. */
   7765 
   7766   if (lengthptr != NULL)
   7767     {
   7768     code = *codeptr + 1 + LINK_SIZE + skipunits;
   7769     length += 1 + LINK_SIZE;
   7770     }
   7771   else
   7772     {
   7773     *code = OP_ALT;
   7774     PUT(code, 1, (int)(code - last_branch));
   7775     bc.current_branch = last_branch = code;
   7776     code += 1 + LINK_SIZE;
   7777     }
   7778 
   7779   /* Set the lookbehind length (if not in a lookbehind the value will be zero)
   7780   and then advance past the vertical bar. */
   7781 
   7782   lookbehindlength = META_DATA(*pptr);
   7783   pptr++;
   7784   }
   7785 /* Control never reaches here */
   7786 }
   7787 
   7788 
   7789 
   7790 /*************************************************
   7791 *          Check for anchored pattern            *
   7792 *************************************************/
   7793 
   7794 /* Try to find out if this is an anchored regular expression. Consider each
   7795 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
   7796 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
   7797 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
   7798 be found, because ^ generates OP_CIRCM in that mode.
   7799 
   7800 We can also consider a regex to be anchored if OP_SOM starts all its branches.
   7801 This is the code for \G, which means "match at start of match position, taking
   7802 into account the match offset".
   7803 
   7804 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
   7805 because that will try the rest of the pattern at all possible matching points,
   7806 so there is no point trying again.... er ....
   7807 
   7808 .... except when the .* appears inside capturing parentheses, and there is a
   7809 subsequent back reference to those parentheses. We haven't enough information
   7810 to catch that case precisely.
   7811 
   7812 At first, the best we could do was to detect when .* was in capturing brackets
   7813 and the highest back reference was greater than or equal to that level.
   7814 However, by keeping a bitmap of the first 31 back references, we can catch some
   7815 of the more common cases more precisely.
   7816 
   7817 ... A second exception is when the .* appears inside an atomic group, because
   7818 this prevents the number of characters it matches from being adjusted.
   7819 
   7820 Arguments:
   7821   code           points to start of the compiled pattern
   7822   bracket_map    a bitmap of which brackets we are inside while testing; this
   7823                    handles up to substring 31; after that we just have to take
   7824                    the less precise approach
   7825   cb             points to the compile data block
   7826   atomcount      atomic group level
   7827   inassert       TRUE if in an assertion
   7828 
   7829 Returns:     TRUE or FALSE
   7830 */
   7831 
   7832 static BOOL
   7833 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
   7834   int atomcount, BOOL inassert)
   7835 {
   7836 do {
   7837    PCRE2_SPTR scode = first_significant_code(
   7838      code + PRIV(OP_lengths)[*code], FALSE);
   7839    int op = *scode;
   7840 
   7841    /* Non-capturing brackets */
   7842 
   7843    if (op == OP_BRA  || op == OP_BRAPOS ||
   7844        op == OP_SBRA || op == OP_SBRAPOS)
   7845      {
   7846      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
   7847        return FALSE;
   7848      }
   7849 
   7850    /* Capturing brackets */
   7851 
   7852    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
   7853             op == OP_SCBRA || op == OP_SCBRAPOS)
   7854      {
   7855      int n = GET2(scode, 1+LINK_SIZE);
   7856      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
   7857      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
   7858      }
   7859 
   7860    /* Positive forward assertion */
   7861 
   7862    else if (op == OP_ASSERT)
   7863      {
   7864      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
   7865      }
   7866 
   7867    /* Condition. If there is no second branch, it can't be anchored. */
   7868 
   7869    else if (op == OP_COND || op == OP_SCOND)
   7870      {
   7871      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
   7872      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
   7873        return FALSE;
   7874      }
   7875 
   7876    /* Atomic groups */
   7877 
   7878    else if (op == OP_ONCE)
   7879      {
   7880      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
   7881        return FALSE;
   7882      }
   7883 
   7884    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
   7885    it isn't in brackets that are or may be referenced or inside an atomic
   7886    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
   7887    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
   7888    with the subject "aab", which matches "b", i.e. not at the start of a line.
   7889    There is also an option that disables auto-anchoring. */
   7890 
   7891    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
   7892              op == OP_TYPEPOSSTAR))
   7893      {
   7894      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
   7895          atomcount > 0 || cb->had_pruneorskip || inassert ||
   7896          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
   7897        return FALSE;
   7898      }
   7899 
   7900    /* Check for explicit anchoring */
   7901 
   7902    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
   7903 
   7904    code += GET(code, 1);
   7905    }
   7906 while (*code == OP_ALT);   /* Loop for each alternative */
   7907 return TRUE;
   7908 }
   7909 
   7910 
   7911 
   7912 /*************************************************
   7913 *         Check for starting with ^ or .*        *
   7914 *************************************************/
   7915 
   7916 /* This is called to find out if every branch starts with ^ or .* so that
   7917 "first char" processing can be done to speed things up in multiline
   7918 matching and for non-DOTALL patterns that start with .* (which must start at
   7919 the beginning or after \n). As in the case of is_anchored() (see above), we
   7920 have to take account of back references to capturing brackets that contain .*
   7921 because in that case we can't make the assumption. Also, the appearance of .*
   7922 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
   7923 or *SKIP does not count, because once again the assumption no longer holds.
   7924 
   7925 Arguments:
   7926   code           points to start of the compiled pattern or a group
   7927   bracket_map    a bitmap of which brackets we are inside while testing; this
   7928                    handles up to substring 31; after that we just have to take
   7929                    the less precise approach
   7930   cb             points to the compile data
   7931   atomcount      atomic group level
   7932   inassert       TRUE if in an assertion
   7933 
   7934 Returns:         TRUE or FALSE
   7935 */
   7936 
   7937 static BOOL
   7938 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
   7939   int atomcount, BOOL inassert)
   7940 {
   7941 do {
   7942    PCRE2_SPTR scode = first_significant_code(
   7943      code + PRIV(OP_lengths)[*code], FALSE);
   7944    int op = *scode;
   7945 
   7946    /* If we are at the start of a conditional assertion group, *both* the
   7947    conditional assertion *and* what follows the condition must satisfy the test
   7948    for start of line. Other kinds of condition fail. Note that there may be an
   7949    auto-callout at the start of a condition. */
   7950 
   7951    if (op == OP_COND)
   7952      {
   7953      scode += 1 + LINK_SIZE;
   7954 
   7955      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
   7956        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
   7957 
   7958      switch (*scode)
   7959        {
   7960        case OP_CREF:
   7961        case OP_DNCREF:
   7962        case OP_RREF:
   7963        case OP_DNRREF:
   7964        case OP_FAIL:
   7965        case OP_FALSE:
   7966        case OP_TRUE:
   7967        return FALSE;
   7968 
   7969        default:     /* Assertion */
   7970        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
   7971        do scode += GET(scode, 1); while (*scode == OP_ALT);
   7972        scode += 1 + LINK_SIZE;
   7973        break;
   7974        }
   7975      scode = first_significant_code(scode, FALSE);
   7976      op = *scode;
   7977      }
   7978 
   7979    /* Non-capturing brackets */
   7980 
   7981    if (op == OP_BRA  || op == OP_BRAPOS ||
   7982        op == OP_SBRA || op == OP_SBRAPOS)
   7983      {
   7984      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
   7985        return FALSE;
   7986      }
   7987 
   7988    /* Capturing brackets */
   7989 
   7990    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
   7991             op == OP_SCBRA || op == OP_SCBRAPOS)
   7992      {
   7993      int n = GET2(scode, 1+LINK_SIZE);
   7994      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
   7995      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
   7996      }
   7997 
   7998    /* Positive forward assertions */
   7999 
   8000    else if (op == OP_ASSERT)
   8001      {
   8002      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
   8003        return FALSE;
   8004      }
   8005 
   8006    /* Atomic brackets */
   8007 
   8008    else if (op == OP_ONCE)
   8009      {
   8010      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
   8011        return FALSE;
   8012      }
   8013 
   8014    /* .* means "start at start or after \n" if it isn't in atomic brackets or
   8015    brackets that may be referenced or an assertion, and as long as the pattern
   8016    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
   8017    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
   8018    i.e. not at the start of a line. There is also an option that disables this
   8019    optimization. */
   8020 
   8021    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
   8022      {
   8023      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
   8024          atomcount > 0 || cb->had_pruneorskip || inassert ||
   8025          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
   8026        return FALSE;
   8027      }
   8028 
   8029    /* Check for explicit circumflex; anything else gives a FALSE result. Note
   8030    in particular that this includes atomic brackets OP_ONCE because the number
   8031    of characters matched by .* cannot be adjusted inside them. */
   8032 
   8033    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
   8034 
   8035    /* Move on to the next alternative */
   8036 
   8037    code += GET(code, 1);
   8038    }
   8039 while (*code == OP_ALT);  /* Loop for each alternative */
   8040 return TRUE;
   8041 }
   8042 
   8043 
   8044 
   8045 /*************************************************
   8046 *   Scan compiled regex for recursion reference  *
   8047 *************************************************/
   8048 
   8049 /* This function scans through a compiled pattern until it finds an instance of
   8050 OP_RECURSE.
   8051 
   8052 Arguments:
   8053   code        points to start of expression
   8054   utf         TRUE in UTF mode
   8055 
   8056 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
   8057 */
   8058 
   8059 static PCRE2_SPTR
   8060 find_recurse(PCRE2_SPTR code, BOOL utf)
   8061 {
   8062 for (;;)
   8063   {
   8064   PCRE2_UCHAR c = *code;
   8065   if (c == OP_END) return NULL;
   8066   if (c == OP_RECURSE) return code;
   8067 
   8068   /* XCLASS is used for classes that cannot be represented just by a bit map.
   8069   This includes negated single high-valued characters. CALLOUT_STR is used for
   8070   callouts with string arguments. In both cases the length in the table is
   8071   zero; the actual length is stored in the compiled code. */
   8072 
   8073   if (c == OP_XCLASS) code += GET(code, 1);
   8074     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
   8075 
   8076   /* Otherwise, we can get the item's length from the table, except that for
   8077   repeated character types, we have to test for \p and \P, which have an extra
   8078   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
   8079   we must add in its length. */
   8080 
   8081   else
   8082     {
   8083     switch(c)
   8084       {
   8085       case OP_TYPESTAR:
   8086       case OP_TYPEMINSTAR:
   8087       case OP_TYPEPLUS:
   8088       case OP_TYPEMINPLUS:
   8089       case OP_TYPEQUERY:
   8090       case OP_TYPEMINQUERY:
   8091       case OP_TYPEPOSSTAR:
   8092       case OP_TYPEPOSPLUS:
   8093       case OP_TYPEPOSQUERY:
   8094       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   8095       break;
   8096 
   8097       case OP_TYPEPOSUPTO:
   8098       case OP_TYPEUPTO:
   8099       case OP_TYPEMINUPTO:
   8100       case OP_TYPEEXACT:
   8101       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   8102         code += 2;
   8103       break;
   8104 
   8105       case OP_MARK:
   8106       case OP_COMMIT_ARG:
   8107       case OP_PRUNE_ARG:
   8108       case OP_SKIP_ARG:
   8109       case OP_THEN_ARG:
   8110       code += code[1];
   8111       break;
   8112       }
   8113 
   8114     /* Add in the fixed length from the table */
   8115 
   8116     code += PRIV(OP_lengths)[c];
   8117 
   8118     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
   8119     be followed by a multi-unit character. The length in the table is a
   8120     minimum, so we have to arrange to skip the extra units. */
   8121 
   8122 #ifdef MAYBE_UTF_MULTI
   8123     if (utf) switch(c)
   8124       {
   8125       case OP_CHAR:
   8126       case OP_CHARI:
   8127       case OP_NOT:
   8128       case OP_NOTI:
   8129       case OP_EXACT:
   8130       case OP_EXACTI:
   8131       case OP_NOTEXACT:
   8132       case OP_NOTEXACTI:
   8133       case OP_UPTO:
   8134       case OP_UPTOI:
   8135       case OP_NOTUPTO:
   8136       case OP_NOTUPTOI:
   8137       case OP_MINUPTO:
   8138       case OP_MINUPTOI:
   8139       case OP_NOTMINUPTO:
   8140       case OP_NOTMINUPTOI:
   8141       case OP_POSUPTO:
   8142       case OP_POSUPTOI:
   8143       case OP_NOTPOSUPTO:
   8144       case OP_NOTPOSUPTOI:
   8145       case OP_STAR:
   8146       case OP_STARI:
   8147       case OP_NOTSTAR:
   8148       case OP_NOTSTARI:
   8149       case OP_MINSTAR:
   8150       case OP_MINSTARI:
   8151       case OP_NOTMINSTAR:
   8152       case OP_NOTMINSTARI:
   8153       case OP_POSSTAR:
   8154       case OP_POSSTARI:
   8155       case OP_NOTPOSSTAR:
   8156       case OP_NOTPOSSTARI:
   8157       case OP_PLUS:
   8158       case OP_PLUSI:
   8159       case OP_NOTPLUS:
   8160       case OP_NOTPLUSI:
   8161       case OP_MINPLUS:
   8162       case OP_MINPLUSI:
   8163       case OP_NOTMINPLUS:
   8164       case OP_NOTMINPLUSI:
   8165       case OP_POSPLUS:
   8166       case OP_POSPLUSI:
   8167       case OP_NOTPOSPLUS:
   8168       case OP_NOTPOSPLUSI:
   8169       case OP_QUERY:
   8170       case OP_QUERYI:
   8171       case OP_NOTQUERY:
   8172       case OP_NOTQUERYI:
   8173       case OP_MINQUERY:
   8174       case OP_MINQUERYI:
   8175       case OP_NOTMINQUERY:
   8176       case OP_NOTMINQUERYI:
   8177       case OP_POSQUERY:
   8178       case OP_POSQUERYI:
   8179       case OP_NOTPOSQUERY:
   8180       case OP_NOTPOSQUERYI:
   8181       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   8182       break;
   8183       }
   8184 #else
   8185     (void)(utf);  /* Keep compiler happy by referencing function argument */
   8186 #endif  /* MAYBE_UTF_MULTI */
   8187     }
   8188   }
   8189 }
   8190 
   8191 
   8192 
   8193 /*************************************************
   8194 *    Check for asserted fixed first code unit    *
   8195 *************************************************/
   8196 
   8197 /* During compilation, the "first code unit" settings from forward assertions
   8198 are discarded, because they can cause conflicts with actual literals that
   8199 follow. However, if we end up without a first code unit setting for an
   8200 unanchored pattern, it is worth scanning the regex to see if there is an
   8201 initial asserted first code unit. If all branches start with the same asserted
   8202 code unit, or with a non-conditional bracket all of whose alternatives start
   8203 with the same asserted code unit (recurse ad lib), then we return that code
   8204 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
   8205 REQ_NONE in the flags.
   8206 
   8207 Arguments:
   8208   code       points to start of compiled pattern
   8209   flags      points to the first code unit flags
   8210   inassert   non-zero if in an assertion
   8211 
   8212 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
   8213 */
   8214 
   8215 static uint32_t
   8216 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
   8217 {
   8218 uint32_t c = 0;
   8219 int cflags = REQ_NONE;
   8220 
   8221 *flags = REQ_NONE;
   8222 do {
   8223    uint32_t d;
   8224    int dflags;
   8225    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
   8226              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
   8227    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
   8228    PCRE2_UCHAR op = *scode;
   8229 
   8230    switch(op)
   8231      {
   8232      default:
   8233      return 0;
   8234 
   8235      case OP_BRA:
   8236      case OP_BRAPOS:
   8237      case OP_CBRA:
   8238      case OP_SCBRA:
   8239      case OP_CBRAPOS:
   8240      case OP_SCBRAPOS:
   8241      case OP_ASSERT:
   8242      case OP_ONCE:
   8243      d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
   8244      if (dflags < 0)
   8245        return 0;
   8246      if (cflags < 0) { c = d; cflags = dflags; }
   8247        else if (c != d || cflags != dflags) return 0;
   8248      break;
   8249 
   8250      case OP_EXACT:
   8251      scode += IMM2_SIZE;
   8252      /* Fall through */
   8253 
   8254      case OP_CHAR:
   8255      case OP_PLUS:
   8256      case OP_MINPLUS:
   8257      case OP_POSPLUS:
   8258      if (inassert == 0) return 0;
   8259      if (cflags < 0) { c = scode[1]; cflags = 0; }
   8260        else if (c != scode[1]) return 0;
   8261      break;
   8262 
   8263      case OP_EXACTI:
   8264      scode += IMM2_SIZE;
   8265      /* Fall through */
   8266 
   8267      case OP_CHARI:
   8268      case OP_PLUSI:
   8269      case OP_MINPLUSI:
   8270      case OP_POSPLUSI:
   8271      if (inassert == 0) return 0;
   8272      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
   8273        else if (c != scode[1]) return 0;
   8274      break;
   8275      }
   8276 
   8277    code += GET(code, 1);
   8278    }
   8279 while (*code == OP_ALT);
   8280 
   8281 *flags = cflags;
   8282 return c;
   8283 }
   8284 
   8285 
   8286 
   8287 /*************************************************
   8288 *     Add an entry to the name/number table      *
   8289 *************************************************/
   8290 
   8291 /* This function is called between compiling passes to add an entry to the
   8292 name/number table, maintaining alphabetical order. Checking for permitted
   8293 and forbidden duplicates has already been done.
   8294 
   8295 Arguments:
   8296   cb           the compile data block
   8297   name         the name to add
   8298   length       the length of the name
   8299   groupno      the group number
   8300   tablecount   the count of names in the table so far
   8301 
   8302 Returns:       nothing
   8303 */
   8304 
   8305 static void
   8306 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
   8307   unsigned int groupno, uint32_t tablecount)
   8308 {
   8309 uint32_t i;
   8310 PCRE2_UCHAR *slot = cb->name_table;
   8311 
   8312 for (i = 0; i < tablecount; i++)
   8313   {
   8314   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
   8315   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
   8316     crc = -1; /* Current name is a substring */
   8317 
   8318   /* Make space in the table and break the loop for an earlier name. For a
   8319   duplicate or later name, carry on. We do this for duplicates so that in the
   8320   simple case (when ?(| is not used) they are in order of their numbers. In all
   8321   cases they are in the order in which they appear in the pattern. */
   8322 
   8323   if (crc < 0)
   8324     {
   8325     (void)memmove(slot + cb->name_entry_size, slot,
   8326       CU2BYTES((tablecount - i) * cb->name_entry_size));
   8327     break;
   8328     }
   8329 
   8330   /* Continue the loop for a later or duplicate name */
   8331 
   8332   slot += cb->name_entry_size;
   8333   }
   8334 
   8335 PUT2(slot, 0, groupno);
   8336 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
   8337 
   8338 /* Add a terminating zero and fill the rest of the slot with zeroes so that
   8339 the memory is all initialized. Otherwise valgrind moans about uninitialized
   8340 memory when saving serialized compiled patterns. */
   8341 
   8342 memset(slot + IMM2_SIZE + length, 0,
   8343   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
   8344 }
   8345 
   8346 
   8347 
   8348 /*************************************************
   8349 *             Skip in parsed pattern             *
   8350 *************************************************/
   8351 
   8352 /* This function is called to skip parts of the parsed pattern when finding the
   8353 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
   8354 the end of the branch, it is called to skip over an internal lookaround, and it
   8355 is also called to skip to the end of a class, during which it will never
   8356 encounter nested groups (but there's no need to have special code for that).
   8357 
   8358 When called to find the end of a branch or group, pptr must point to the first
   8359 meta code inside the branch, not the branch-starting code. In other cases it
   8360 can point to the item that causes the function to be called.
   8361 
   8362 Arguments:
   8363   pptr       current pointer to skip from
   8364   skiptype   PSKIP_CLASS when skipping to end of class
   8365              PSKIP_ALT when META_ALT ends the skip
   8366              PSKIP_KET when only META_KET ends the skip
   8367 
   8368 Returns:     new value of pptr
   8369              NULL if META_END is reached - should never occur
   8370                or for an unknown meta value - likewise
   8371 */
   8372 
   8373 static uint32_t *
   8374 parsed_skip(uint32_t *pptr, uint32_t skiptype)
   8375 {
   8376 uint32_t nestlevel = 0;
   8377 
   8378 for (;; pptr++)
   8379   {
   8380   uint32_t meta = META_CODE(*pptr);
   8381 
   8382   switch(meta)
   8383     {
   8384     default:  /* Just skip over most items */
   8385     if (meta < META_END) continue;  /* Literal */
   8386     break;
   8387 
   8388     /* This should never occur. */
   8389 
   8390     case META_END:
   8391     return NULL;
   8392 
   8393     /* The data for these items is variable in length. */
   8394 
   8395     case META_BACKREF:  /* Offset is present only if group >= 10 */
   8396     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
   8397     break;
   8398 
   8399     case META_ESCAPE:   /* A few escapes are followed by data items. */
   8400     switch (META_DATA(*pptr))
   8401       {
   8402       case ESC_P:
   8403       case ESC_p:
   8404       pptr += 1;
   8405       break;
   8406 
   8407       case ESC_g:
   8408       case ESC_k:
   8409       pptr += 1 + SIZEOFFSET;
   8410       break;
   8411       }
   8412     break;
   8413 
   8414     case META_MARK:     /* Add the length of the name. */
   8415     case META_COMMIT_ARG:
   8416     case META_PRUNE_ARG:
   8417     case META_SKIP_ARG:
   8418     case META_THEN_ARG:
   8419     pptr += pptr[1];
   8420     break;
   8421 
   8422     /* These are the "active" items in this loop. */
   8423 
   8424     case META_CLASS_END:
   8425     if (skiptype == PSKIP_CLASS) return pptr;
   8426     break;
   8427 
   8428     case META_ATOMIC:
   8429     case META_CAPTURE:
   8430     case META_COND_ASSERT:
   8431     case META_COND_DEFINE:
   8432     case META_COND_NAME:
   8433     case META_COND_NUMBER:
   8434     case META_COND_RNAME:
   8435     case META_COND_RNUMBER:
   8436     case META_COND_VERSION:
   8437     case META_LOOKAHEAD:
   8438     case META_LOOKAHEADNOT:
   8439     case META_LOOKBEHIND:
   8440     case META_LOOKBEHINDNOT:
   8441     case META_NOCAPTURE:
   8442     nestlevel++;
   8443     break;
   8444 
   8445     case META_ALT:
   8446     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
   8447     break;
   8448 
   8449     case META_KET:
   8450     if (nestlevel == 0) return pptr;
   8451     nestlevel--;
   8452     break;
   8453     }
   8454 
   8455   /* The extra data item length for each meta is in a table. */
   8456 
   8457   meta = (meta >> 16) & 0x7fff;
   8458   if (meta >= sizeof(meta_extra_lengths)) return NULL;
   8459   pptr += meta_extra_lengths[meta];
   8460   }
   8461 /* Control never reaches here */
   8462 return pptr;
   8463 }
   8464 
   8465 
   8466 
   8467 /*************************************************
   8468 *       Find length of a parsed group            *
   8469 *************************************************/
   8470 
   8471 /* This is called for nested groups within a branch of a lookbehind whose
   8472 length is being computed. If all the branches in the nested group have the same
   8473 length, that is OK. On entry, the pointer must be at the first element after
   8474 the group initializing code. On exit it points to OP_KET. Caching is used to
   8475 improve processing speed when the same capturing group occurs many times.
   8476 
   8477 Arguments:
   8478   pptrptr     pointer to pointer in the parsed pattern
   8479   isinline    FALSE if a reference or recursion; TRUE for inline group
   8480   errcodeptr  pointer to the errorcode
   8481   lcptr       pointer to the loop counter
   8482   group       number of captured group or -1 for a non-capturing group
   8483   recurses    chain of recurse_check to catch mutual recursion
   8484   cb          pointer to the compile data
   8485 
   8486 Returns:      the group length or a negative number
   8487 */
   8488 
   8489 static int
   8490 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
   8491   int group, parsed_recurse_check *recurses, compile_block *cb)
   8492 {
   8493 int branchlength;
   8494 int grouplength = -1;
   8495 
   8496 /* The cache can be used only if there is no possibility of there being two
   8497 groups with the same number. We do not need to set the end pointer for a group
   8498 that is being processed as a back reference or recursion, but we must do so for
   8499 an inline group. */
   8500 
   8501 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
   8502   {
   8503   uint32_t groupinfo = cb->groupinfo[group];
   8504   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
   8505   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
   8506     {
   8507     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
   8508     return groupinfo & GI_FIXED_LENGTH_MASK;
   8509     }
   8510   }
   8511 
   8512 /* Scan the group. In this case we find the end pointer of necessity. */
   8513 
   8514 for(;;)
   8515   {
   8516   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
   8517   if (branchlength < 0) goto ISNOTFIXED;
   8518   if (grouplength == -1) grouplength = branchlength;
   8519     else if (grouplength != branchlength) goto ISNOTFIXED;
   8520   if (**pptrptr == META_KET) break;
   8521   *pptrptr += 1;   /* Skip META_ALT */
   8522   }
   8523 
   8524 if (group > 0)
   8525   cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
   8526 return grouplength;
   8527 
   8528 ISNOTFIXED:
   8529 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
   8530 return -1;
   8531 }
   8532 
   8533 
   8534 
   8535 /*************************************************
   8536 *        Find length of a parsed branch          *
   8537 *************************************************/
   8538 
   8539 /* Return a fixed length for a branch in a lookbehind, giving an error if the
   8540 length is not fixed. If any lookbehinds are encountered on the way, they get
   8541 their length set. On entry, *pptrptr points to the first element inside the
   8542 branch. On exit it is set to point to the ALT or KET.
   8543 
   8544 Arguments:
   8545   pptrptr     pointer to pointer in the parsed pattern
   8546   errcodeptr  pointer to error code
   8547   lcptr       pointer to loop counter
   8548   recurses    chain of recurse_check to catch mutual recursion
   8549   cb          pointer to compile block
   8550 
   8551 Returns:      the length, or a negative value on error
   8552 */
   8553 
   8554 static int
   8555 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
   8556   parsed_recurse_check *recurses, compile_block *cb)
   8557 {
   8558 int branchlength = 0;
   8559 int grouplength;
   8560 uint32_t lastitemlength = 0;
   8561 uint32_t *pptr = *pptrptr;
   8562 PCRE2_SIZE offset;
   8563 parsed_recurse_check this_recurse;
   8564 
   8565 /* A large and/or complex regex can take too long to process. This can happen
   8566 more often when (?| groups are present in the pattern because their length
   8567 cannot be cached. */
   8568 
   8569 if ((*lcptr)++ > 2000)
   8570   {
   8571   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
   8572   return -1;
   8573   }
   8574 
   8575 /* Scan the branch, accumulating the length. */
   8576 
   8577 for (;; pptr++)
   8578   {
   8579   parsed_recurse_check *r;
   8580   uint32_t *gptr, *gptrend;
   8581   uint32_t escape;
   8582   uint32_t group = 0;
   8583   uint32_t itemlength = 0;
   8584 
   8585   if (*pptr < META_END)
   8586     {
   8587     itemlength = 1;
   8588     }
   8589 
   8590   else switch (META_CODE(*pptr))
   8591     {
   8592     case META_KET:
   8593     case META_ALT:
   8594     goto EXIT;
   8595 
   8596     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
   8597     actual termination. */
   8598 
   8599     case META_ACCEPT:
   8600     case META_FAIL:
   8601     pptr = parsed_skip(pptr, PSKIP_ALT);
   8602     if (pptr == NULL) goto PARSED_SKIP_FAILED;
   8603     goto EXIT;
   8604 
   8605     case META_MARK:
   8606     case META_COMMIT_ARG:
   8607     case META_PRUNE_ARG:
   8608     case META_SKIP_ARG:
   8609     case META_THEN_ARG:
   8610     pptr += pptr[1] + 1;
   8611     break;
   8612 
   8613     case META_CIRCUMFLEX:
   8614     case META_COMMIT:
   8615     case META_DOLLAR:
   8616     case META_PRUNE:
   8617     case META_SKIP:
   8618     case META_THEN:
   8619     break;
   8620 
   8621     case META_OPTIONS:
   8622     pptr += 1;
   8623     break;
   8624 
   8625     case META_BIGVALUE:
   8626     itemlength = 1;
   8627     pptr += 1;
   8628     break;
   8629 
   8630     case META_CLASS:
   8631     case META_CLASS_NOT:
   8632     itemlength = 1;
   8633     pptr = parsed_skip(pptr, PSKIP_CLASS);
   8634     if (pptr == NULL) goto PARSED_SKIP_FAILED;
   8635     break;
   8636 
   8637     case META_CLASS_EMPTY_NOT:
   8638     case META_DOT:
   8639     itemlength = 1;
   8640     break;
   8641 
   8642     case META_CALLOUT_NUMBER:
   8643     pptr += 3;
   8644     break;
   8645 
   8646     case META_CALLOUT_STRING:
   8647     pptr += 3 + SIZEOFFSET;
   8648     break;
   8649 
   8650     /* Only some escapes consume a character. Of those, \R and \X are never
   8651     allowed because they might match more than character. \C is allowed only in
   8652     32-bit and non-UTF 8/16-bit modes. */
   8653 
   8654     case META_ESCAPE:
   8655     escape = META_DATA(*pptr);
   8656     if (escape == ESC_R || escape == ESC_X) return -1;
   8657     if (escape > ESC_b && escape < ESC_Z)
   8658       {
   8659 #if PCRE2_CODE_UNIT_WIDTH != 32
   8660       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
   8661         {
   8662         *errcodeptr = ERR36;
   8663         return -1;
   8664         }
   8665 #endif
   8666       itemlength = 1;
   8667       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
   8668       }
   8669     break;
   8670 
   8671     /* Lookaheads can be ignored, but we must start the skip inside the group
   8672     so that it isn't treated as a group within the branch. */
   8673 
   8674     case META_LOOKAHEAD:
   8675     case META_LOOKAHEADNOT:
   8676     pptr = parsed_skip(pptr + 1, PSKIP_KET);
   8677     if (pptr == NULL) goto PARSED_SKIP_FAILED;
   8678 
   8679     /* Also ignore any qualifiers that follow a lookahead assertion. */
   8680 
   8681     switch (pptr[1])
   8682       {
   8683       case META_ASTERISK:
   8684       case META_ASTERISK_PLUS:
   8685       case META_ASTERISK_QUERY:
   8686       case META_PLUS:
   8687       case META_PLUS_PLUS:
   8688       case META_PLUS_QUERY:
   8689       case META_QUERY:
   8690       case META_QUERY_PLUS:
   8691       case META_QUERY_QUERY:
   8692       pptr++;
   8693       break;
   8694 
   8695       case META_MINMAX:
   8696       case META_MINMAX_PLUS:
   8697       case META_MINMAX_QUERY:
   8698       pptr += 3;
   8699       break;
   8700 
   8701       default:
   8702       break;
   8703       }
   8704     break;
   8705 
   8706     /* Lookbehinds can be ignored, but must themselves be checked. */
   8707 
   8708     case META_LOOKBEHIND:
   8709     case META_LOOKBEHINDNOT:
   8710     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
   8711       return -1;
   8712     break;
   8713 
   8714     /* Back references and recursions are handled by very similar code. At this
   8715     stage, the names generated in the parsing pass are available, but the main
   8716     name table has not yet been created. So for the named varieties, scan the
   8717     list of names in order to get the number of the first one in the pattern,
   8718     and whether or not this name is duplicated. */
   8719 
   8720     case META_BACKREF_BYNAME:
   8721     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
   8722       goto ISNOTFIXED;
   8723     /* Fall through */
   8724 
   8725     case META_RECURSE_BYNAME:
   8726       {
   8727       int i;
   8728       PCRE2_SPTR name;
   8729       BOOL is_dupname = FALSE;
   8730       named_group *ng = cb->named_groups;
   8731       uint32_t meta_code = META_CODE(*pptr);
   8732       uint32_t length = *(++pptr);
   8733 
   8734       GETPLUSOFFSET(offset, pptr);
   8735       name = cb->start_pattern + offset;
   8736       for (i = 0; i < cb->names_found; i++, ng++)
   8737         {
   8738         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
   8739           {
   8740           group = ng->number;
   8741           is_dupname = ng->isdup;
   8742           break;
   8743           }
   8744         }
   8745 
   8746       if (group == 0)
   8747         {
   8748         *errcodeptr = ERR15;  /* Non-existent subpattern */
   8749         cb->erroroffset = offset;
   8750         return -1;
   8751         }
   8752 
   8753       /* A numerical back reference can be fixed length if duplicate capturing
   8754       groups are not being used. A non-duplicate named back reference can also
   8755       be handled. */
   8756 
   8757       if (meta_code == META_RECURSE_BYNAME ||
   8758           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
   8759         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
   8760       }
   8761     goto ISNOTFIXED;                     /* Duplicate name or number */
   8762 
   8763     /* The offset values for back references < 10 are in a separate vector
   8764     because otherwise they would use more than two parsed pattern elements on
   8765     64-bit systems. */
   8766 
   8767     case META_BACKREF:
   8768     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
   8769         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
   8770       goto ISNOTFIXED;
   8771     group = META_DATA(*pptr);
   8772     if (group < 10)
   8773       {
   8774       offset = cb->small_ref_offset[group];
   8775       goto RECURSE_OR_BACKREF_LENGTH;
   8776       }
   8777 
   8778     /* Fall through */
   8779     /* For groups >= 10 - picking up group twice does no harm. */
   8780 
   8781     /* A true recursion implies not fixed length, but a subroutine call may
   8782     be OK. Back reference "recursions" are also failed. */
   8783 
   8784     case META_RECURSE:
   8785     group = META_DATA(*pptr);
   8786     GETPLUSOFFSET(offset, pptr);
   8787 
   8788     RECURSE_OR_BACKREF_LENGTH:
   8789     if (group > cb->bracount)
   8790       {
   8791       cb->erroroffset = offset;
   8792       *errcodeptr = ERR15;  /* Non-existent subpattern */
   8793       return -1;
   8794       }
   8795     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
   8796     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
   8797       {
   8798       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
   8799         else if (*gptr == (META_CAPTURE | group)) break;
   8800       }
   8801 
   8802     /* We must start the search for the end of the group at the first meta code
   8803     inside the group. Otherwise it will be treated as an enclosed group. */
   8804 
   8805     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
   8806     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
   8807     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
   8808     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
   8809     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
   8810     this_recurse.prev = recurses;
   8811     this_recurse.groupptr = gptr;
   8812 
   8813     /* We do not need to know the position of the end of the group, that is,
   8814     gptr is not used after the call to get_grouplength(). Setting the second
   8815     argument FALSE stops it scanning for the end when the length can be found
   8816     in the cache. */
   8817 
   8818     gptr++;
   8819     grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
   8820       &this_recurse, cb);
   8821     if (grouplength < 0)
   8822       {
   8823       if (*errcodeptr == 0) goto ISNOTFIXED;
   8824       return -1;  /* Error already set */
   8825       }
   8826     itemlength = grouplength;
   8827     break;
   8828 
   8829     /* Check nested groups - advance past the initial data for each type and
   8830     then seek a fixed length with get_grouplength(). */
   8831 
   8832     case META_COND_NAME:
   8833     case META_COND_NUMBER:
   8834     case META_COND_RNAME:
   8835     case META_COND_RNUMBER:
   8836     case META_COND_DEFINE:
   8837     pptr += 2 + SIZEOFFSET;
   8838     goto CHECK_GROUP;
   8839 
   8840     case META_COND_ASSERT:
   8841     pptr += 1;
   8842     goto CHECK_GROUP;
   8843 
   8844     case META_COND_VERSION:
   8845     pptr += 4;
   8846     goto CHECK_GROUP;
   8847 
   8848     case META_CAPTURE:
   8849     group = META_DATA(*pptr);
   8850     /* Fall through */
   8851 
   8852     case META_ATOMIC:
   8853     case META_NOCAPTURE:
   8854     pptr++;
   8855     CHECK_GROUP:
   8856     grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
   8857       recurses, cb);
   8858     if (grouplength < 0) return -1;
   8859     itemlength = grouplength;
   8860     break;
   8861 
   8862     /* Exact repetition is OK; variable repetition is not. A repetition of zero
   8863     must subtract the length that has already been added. */
   8864 
   8865     case META_MINMAX:
   8866     case META_MINMAX_PLUS:
   8867     case META_MINMAX_QUERY:
   8868     if (pptr[1] == pptr[2])
   8869       {
   8870       if (pptr[1] == 0) branchlength -= lastitemlength;
   8871         else itemlength = (pptr[1] - 1) * lastitemlength;
   8872       pptr += 2;
   8873       break;
   8874       }
   8875     /* Fall through */
   8876 
   8877     /* Any other item means this branch does not have a fixed length. */
   8878 
   8879     default:
   8880     ISNOTFIXED:
   8881     *errcodeptr = ERR25;   /* Not fixed length */
   8882     return -1;
   8883     }
   8884 
   8885   /* Add the item length to the branchlength, and save it for use if the next
   8886   thing is a quantifier. */
   8887 
   8888   branchlength += itemlength;
   8889   lastitemlength = itemlength;
   8890 
   8891   /* Ensure that the length does not overflow the limit. */
   8892 
   8893   if (branchlength > LOOKBEHIND_MAX)
   8894     {
   8895     *errcodeptr = ERR87;
   8896     return -1;
   8897     }
   8898   }
   8899 
   8900 EXIT:
   8901 *pptrptr = pptr;
   8902 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
   8903 return branchlength;
   8904 
   8905 PARSED_SKIP_FAILED:
   8906 *errcodeptr = ERR90;
   8907 return -1;
   8908 }
   8909 
   8910 
   8911 
   8912 /*************************************************
   8913 *        Set lengths in a lookbehind             *
   8914 *************************************************/
   8915 
   8916 /* This function is called for each lookbehind, to set the lengths in its
   8917 branches. An error occurs if any branch does not have a fixed length that is
   8918 less than the maximum (65535). On exit, the pointer must be left on the final
   8919 ket.
   8920 
   8921 Arguments:
   8922   pptrptr     pointer to pointer in the parsed pattern
   8923   errcodeptr  pointer to error code
   8924   lcptr       pointer to loop counter
   8925   recurses    chain of recurse_check to catch mutual recursion
   8926   cb          pointer to compile block
   8927 
   8928 Returns:      TRUE if all is well
   8929               FALSE otherwise, with error code and offset set
   8930 */
   8931 
   8932 static BOOL
   8933 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
   8934   parsed_recurse_check *recurses, compile_block *cb)
   8935 {
   8936 PCRE2_SIZE offset;
   8937 int branchlength;
   8938 uint32_t *bptr = *pptrptr;
   8939 
   8940 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
   8941 *pptrptr += SIZEOFFSET;
   8942 
   8943 do
   8944   {
   8945   *pptrptr += 1;
   8946   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
   8947   if (branchlength < 0)
   8948     {
   8949     /* The errorcode and offset may already be set from a nested lookbehind. */
   8950     if (*errcodeptr == 0) *errcodeptr = ERR25;
   8951     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
   8952     return FALSE;
   8953     }
   8954   *bptr |= branchlength;  /* branchlength never more than 65535 */
   8955   bptr = *pptrptr;
   8956   }
   8957 while (*bptr == META_ALT);
   8958 
   8959 return TRUE;
   8960 }
   8961 
   8962 
   8963 
   8964 /*************************************************
   8965 *         Check parsed pattern lookbehinds       *
   8966 *************************************************/
   8967 
   8968 /* This function is called at the end of parsing a pattern if any lookbehinds
   8969 were encountered. It scans the parsed pattern for them, calling
   8970 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
   8971 the error offset is marked unset. The enables the functions above not to
   8972 override settings from deeper nestings.
   8973 
   8974 Arguments cb      points to the compile block
   8975 Returns:          0 on success, or an errorcode (cb->erroroffset will be set)
   8976 */
   8977 
   8978 static int
   8979 check_lookbehinds(compile_block *cb)
   8980 {
   8981 uint32_t *pptr;
   8982 int errorcode = 0;
   8983 int loopcount = 0;
   8984 
   8985 cb->erroroffset = PCRE2_UNSET;
   8986 
   8987 for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++)
   8988   {
   8989   if (*pptr < META_END) continue;  /* Literal */
   8990 
   8991   switch (META_CODE(*pptr))
   8992     {
   8993     default:
   8994     return ERR70;  /* Unrecognized meta code */
   8995 
   8996     case META_ESCAPE:
   8997     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
   8998       pptr += 1;
   8999     break;
   9000 
   9001     case META_ACCEPT:
   9002     case META_ALT:
   9003     case META_ASTERISK:
   9004     case META_ASTERISK_PLUS:
   9005     case META_ASTERISK_QUERY:
   9006     case META_ATOMIC:
   9007     case META_BACKREF:
   9008     case META_CAPTURE:
   9009     case META_CIRCUMFLEX:
   9010     case META_CLASS:
   9011     case META_CLASS_EMPTY:
   9012     case META_CLASS_EMPTY_NOT:
   9013     case META_CLASS_END:
   9014     case META_CLASS_NOT:
   9015     case META_COMMIT:
   9016     case META_COND_ASSERT:
   9017     case META_DOLLAR:
   9018     case META_DOT:
   9019     case META_FAIL:
   9020     case META_KET:
   9021     case META_LOOKAHEAD:
   9022     case META_LOOKAHEADNOT:
   9023     case META_NOCAPTURE:
   9024     case META_PLUS:
   9025     case META_PLUS_PLUS:
   9026     case META_PLUS_QUERY:
   9027     case META_PRUNE:
   9028     case META_QUERY:
   9029     case META_QUERY_PLUS:
   9030     case META_QUERY_QUERY:
   9031     case META_RANGE_ESCAPED:
   9032     case META_RANGE_LITERAL:
   9033     case META_SKIP:
   9034     case META_THEN:
   9035     break;
   9036 
   9037     case META_RECURSE:
   9038     pptr += SIZEOFFSET;
   9039     break;
   9040 
   9041     case META_BACKREF_BYNAME:
   9042     case META_COND_DEFINE:
   9043     case META_COND_NAME:
   9044     case META_COND_NUMBER:
   9045     case META_COND_RNAME:
   9046     case META_COND_RNUMBER:
   9047     case META_RECURSE_BYNAME:
   9048     pptr += 1 + SIZEOFFSET;
   9049     break;
   9050 
   9051     case META_CALLOUT_STRING:
   9052     pptr += 3 + SIZEOFFSET;
   9053     break;
   9054 
   9055     case META_BIGVALUE:
   9056     case META_OPTIONS:
   9057     case META_POSIX:
   9058     case META_POSIX_NEG:
   9059     pptr += 1;
   9060     break;
   9061 
   9062     case META_MINMAX:
   9063     case META_MINMAX_QUERY:
   9064     case META_MINMAX_PLUS:
   9065     pptr += 2;
   9066     break;
   9067 
   9068     case META_CALLOUT_NUMBER:
   9069     case META_COND_VERSION:
   9070     pptr += 3;
   9071     break;
   9072 
   9073     case META_MARK:
   9074     case META_COMMIT_ARG:
   9075     case META_PRUNE_ARG:
   9076     case META_SKIP_ARG:
   9077     case META_THEN_ARG:
   9078     pptr += 1 + pptr[1];
   9079     break;
   9080 
   9081     case META_LOOKBEHIND:
   9082     case META_LOOKBEHINDNOT:
   9083     if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, NULL, cb))
   9084       return errorcode;
   9085     break;
   9086     }
   9087   }
   9088 
   9089 return 0;
   9090 }
   9091 
   9092 
   9093 
   9094 /*************************************************
   9095 *     External function to compile a pattern     *
   9096 *************************************************/
   9097 
   9098 /* This function reads a regular expression in the form of a string and returns
   9099 a pointer to a block of store holding a compiled version of the expression.
   9100 
   9101 Arguments:
   9102   pattern       the regular expression
   9103   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
   9104   options       option bits
   9105   errorptr      pointer to errorcode
   9106   erroroffset   pointer to error offset
   9107   ccontext      points to a compile context or is NULL
   9108 
   9109 Returns:        pointer to compiled data block, or NULL on error,
   9110                 with errorcode and erroroffset set
   9111 */
   9112 
   9113 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
   9114 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
   9115    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
   9116 {
   9117 BOOL utf;                             /* Set TRUE for UTF mode */
   9118 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
   9119 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
   9120 pcre2_real_code *re = NULL;           /* What we will return */
   9121 compile_block cb;                     /* "Static" compile-time data */
   9122 const uint8_t *tables;                /* Char tables base pointer */
   9123 
   9124 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
   9125 PCRE2_SPTR codestart;                 /* Start of compiled code */
   9126 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
   9127 uint32_t *pptr;                       /* Current pointer in parsed pattern */
   9128 
   9129 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
   9130 PCRE2_SIZE usedlength;                /* Actual length used */
   9131 PCRE2_SIZE re_blocksize;              /* Size of memory block */
   9132 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
   9133 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
   9134 
   9135 int32_t firstcuflags, reqcuflags;     /* Type of first/req code unit */
   9136 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
   9137 uint32_t setflags = 0;                /* NL and BSR set flags */
   9138 
   9139 uint32_t skipatstart;                 /* When checking (*UTF) etc */
   9140 uint32_t limit_heap  = UINT32_MAX;
   9141 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
   9142 uint32_t limit_depth = UINT32_MAX;
   9143 
   9144 int newline = 0;                      /* Unset; can be set by the pattern */
   9145 int bsr = 0;                          /* Unset; can be set by the pattern */
   9146 int errorcode = 0;                    /* Initialize to avoid compiler warn */
   9147 int regexrc;                          /* Return from compile */
   9148 
   9149 uint32_t i;                           /* Local loop counter */
   9150 
   9151 /* Comments at the head of this file explain about these variables. */
   9152 
   9153 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
   9154 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
   9155 named_group named_groups[NAMED_GROUP_LIST_SIZE];
   9156 
   9157 /* The workspace is used in different ways in the different compiling phases.
   9158 It needs to be 16-bit aligned for the preliminary parsing scan. */
   9159 
   9160 uint32_t c16workspace[C16_WORK_SIZE];
   9161 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
   9162 
   9163 
   9164 /* -------------- Check arguments and set up the pattern ----------------- */
   9165 
   9166 /* There must be error code and offset pointers. */
   9167 
   9168 if (errorptr == NULL || erroroffset == NULL) return NULL;
   9169 *errorptr = ERR0;
   9170 *erroroffset = 0;
   9171 
   9172 /* There must be a pattern! */
   9173 
   9174 if (pattern == NULL)
   9175   {
   9176   *errorptr = ERR16;
   9177   return NULL;
   9178   }
   9179 
   9180 /* A NULL compile context means "use a default context" */
   9181 
   9182 if (ccontext == NULL)
   9183   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
   9184 
   9185 /* Check that all undefined public option bits are zero. */
   9186 
   9187 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
   9188     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
   9189   {
   9190   *errorptr = ERR17;
   9191   return NULL;
   9192   }
   9193 
   9194 if ((options & PCRE2_LITERAL) != 0 &&
   9195     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
   9196      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
   9197   {
   9198   *errorptr = ERR92;
   9199   return NULL;
   9200   }
   9201 
   9202 /* A zero-terminated pattern is indicated by the special length value
   9203 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
   9204 
   9205 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
   9206   patlen = PRIV(strlen)(pattern);
   9207 
   9208 if (patlen > ccontext->max_pattern_length)
   9209   {
   9210   *errorptr = ERR88;
   9211   return NULL;
   9212   }
   9213 
   9214 /* From here on, all returns from this function should end up going via the
   9215 EXIT label. */
   9216 
   9217 
   9218 /* ------------ Initialize the "static" compile data -------------- */
   9219 
   9220 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
   9221 
   9222 cb.lcc = tables + lcc_offset;          /* Individual */
   9223 cb.fcc = tables + fcc_offset;          /*   character */
   9224 cb.cbits = tables + cbits_offset;      /*      tables */
   9225 cb.ctypes = tables + ctypes_offset;
   9226 
   9227 cb.assert_depth = 0;
   9228 cb.bracount = 0;
   9229 cb.cx = ccontext;
   9230 cb.dupnames = FALSE;
   9231 cb.end_pattern = pattern + patlen;
   9232 cb.erroroffset = 0;
   9233 cb.external_flags = 0;
   9234 cb.external_options = options;
   9235 cb.groupinfo = stack_groupinfo;
   9236 cb.had_recurse = FALSE;
   9237 cb.lastcapture = 0;
   9238 cb.max_lookbehind = 0;
   9239 cb.name_entry_size = 0;
   9240 cb.name_table = NULL;
   9241 cb.named_groups = named_groups;
   9242 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
   9243 cb.names_found = 0;
   9244 cb.open_caps = NULL;
   9245 cb.parens_depth = 0;
   9246 cb.parsed_pattern = stack_parsed_pattern;
   9247 cb.req_varyopt = 0;
   9248 cb.start_code = cworkspace;
   9249 cb.start_pattern = pattern;
   9250 cb.start_workspace = cworkspace;
   9251 cb.workspace_size = COMPILE_WORK_SIZE;
   9252 
   9253 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
   9254 references to help in deciding whether (.*) can be treated as anchored or not.
   9255 */
   9256 
   9257 cb.top_backref = 0;
   9258 cb.backref_map = 0;
   9259 
   9260 /* Escape sequences \1 to \9 are always back references, but as they are only
   9261 two characters long, only two elements can be used in the parsed_pattern
   9262 vector. The first contains the reference, and we'd like to use the second to
   9263 record the offset in the pattern, so that forward references to non-existent
   9264 groups can be diagnosed later with an offset. However, on 64-bit systems,
   9265 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
   9266 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
   9267 references have enough space for the offset to be put into the parsed pattern.
   9268 */
   9269 
   9270 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
   9271 
   9272 
   9273 /* --------------- Start looking at the pattern --------------- */
   9274 
   9275 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
   9276 the start of the pattern, and remember the offset to the actual regex. With
   9277 valgrind support, make the terminator of a zero-terminated pattern
   9278 inaccessible. This catches bugs that would otherwise only show up for
   9279 non-zero-terminated patterns. */
   9280 
   9281 #ifdef SUPPORT_VALGRIND
   9282 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
   9283 #endif
   9284 
   9285 ptr = pattern;
   9286 skipatstart = 0;
   9287 
   9288 if ((options & PCRE2_LITERAL) == 0)
   9289   {
   9290   while (patlen - skipatstart >= 2 &&
   9291          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
   9292          ptr[skipatstart+1] == CHAR_ASTERISK)
   9293     {
   9294     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
   9295       {
   9296       uint32_t c, pp;
   9297       pso *p = pso_list + i;
   9298 
   9299       if (patlen - skipatstart - 2 >= p->length &&
   9300           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
   9301             p->length) == 0)
   9302         {
   9303         skipatstart += p->length + 2;
   9304         switch(p->type)
   9305           {
   9306           case PSO_OPT:
   9307           cb.external_options |= p->value;
   9308           break;
   9309 
   9310           case PSO_FLG:
   9311           setflags |= p->value;
   9312           break;
   9313 
   9314           case PSO_NL:
   9315           newline = p->value;
   9316           setflags |= PCRE2_NL_SET;
   9317           break;
   9318 
   9319           case PSO_BSR:
   9320           bsr = p->value;
   9321           setflags |= PCRE2_BSR_SET;
   9322           break;
   9323 
   9324           case PSO_LIMM:
   9325           case PSO_LIMD:
   9326           case PSO_LIMH:
   9327           c = 0;
   9328           pp = skipatstart;
   9329           if (!IS_DIGIT(ptr[pp]))
   9330             {
   9331             errorcode = ERR60;
   9332             ptr += pp;
   9333             goto HAD_EARLY_ERROR;
   9334             }
   9335           while (IS_DIGIT(ptr[pp]))
   9336             {
   9337             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
   9338             c = c*10 + (ptr[pp++] - CHAR_0);
   9339             }
   9340           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
   9341             {
   9342             errorcode = ERR60;
   9343             ptr += pp;
   9344             goto HAD_EARLY_ERROR;
   9345             }
   9346           if (p->type == PSO_LIMH) limit_heap = c;
   9347             else if (p->type == PSO_LIMM) limit_match = c;
   9348             else limit_depth = c;
   9349           skipatstart += pp - skipatstart;
   9350           break;
   9351           }
   9352         break;   /* Out of the table scan loop */
   9353         }
   9354       }
   9355     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
   9356     }
   9357   }
   9358 
   9359 /* End of pattern-start options; advance to start of real regex. */
   9360 
   9361 ptr += skipatstart;
   9362 
   9363 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
   9364 
   9365 #ifndef SUPPORT_UNICODE
   9366 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
   9367   {
   9368   errorcode = ERR32;
   9369   goto HAD_EARLY_ERROR;
   9370   }
   9371 #endif
   9372 
   9373 /* Check UTF. We have the original options in 'options', with that value as
   9374 modified by (*UTF) etc in cb.external_options. The extra option
   9375 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
   9376 surrogate code points cannot be represented in UTF-16. */
   9377 
   9378 utf = (cb.external_options & PCRE2_UTF) != 0;
   9379 if (utf)
   9380   {
   9381   if ((options & PCRE2_NEVER_UTF) != 0)
   9382     {
   9383     errorcode = ERR74;
   9384     goto HAD_EARLY_ERROR;
   9385     }
   9386   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
   9387        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
   9388     goto HAD_ERROR;  /* Offset was set by valid_utf() */
   9389 
   9390 #if PCRE2_CODE_UNIT_WIDTH == 16
   9391   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
   9392     {
   9393     errorcode = ERR91;
   9394     goto HAD_EARLY_ERROR;
   9395     }
   9396 #endif
   9397   }
   9398 
   9399 /* Check UCP lockout. */
   9400 
   9401 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
   9402     (PCRE2_UCP|PCRE2_NEVER_UCP))
   9403   {
   9404   errorcode = ERR75;
   9405   goto HAD_EARLY_ERROR;
   9406   }
   9407 
   9408 /* Process the BSR setting. */
   9409 
   9410 if (bsr == 0) bsr = ccontext->bsr_convention;
   9411 
   9412 /* Process the newline setting. */
   9413 
   9414 if (newline == 0) newline = ccontext->newline_convention;
   9415 cb.nltype = NLTYPE_FIXED;
   9416 switch(newline)
   9417   {
   9418   case PCRE2_NEWLINE_CR:
   9419   cb.nllen = 1;
   9420   cb.nl[0] = CHAR_CR;
   9421   break;
   9422 
   9423   case PCRE2_NEWLINE_LF:
   9424   cb.nllen = 1;
   9425   cb.nl[0] = CHAR_NL;
   9426   break;
   9427 
   9428   case PCRE2_NEWLINE_NUL:
   9429   cb.nllen = 1;
   9430   cb.nl[0] = CHAR_NUL;
   9431   break;
   9432 
   9433   case PCRE2_NEWLINE_CRLF:
   9434   cb.nllen = 2;
   9435   cb.nl[0] = CHAR_CR;
   9436   cb.nl[1] = CHAR_NL;
   9437   break;
   9438 
   9439   case PCRE2_NEWLINE_ANY:
   9440   cb.nltype = NLTYPE_ANY;
   9441   break;
   9442 
   9443   case PCRE2_NEWLINE_ANYCRLF:
   9444   cb.nltype = NLTYPE_ANYCRLF;
   9445   break;
   9446 
   9447   default:
   9448   errorcode = ERR56;
   9449   goto HAD_EARLY_ERROR;
   9450   }
   9451 
   9452 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
   9453 their numerical equivalents, so that this information is always available for
   9454 the remaining processing. (2) At the same time, parse the pattern and put a
   9455 processed version into the parsed_pattern vector. This has escapes interpreted
   9456 and comments removed (amongst other things).
   9457 
   9458 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
   9459 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
   9460 one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
   9461 PCRE2_EXTRA_MATCH_LINE is set. The exception is 32-bit, non-UTF mode, where
   9462 literal characters of META_END (0x80000000) or above have to be coded as two
   9463 units. In this case, therefore, we scan the pattern to check for such values. */
   9464 
   9465 #if PCRE2_CODE_UNIT_WIDTH == 32
   9466 if (!utf)
   9467   {
   9468   PCRE2_SPTR p;
   9469   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
   9470   }
   9471 #endif
   9472 
   9473 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
   9474 is set we have to assume a numerical callout (4 elements) for each character
   9475 plus one at the end. This is overkill, but memory is plentiful these days. For
   9476 many smaller patterns the vector on the stack (which was set up above) can be
   9477 used. */
   9478 
   9479 parsed_size_needed = patlen - skipatstart + big32count;
   9480 
   9481 if ((ccontext->extra_options &
   9482      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
   9483   parsed_size_needed += 4;
   9484 
   9485 if ((options & PCRE2_AUTO_CALLOUT) != 0)
   9486   parsed_size_needed = (parsed_size_needed + 1) * 5;
   9487 
   9488 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
   9489   {
   9490   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
   9491     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
   9492   if (heap_parsed_pattern == NULL)
   9493     {
   9494     *errorptr = ERR21;
   9495     goto EXIT;
   9496     }
   9497   cb.parsed_pattern = heap_parsed_pattern;
   9498   }
   9499 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
   9500 
   9501 /* Do the parsing scan. */
   9502 
   9503 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
   9504 if (errorcode != 0) goto HAD_CB_ERROR;
   9505 
   9506 /* Workspace is needed to remember information about numbered groups: whether a
   9507 group can match an empty string and what its fixed length is. This is done to
   9508 avoid the possibility of recursive references causing very long compile times
   9509 when checking these features. Unnumbered groups do not have this exposure since
   9510 they cannot be referenced. We use an indexed vector for this purpose. If there
   9511 are sufficiently few groups, the default vector on the stack, as set up above,
   9512 can be used. Otherwise we have to get/free a special vector. The vector must be
   9513 initialized to zero. */
   9514 
   9515 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
   9516   {
   9517   cb.groupinfo = ccontext->memctl.malloc(
   9518     (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
   9519   if (cb.groupinfo == NULL)
   9520     {
   9521     errorcode = ERR21;
   9522     cb.erroroffset = 0;
   9523     goto HAD_CB_ERROR;
   9524     }
   9525   }
   9526 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
   9527 
   9528 /* If there were any lookbehinds, scan the parsed pattern to figure out their
   9529 lengths. */
   9530 
   9531 if (has_lookbehind)
   9532   {
   9533   errorcode = check_lookbehinds(&cb);
   9534   if (errorcode != 0) goto HAD_CB_ERROR;
   9535   }
   9536 
   9537 /* For debugging, there is a function that shows the parsed data vector. */
   9538 
   9539 #ifdef DEBUG_SHOW_PARSED
   9540 fprintf(stderr, "+++ Pre-scan complete:\n");
   9541 show_parsed(&cb);
   9542 #endif
   9543 
   9544 /* For debugging capturing information this code can be enabled. */
   9545 
   9546 #ifdef DEBUG_SHOW_CAPTURES
   9547   {
   9548   named_group *ng = cb.named_groups;
   9549   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
   9550   for (i = 0; i < cb.names_found; i++, ng++)
   9551     {
   9552     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
   9553     }
   9554   }
   9555 #endif
   9556 
   9557 /* Pretend to compile the pattern while actually just accumulating the amount
   9558 of memory required in the 'length' variable. This behaviour is triggered by
   9559 passing a non-NULL final argument to compile_regex(). We pass a block of
   9560 workspace (cworkspace) for it to compile parts of the pattern into; the
   9561 compiled code is discarded when it is no longer needed, so hopefully this
   9562 workspace will never overflow, though there is a test for its doing so.
   9563 
   9564 On error, errorcode will be set non-zero, so we don't need to look at the
   9565 result of the function. The initial options have been put into the cb block,
   9566 but we still have to pass a separate options variable (the first argument)
   9567 because the options may change as the pattern is processed. */
   9568 
   9569 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
   9570 pptr = cb.parsed_pattern;
   9571 code = cworkspace;
   9572 *code = OP_BRA;
   9573 
   9574 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
   9575    &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
   9576 
   9577 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
   9578 
   9579 /* This should be caught in compile_regex(), but just in case... */
   9580 
   9581 if (length > MAX_PATTERN_SIZE)
   9582   {
   9583   errorcode = ERR20;
   9584   goto HAD_CB_ERROR;
   9585   }
   9586 
   9587 /* Compute the size of, and then get and initialize, the data block for storing
   9588 the compiled pattern and names table. Integer overflow should no longer be
   9589 possible because nowadays we limit the maximum value of cb.names_found and
   9590 cb.name_entry_size. */
   9591 
   9592 re_blocksize = sizeof(pcre2_real_code) +
   9593   CU2BYTES(length +
   9594   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
   9595 re = (pcre2_real_code *)
   9596   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
   9597 if (re == NULL)
   9598   {
   9599   errorcode = ERR21;
   9600   goto HAD_CB_ERROR;
   9601   }
   9602 
   9603 /* The compiler may put padding at the end of the pcre2_real_code structure in
   9604 order to round it up to a multiple of 4 or 8 bytes. This means that when a
   9605 compiled pattern is copied (for example, when serialized) undefined bytes are
   9606 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
   9607 write to the last 8 bytes of the structure before setting the fields. */
   9608 
   9609 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
   9610 re->memctl = ccontext->memctl;
   9611 re->tables = tables;
   9612 re->executable_jit = NULL;
   9613 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
   9614 re->blocksize = re_blocksize;
   9615 re->magic_number = MAGIC_NUMBER;
   9616 re->compile_options = options;
   9617 re->overall_options = cb.external_options;
   9618 re->extra_options = ccontext->extra_options;
   9619 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
   9620 re->limit_heap = limit_heap;
   9621 re->limit_match = limit_match;
   9622 re->limit_depth = limit_depth;
   9623 re->first_codeunit = 0;
   9624 re->last_codeunit = 0;
   9625 re->bsr_convention = bsr;
   9626 re->newline_convention = newline;
   9627 re->max_lookbehind = 0;
   9628 re->minlength = 0;
   9629 re->top_bracket = 0;
   9630 re->top_backref = 0;
   9631 re->name_entry_size = cb.name_entry_size;
   9632 re->name_count = cb.names_found;
   9633 
   9634 /* The basic block is immediately followed by the name table, and the compiled
   9635 code follows after that. */
   9636 
   9637 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
   9638   re->name_entry_size * re->name_count;
   9639 
   9640 /* Update the compile data block for the actual compile. The starting points of
   9641 the name/number translation table and of the code are passed around in the
   9642 compile data block. The start/end pattern and initial options are already set
   9643 from the pre-compile phase, as is the name_entry_size field. */
   9644 
   9645 cb.parens_depth = 0;
   9646 cb.assert_depth = 0;
   9647 cb.lastcapture = 0;
   9648 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
   9649 cb.start_code = codestart;
   9650 cb.req_varyopt = 0;
   9651 cb.had_accept = FALSE;
   9652 cb.had_pruneorskip = FALSE;
   9653 cb.open_caps = NULL;
   9654 
   9655 /* If any named groups were found, create the name/number table from the list
   9656 created in the pre-pass. */
   9657 
   9658 if (cb.names_found > 0)
   9659   {
   9660   named_group *ng = cb.named_groups;
   9661   for (i = 0; i < cb.names_found; i++, ng++)
   9662     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
   9663   }
   9664 
   9665 /* Set up a starting, non-extracting bracket, then compile the expression. On
   9666 error, errorcode will be set non-zero, so we don't need to look at the result
   9667 of the function here. */
   9668 
   9669 pptr = cb.parsed_pattern;
   9670 code = (PCRE2_UCHAR *)codestart;
   9671 *code = OP_BRA;
   9672 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
   9673   &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
   9674 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
   9675 re->top_bracket = cb.bracount;
   9676 re->top_backref = cb.top_backref;
   9677 re->max_lookbehind = cb.max_lookbehind;
   9678 
   9679 if (cb.had_accept)
   9680   {
   9681   reqcu = 0;              /* Must disable after (*ACCEPT) */
   9682   reqcuflags = REQ_NONE;
   9683   }
   9684 
   9685 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
   9686 but the estimated length exceeds the really used length, adjust the value of
   9687 re->blocksize, and if valgrind support is configured, mark the extra allocated
   9688 memory as unaddressable, so that any out-of-bound reads can be detected. */
   9689 
   9690 *code++ = OP_END;
   9691 usedlength = code - codestart;
   9692 if (usedlength > length) errorcode = ERR23; else
   9693   {
   9694   re->blocksize -= CU2BYTES(length - usedlength);
   9695 #ifdef SUPPORT_VALGRIND
   9696   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
   9697 #endif
   9698   }
   9699 
   9700 /* Scan the pattern for recursion/subroutine calls and convert the group
   9701 numbers into offsets. Maintain a small cache so that repeated groups containing
   9702 recursions are efficiently handled. */
   9703 
   9704 #define RSCAN_CACHE_SIZE 8
   9705 
   9706 if (errorcode == 0 && cb.had_recurse)
   9707   {
   9708   PCRE2_UCHAR *rcode;
   9709   PCRE2_SPTR rgroup;
   9710   unsigned int ccount = 0;
   9711   int start = RSCAN_CACHE_SIZE;
   9712   recurse_cache rc[RSCAN_CACHE_SIZE];
   9713 
   9714   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
   9715        rcode != NULL;
   9716        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
   9717     {
   9718     int p, groupnumber;
   9719 
   9720     groupnumber = (int)GET(rcode, 1);
   9721     if (groupnumber == 0) rgroup = codestart; else
   9722       {
   9723       PCRE2_SPTR search_from = codestart;
   9724       rgroup = NULL;
   9725       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
   9726         {
   9727         if (groupnumber == rc[p].groupnumber)
   9728           {
   9729           rgroup = rc[p].group;
   9730           break;
   9731           }
   9732 
   9733         /* Group n+1 must always start to the right of group n, so we can save
   9734         search time below when the new group number is greater than any of the
   9735         previously found groups. */
   9736 
   9737         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
   9738         }
   9739 
   9740       if (rgroup == NULL)
   9741         {
   9742         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
   9743         if (rgroup == NULL)
   9744           {
   9745           errorcode = ERR53;
   9746           break;
   9747           }
   9748         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
   9749         rc[start].groupnumber = groupnumber;
   9750         rc[start].group = rgroup;
   9751         if (ccount < RSCAN_CACHE_SIZE) ccount++;
   9752         }
   9753       }
   9754 
   9755     PUT(rcode, 1, rgroup - codestart);
   9756     }
   9757   }
   9758 
   9759 /* In rare debugging situations we sometimes need to look at the compiled code
   9760 at this stage. */
   9761 
   9762 #ifdef DEBUG_CALL_PRINTINT
   9763 pcre2_printint(re, stderr, TRUE);
   9764 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
   9765 #endif
   9766 
   9767 /* Unless disabled, check whether any single character iterators can be
   9768 auto-possessified. The function overwrites the appropriate opcode values, so
   9769 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
   9770 used in this code because at least one compiler gives a warning about loss of
   9771 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
   9772 function call. */
   9773 
   9774 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
   9775   {
   9776   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
   9777   if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
   9778   }
   9779 
   9780 /* Failed to compile, or error while post-processing. */
   9781 
   9782 if (errorcode != 0) goto HAD_CB_ERROR;
   9783 
   9784 /* Successful compile. If the anchored option was not passed, set it if
   9785 we can determine that the pattern is anchored by virtue of ^ characters or \A
   9786 or anything else, such as starting with non-atomic .* when DOTALL is set and
   9787 there are no occurrences of *PRUNE or *SKIP (though there is an option to
   9788 disable this case). */
   9789 
   9790 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
   9791      is_anchored(codestart, 0, &cb, 0, FALSE))
   9792   re->overall_options |= PCRE2_ANCHORED;
   9793 
   9794 /* Set up the first code unit or startline flag, the required code unit, and
   9795 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
   9796 is set, as the data it would create will not be used. Note that a first code
   9797 unit (but not the startline flag) is useful for anchored patterns because it
   9798 can still give a quick "no match" and also avoid searching for a last code
   9799 unit. */
   9800 
   9801 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
   9802   {
   9803   /* If we do not have a first code unit, see if there is one that is asserted
   9804   (these are not saved during the compile because they can cause conflicts with
   9805   actual literals that follow). */
   9806 
   9807   if (firstcuflags < 0)
   9808     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
   9809 
   9810   /* Save the data for a first code unit. */
   9811 
   9812   if (firstcuflags >= 0)
   9813     {
   9814     re->first_codeunit = firstcu;
   9815     re->flags |= PCRE2_FIRSTSET;
   9816 
   9817     /* Handle caseless first code units. */
   9818 
   9819     if ((firstcuflags & REQ_CASELESS) != 0)
   9820       {
   9821       if (firstcu < 128 || (!utf && firstcu < 255))
   9822         {
   9823         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
   9824         }
   9825 
   9826       /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
   9827       8-bit UTF mode, codepoints in the range 128-255 are introductory code
   9828       points and cannot have another case. In 16-bit and 32-bit modes, we can
   9829       check wide characters when UTF (and therefore UCP) is supported. */
   9830 
   9831 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
   9832       else if (firstcu <= MAX_UTF_CODE_POINT &&
   9833                UCD_OTHERCASE(firstcu) != firstcu)
   9834         re->flags |= PCRE2_FIRSTCASELESS;
   9835 #endif
   9836       }
   9837     }
   9838 
   9839   /* When there is no first code unit, for non-anchored patterns, see if we can
   9840   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
   9841   branches start with ^ and also when all branches start with non-atomic .* for
   9842   non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
   9843   that disables this case.) */
   9844 
   9845   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
   9846            is_startline(codestart, 0, &cb, 0, FALSE))
   9847     re->flags |= PCRE2_STARTLINE;
   9848 
   9849   /* Handle the "required code unit", if one is set. In the case of an anchored
   9850   pattern, do this only if it follows a variable length item in the pattern. */
   9851 
   9852   if (reqcuflags >= 0 &&
   9853        ((re->overall_options & PCRE2_ANCHORED) == 0 ||
   9854         (reqcuflags & REQ_VARY) != 0))
   9855     {
   9856     re->last_codeunit = reqcu;
   9857     re->flags |= PCRE2_LASTSET;
   9858 
   9859     /* Handle caseless required code units as for first code units (above). */
   9860 
   9861     if ((reqcuflags & REQ_CASELESS) != 0)
   9862       {
   9863       if (reqcu < 128 || (!utf && reqcu < 255))
   9864         {
   9865         if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
   9866         }
   9867 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
   9868       else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
   9869         re->flags |= PCRE2_LASTCASELESS;
   9870 #endif
   9871       }
   9872     }
   9873 
   9874   /* Finally, study the compiled pattern to set up information such as a bitmap
   9875   of starting code units and a minimum matching length. */
   9876 
   9877   if (PRIV(study)(re) != 0)
   9878     {
   9879     errorcode = ERR31;
   9880     goto HAD_CB_ERROR;
   9881     }
   9882   }   /* End of start-of-match optimizations. */
   9883 
   9884 /* Control ends up here in all cases. When running under valgrind, make a
   9885 pattern's terminating zero defined again. If memory was obtained for the parsed
   9886 version of the pattern, free it before returning. Also free the list of named
   9887 groups if a larger one had to be obtained, and likewise the group information
   9888 vector. */
   9889 
   9890 EXIT:
   9891 #ifdef SUPPORT_VALGRIND
   9892 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
   9893 #endif
   9894 if (cb.parsed_pattern != stack_parsed_pattern)
   9895   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
   9896 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
   9897   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
   9898 if (cb.groupinfo != stack_groupinfo)
   9899   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
   9900 return re;    /* Will be NULL after an error */
   9901 
   9902 /* Errors discovered in parse_regex() set the offset value in the compile
   9903 block. Errors discovered before it is called must compute it from the ptr
   9904 value. After parse_regex() is called, the offset in the compile block is set to
   9905 the end of the pattern, but certain errors in compile_regex() may reset it if
   9906 an offset is available in the parsed pattern. */
   9907 
   9908 HAD_CB_ERROR:
   9909 ptr = pattern + cb.erroroffset;
   9910 
   9911 HAD_EARLY_ERROR:
   9912 *erroroffset = ptr - pattern;
   9913 
   9914 HAD_ERROR:
   9915 *errorptr = errorcode;
   9916 pcre2_code_free(re);
   9917 re = NULL;
   9918 goto EXIT;
   9919 }
   9920 
   9921 /* End of pcre2_compile.c */
   9922