Home | History | Annotate | Download | only in dist
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9            Copyright (c) 1997-2014 University of Cambridge
     10 
     11 -----------------------------------------------------------------------------
     12 Redistribution and use in source and binary forms, with or without
     13 modification, are permitted provided that the following conditions are met:
     14 
     15     * Redistributions of source code must retain the above copyright notice,
     16       this list of conditions and the following disclaimer.
     17 
     18     * Redistributions in binary form must reproduce the above copyright
     19       notice, this list of conditions and the following disclaimer in the
     20       documentation and/or other materials provided with the distribution.
     21 
     22     * Neither the name of the University of Cambridge nor the names of its
     23       contributors may be used to endorse or promote products derived from
     24       this software without specific prior written permission.
     25 
     26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36 POSSIBILITY OF SUCH DAMAGE.
     37 -----------------------------------------------------------------------------
     38 */
     39 
     40 
     41 /* This module contains the external function pcre_compile(), along with
     42 supporting internal functions that are not used by other modules. */
     43 
     44 
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 
     49 #define NLBLOCK cd             /* Block containing newline information */
     50 #define PSSTART start_pattern  /* Field containing pattern start */
     51 #define PSEND   end_pattern    /* Field containing pattern end */
     52 
     53 #include "pcre_internal.h"
     54 
     55 
     56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
     57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
     58 library. We do not need to select pcre16_printint.c specially, because the
     59 COMPILE_PCREx macro will already be appropriately set. */
     60 
     61 #ifdef PCRE_DEBUG
     62 /* pcre_printint.c should not include any headers */
     63 #define PCRE_INCLUDED
     64 #include "pcre_printint.c"
     65 #undef PCRE_INCLUDED
     66 #endif
     67 
     68 
     69 /* Macro for setting individual bits in class bitmaps. */
     70 
     71 #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
     72 
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. The headroom left below INT_MAX is 20. */

#define OFLOW_MAX (INT_MAX - 20)
     79 
/* Forward declarations (prototypes only) to allow mutual recursion. The
function bodies are defined later in the file; the parameters are unnamed
here, so see the definitions for their meaning. */

/* Returns int; takes a byte bitmap pointer, a pointer to a code pointer, an
int, the compile data block, a list of 32-bit values, and an unsigned int. */

static int
  add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
    const pcre_uint32 *, unsigned int);

/* Returns BOOL; takes pointers to the code and pattern pointers along with a
branch_chain and the compile data block, among other arguments. */

static BOOL
  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
    pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
    compile_data *, int *);
     90 
     91 
     92 
     93 /*************************************************
     94 *      Code parameters and static tables         *
     95 *************************************************/
     96 
     97 /* This value specifies the size of stack workspace that is used during the
     98 first pre-compile phase that determines how much memory is required. The regex
     99 is partly compiled into this space, but the compiled parts are discarded as
    100 soon as they can be, so that hopefully there will never be an overrun. The code
    101 does, however, check for an overrun. The largest amount I've seen used is 218,
    102 so this number is very generous.
    103 
    104 The same workspace is used during the second, actual compile phase for
    105 remembering forward references to groups so that they can be filled in at the
    106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
    107 is 4 there is plenty of room for most patterns. However, the memory can get
    108 filled up by repetitions of forward references, for example patterns like
    109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
    110 that the workspace is expanded using malloc() in this situation. The value
    111 below is therefore a minimum, and we put a maximum on it for safety. The
    112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
    113 kicks in at the same number of forward references in all cases. */
    114 
    115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
    116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
    117 
    118 /* This value determines the size of the initial vector that is used for
    119 remembering named groups during the pre-compile. It is allocated on the stack,
    120 but if it is too small, it is expanded using malloc(), in a similar way to the
    121 workspace. The value is the number of slots in the list. */
    122 
    123 #define NAMED_GROUP_LIST_SIZE  20
    124 
    125 /* The overrun tests check for a slightly smaller size so that they detect the
    126 overrun before it actually does run off the end of the data block. */
    127 
    128 #define WORK_SIZE_SAFETY_MARGIN (100)
    129 
    130 /* Private flags added to firstchar and reqchar. */
    131 
    132 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
    133 #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
    134 /* Negative values for the firstchar and reqchar flags */
    135 #define REQ_UNSET       (-2)
    136 #define REQ_NONE        (-1)
    137 
    138 /* Repeated character flags. */
    139 
    140 #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
    141 
    142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
    143 are simple data values; negative values are for special things like \d and so
    144 on. Zero means further processing is needed (for things like \x), or the escape
    145 is invalid. */
    146 
    147 #ifndef EBCDIC
    148 
    149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
    150 in UTF-8 mode. */
    151 
/* Entries are in character-code order starting at '0' and ending at 'z'; the
comment on each row names the two characters that the row covers. */

static const short int escapes[] = {
     0,                       0,                         /* 0 1 */
     0,                       0,                         /* 2 3 */
     0,                       0,                         /* 4 5 */
     0,                       0,                         /* 6 7 */
     0,                       0,                         /* 8 9 */
     CHAR_COLON,              CHAR_SEMICOLON,            /* : ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,          /* < = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,        /* > ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                    /* @ A */
     -ESC_B,                  -ESC_C,                    /* B C */
     -ESC_D,                  -ESC_E,                    /* D E */
     0,                       -ESC_G,                    /* F G */
     -ESC_H,                  0,                         /* H I */
     0,                       -ESC_K,                    /* J K */
     0,                       0,                         /* L M */
     -ESC_N,                  0,                         /* N O */
     -ESC_P,                  -ESC_Q,                    /* P Q */
     -ESC_R,                  -ESC_S,                    /* R S */
     0,                       0,                         /* T U */
     -ESC_V,                  -ESC_W,                    /* V W */
     -ESC_X,                  0,                         /* X Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,  /* Z [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,           /* ^ _ */
     CHAR_GRAVE_ACCENT,       7,                         /* ` a (7 is the data value for a) */
     -ESC_b,                  0,                         /* b c */
     -ESC_d,                  ESC_e,                     /* d e */
     ESC_f,                   0,                         /* f g */
     -ESC_h,                  0,                         /* h i */
     0,                       -ESC_k,                    /* j k */
     0,                       0,                         /* l m */
     ESC_n,                   0,                         /* n o */
     -ESC_p,                  0,                         /* p q */
     ESC_r,                   -ESC_s,                    /* r s */
     ESC_tee,                 0,                         /* t u */
     -ESC_v,                  -ESC_w,                    /* v w */
     0,                       0,                         /* x y */
     -ESC_z                                              /* z */
};
    192 
    193 #else
    194 
    195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
    196 
/* Each row holds eight entries; the label at the start of a row is the
hexadecimal character code of the row's first entry. The table runs from code
0x48 up to 0xFF. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
    222 #endif
    223 
    224 
    225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
    226 searched linearly. Put all the names into a single string, in order to reduce
    227 the number of relocations when a shared library is dynamically linked. The
    228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
    229 platforms. */
    230 
    231 typedef struct verbitem {
    232   int   len;                 /* Length of verb name */
    233   int   op;                  /* Op when no arg, or -1 if arg mandatory */
    234   int   op_arg;              /* Op when arg present, or -1 if not allowed */
    235 } verbitem;
    236 
/* All verb names concatenated into one string. The order here has to stay in
step with the rows of the verbs[] table, whose len fields give the length of
each name. Presumably each STRING_xxx0 macro supplies the name followed by a
zero byte - confirm against the header that defines them. */

static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
    247 
    248 static const verbitem verbs[] = {
    249   { 0, -1,        OP_MARK },
    250   { 4, -1,        OP_MARK },
    251   { 6, OP_ACCEPT, -1 },
    252   { 6, OP_COMMIT, -1 },
    253   { 1, OP_FAIL,   -1 },
    254   { 4, OP_FAIL,   -1 },
    255   { 5, OP_PRUNE,  OP_PRUNE_ARG },
    256   { 4, OP_SKIP,   OP_SKIP_ARG  },
    257   { 4, OP_THEN,   OP_THEN_ARG  }
    258 };
    259 
    260 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
    261 
    262 
    263 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
    264 another regex library. */
    265 
    266 static const pcre_uchar sub_start_of_word[] = {
    267   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
    268   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
    269 
    270 static const pcre_uchar sub_end_of_word[] = {
    271   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
    272   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
    273   CHAR_RIGHT_PARENTHESIS, '\0' };
    274 
    275 
    276 /* Tables of names of POSIX character classes and their lengths. The names are
    277 now all in a single string, to reduce the number of relocations when a shared
    278 library is dynamically loaded. The list of lengths is terminated by a zero
    279 length entry. The first three must be alpha, lower, upper, as this is assumed
    280 for handling case independence. The indices for graph, print, and punct are
    281 needed, so identify them. */
    282 
    283 static const char posix_names[] =
    284   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
    285   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
    286   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
    287   STRING_word0  STRING_xdigit;
    288 
    289 static const pcre_uint8 posix_name_lengths[] = {
    290   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
    291 
/* Positions of graph, print, and punct in the posix_names list (counting from
zero). */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
    295 
    296 
    297 /* Table of class bit maps for each POSIX class. Each class is formed from a
    298 base map, with an optional addition or removal of another map. Then, for some
    299 classes, there is some additional tweaking: for [:blank:] the vertical space
    300 characters are removed, and for [:alpha:] and [:alnum:] the underscore
    301 character is removed. The triples in the table consist of the base map offset,
    302 second map offset or -1 if no second map, and a non-negative value for map
    303 addition or a negative value for map subtraction (if there are two maps). The
    304 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
    305 remove vertical space characters, 2 => remove underscore. */
    306 
/* Three ints per class, in posix_names order: base map offset, second map
offset (or -1), and the add/subtract-plus-tweak code described above. */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, without underscore */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
  cbit_space, -1,          1,             /* blank - a GNU extension; space without vertical space */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
    323 
    324 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
    325 Unicode property escapes. */
    326 
    327 #ifdef SUPPORT_UCP
/* Zero-terminated pattern fragments. The comment on each gives the text that
the character constants spell out. */

static const pcre_uchar string_PNd[]  = {      /* \P{Nd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {      /* \p{Nd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {      /* \P{Xsp} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {      /* \p{Xsp} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {      /* \P{Xwd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {      /* \p{Xwd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    346 
/* Replacement strings for the six escapes listed in the comments; each
upper case (negated) escape is immediately followed by its lower case
counterpart. */

static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};
    355 
    356 /* The POSIX class substitutes must be in the order of the POSIX class names,
    357 defined above, and there are both positive and negative cases. NULL means no
    358 general substitute of a Unicode property escape (\p or \P). However, for some
    359 POSIX classes (e.g. graph, print, punct) a special property code is compiled
    360 directly. */
    361 
/* Zero-terminated pattern fragments used by posix_substitutes. The comment on
each gives the text that the character constants spell out. The first six are
the positive forms and the last six are the corresponding negated forms. */

static const pcre_uchar string_pL[] =   {      /* \p{L} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {      /* \p{Ll} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {      /* \p{Lu} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {      /* \p{Xan} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {      /* \h */
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {      /* \p{Xps} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {      /* \P{L} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {      /* \P{Ll} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {      /* \P{Lu} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {      /* \P{Xan} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {      /* \H */
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {      /* \P{Xps} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    396 
/* Fourteen positive entries in posix_names order, followed by the fourteen
negated entries in the same order. A NULL entry means there is no general
substitute string for that class. */

static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  NULL                  /* ^xdigit */
};
/* Number of entries in posix_substitutes (positive and negated together). */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
    429 #endif
    430 
    431 #define STRING(a)  # a
    432 #define XSTRING(s) STRING(s)
    433 
    434 /* The texts of compile-time error messages. These are "char *" because they
    435 are passed to the outside world. Do not ever re-use any error number, because
    436 they are documented. Always add a new error instead. Messages marked DEAD below
    437 are no longer used. This used to be a table of strings, but in order to reduce
    438 the number of relocations needed when a shared library is loaded dynamically,
    439 it is now one long string. We cannot use a table of offsets, because the
    440 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
    441 simply count through to the one we want - this isn't a performance issue
    442 because these strings are used only when there is a compilation error.
    443 
    444 Each substring ends with \0 to insert a null character. This includes the final
    445 substring, so that the whole string ends with \0\0, which can be detected when
    446 counting through. */
    447 
/* The message for error number n is the n-th zero-terminated substring,
counting from zero. The numeric comments below give the error number of the
message that immediately follows them. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
  "\\c must be followed by an ASCII character\0"
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  ;
    554 
    555 /* Table to identify digits and hex digits. This is used when compiling
    556 patterns. Note that the tables in chartables are dependent on the locale, and
    557 may mark arbitrary characters as digits - but the PCRE compiling code expects
    558 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
    559 a private table here. It costs 256 bytes, but it is a lot faster than doing
    560 character value tests (at least in some simple cases I timed), and in some
    561 applications one wants PCRE to compile efficiently as well as match
    562 efficiently.
    563 
    564 For convenience, we use the same bit definitions as in chartables:
    565 
    566   0x04   decimal digit
    567   0x08   hexadecimal digit
    568 
    569 Then we can use ctype_digit and ctype_xdigit in the code. */
    570 
/* Using a simple comparison for decimal numbers rather than a memory read
is much faster, and the resulting code is simpler (the compiler turns it
into a subtraction and unsigned comparison). IS_DIGIT(x) is true when x lies
in the range CHAR_0 to CHAR_9 inclusive. The argument is evaluated twice, so
it must not have side effects. */

#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
    576 
    577 #ifndef EBCDIC
    578 
    579 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
    580 UTF-8 mode. */
    581 
/* One entry per character code, 0 to 255. An entry of 0x0c has both the
decimal digit bit (0x04) and the hexadecimal digit bit (0x08) set; an entry of
0x08 is a hexadecimal digit only. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
    616 
    617 #else
    618 
    619 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
    620 
/* One entry per character code, 0 to 255. An entry of 0x0c has both the
decimal digit bit (0x04) and the hexadecimal digit bit (0x08) set; an entry of
0x08 is a hexadecimal digit only. The right-hand column of the row comments
gives the hexadecimal code of the first entry of every second row. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
    655 
    656 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup.
        Indexed by EBCDIC character code; the comment on each row gives the range
        of codes it covers. check_escape() treats a character c as alphanumeric
        when (ebcdic_chartab[c] & 0x0E) is non-zero.
        NOTE(review): the meaning of the individual bits is not defined in this
        part of the file - presumably they are the ctype_* flags; confirm. */
    657   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
    658   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
    659   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
    660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
    661   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
    662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
    663   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
    664   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
    665   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
    666   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
    667   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
    668   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
    669   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
    670   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
    671   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
    672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
    673   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
    674   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
    675   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
    676   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
    677   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
    678   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
    679   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
    680   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
    681   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
    682   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
    683   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
    684   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
    685   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
    686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
    687   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
    688   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
    689 #endif
    690 
    691 
    692 /* This table is used to check whether auto-possessification is possible
    693 between adjacent character-type opcodes. The left-hand (repeated) opcode is
    694 used to select the row, and the right-hand opcode is used to select the column.
    695 A value of 1 means that auto-possessification is OK. For example, the second
    696 value in the first row means that \D+\d can be turned into \D++\d.
    697 
    698 The Unicode property types (\P and \p) have to be present to fill out the table
    699 because of what their opcode values are, but the table values should always be
    700 zero because property types are handled separately in the code. The last four
    701 columns apply to items that cannot be repeated, so there is no need to have
    702 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
    703 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes.

        APTROWS and APTCOLS are the table dimensions. Both are counted from
        FIRST_AUTOTAB_OP; the rows stop at LAST_AUTOTAB_LEFT_OP and the columns at
        LAST_AUTOTAB_RIGHT_OP. */
    704 
    705 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
    706 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
    707 
    708 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
    709 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
    710   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
    711   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
    712   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
    713   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
    714   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
    715   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
    716   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
    717   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
    718   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
    719   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
    720   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
    721   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
    722   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
    723   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
    724   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
    725   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
    726   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
    727 };
    728 
    729 
    730 /* This table is used to check whether auto-possessification is possible
    731 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
    732 left-hand (repeated) opcode is used to select the row, and the right-hand
    733 opcode is used to select the column. The values are as follows:
    734 
    735   0   Always return FALSE (never auto-possessify)
    736   1   Character groups are distinct (possessify if both are OP_PROP)
    737   2   Check character categories in the same group (general or particular)
    738   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
    739 
    740   4   Check left general category vs right particular category
    741   5   Check right general category vs left particular category
    742 
    743   6   Left alphanum vs right general category
    744   7   Left space vs right general category
    745   8   Left word vs right general category
    746 
    747   9   Right alphanum vs left general category
    748  10   Right space vs left general category
    749  11   Right word vs left general category
    750 
    751  12   Left alphanum vs right particular category
    752  13   Left space vs right particular category
    753  14   Left word vs right particular category
    754 
    755  15   Right alphanum vs left particular category
    756  16   Right space vs left particular category
    757  17   Right word vs left particular category

        Both dimensions are PT_TABSIZE; the rows and columns follow the PT_xxx
        order given by the heading line and the row comments.
    758 */
    759 
    760 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
    761 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
    762   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
    763   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
    764   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
    765   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
    766   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
    767   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
    768   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
    769   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
    770   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
    771   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
    772   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
    773 };
    774 
    775 /* This table is used to check whether auto-possessification is possible
    776 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
    777 specifies a general category and the other specifies a particular category. The
    778 row is selected by the general category and the column by the particular
    779 category. The value is 1 if the particular category is not part of the general
    780 category. There are seven rows, one per general category, and thirty columns,
        one per particular category, in the order shown by the heading line and the
        row comments. */
    781 
    782 static const pcre_uint8 catposstab[7][30] = {
    783 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
    784   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
    785   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
    786   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
    787   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
    788   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
    789   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
    790   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
    791 };
    792 
    793 /* This table is used when checking ALNUM, SPACE, PXSPACE, and WORD against
    794 a general or particular category. The properties in each row are those
    795 that apply to the character set in question. Duplication means that a little
    796 unnecessary work is done when checking, but this keeps things much simpler
    797 because they can all use the same code. For more details see the comment where
    798 this table is used.
    799 
    800 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
    801 "space", but from Perl 5.18 it's included, so both categories are treated the
    802 same here.

        NOTE(review): judging by the ucp_ names, the first three entries of each row
        are general categories and the fourth is a particular category - confirm
        against the code that reads this table. */
    803 
    804 static const pcre_uint8 posspropstab[3][4] = {
    805   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
    806   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
    807   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
    808 };
    809 
    810 /* This table is used when converting repeating opcodes into possessified
    811 versions as a result of an explicit possessive quantifier such as ++. A zero
    812 value means there is no possessified version - in those cases the item in
    813 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
    814 because all relevant opcodes are less than that. The comment on each line
        names the opcodes whose entries appear on that line; a non-zero entry is the
        possessive opcode that replaces the first opcode named. */
    815 
    816 static const pcre_uint8 opcode_possessify[] = {
    817   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
    818   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
    819 
    820   0,                       /* NOTI */
    821   OP_POSSTAR, 0,           /* STAR, MINSTAR */
    822   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
    823   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
    824   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
    825   0,                       /* EXACT */
    826   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
    827 
    828   OP_POSSTARI, 0,          /* STARI, MINSTARI */
    829   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
    830   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
    831   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
    832   0,                       /* EXACTI */
    833   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
    834 
    835   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
    836   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
    837   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
    838   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
    839   0,                       /* NOTEXACT */
    840   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
    841 
    842   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
    843   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
    844   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
    845   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
    846   0,                       /* NOTEXACTI */
    847   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
    848 
    849   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
    850   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
    851   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
    852   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
    853   0,                       /* TYPEEXACT */
    854   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
    855 
    856   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
    857   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
    858   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
    859   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
    860   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
    861 
    862   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
    863   0, 0,                    /* REF, REFI */
    864   0, 0,                    /* DNREF, DNREFI */
    865   0, 0                     /* RECURSE, CALLOUT */
    866 };
    867 
    868 
    869 
    870 /*************************************************
    871 *            Find an error text                  *
    872 *************************************************/
    873 
    874 /* The error texts are now all in one long string, to save on relocations. As
    875 some of the text is of unknown length, we can't use a table of offsets.
    876 Instead, just count through the strings. This is not a performance issue
    877 because it happens only when there has been a compilation error.
    878 
    879 Argument:   the error number
    880 Returns:    pointer to the error string
    881 */
    882 
    883 static const char *
    884 find_error_text(int n)
    885 {
    886 const char *s = error_texts;
    887 for (; n > 0; n--)
    888   {
    889   while (*s++ != CHAR_NULL) {};
    890   if (*s == CHAR_NULL) return "Error text not found (please report)";
    891   }
    892 return s;
    893 }
    894 
    895 
    896 
    897 /*************************************************
    898 *           Expand the workspace                 *
    899 *************************************************/
    900 
    901 /* This function is called during the second compiling phase, if the number of
    902 forward references fills the existing workspace, which is originally a block on
    903 the stack. A larger block is obtained from malloc() unless the ultimate limit
    904 has been reached or the increase will be rather small.
    905 
    906 Argument: pointer to the compile data block
    907 Returns:  0 if all went well, else an error number
    908 */
    909 
    910 static int
    911 expand_workspace(compile_data *cd)
    912 {
    913 pcre_uchar *newspace;
    914 int newsize = cd->workspace_size * 2;
    915 
    916 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
    917 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
    918     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
    919  return ERR72;
    920 
    921 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
    922 if (newspace == NULL) return ERR21;
    923 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
    924 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
    925 if (cd->workspace_size > COMPILE_WORK_SIZE)
    926   (PUBL(free))((void *)cd->start_workspace);
    927 cd->start_workspace = newspace;
    928 cd->workspace_size = newsize;
    929 return 0;
    930 }
    931 
    932 
    933 
    934 /*************************************************
    935 *            Check for counted repeat            *
    936 *************************************************/
    937 
    938 /* This function is called when a '{' is encountered in a place where it might
    939 start a quantifier. It looks ahead to see if it really is a quantifier or not.
    940 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
    941 where the ddds are digits.
    942 
    943 Arguments:
    944   p         pointer to the first char after '{'
    945 
    946 Returns:    TRUE or FALSE
    947 */
    948 
    949 static BOOL
    950 is_counted_repeat(const pcre_uchar *p)
    951 {
    952 if (!IS_DIGIT(*p)) return FALSE;
    953 p++;
    954 while (IS_DIGIT(*p)) p++;
    955 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
    956 
    957 if (*p++ != CHAR_COMMA) return FALSE;
    958 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
    959 
    960 if (!IS_DIGIT(*p)) return FALSE;
    961 p++;
    962 while (IS_DIGIT(*p)) p++;
    963 
    964 return (*p == CHAR_RIGHT_CURLY_BRACKET);
    965 }
    966 
    967 
    968 
    969 /*************************************************
    970 *            Handle escapes                      *
    971 *************************************************/
    972 
    973 /* This function is called when a \ has been encountered. It either returns a
    974 positive value for a simple escape such as \n, or 0 for a data character which
    975 will be placed in chptr. A backreference to group n is returned as negative n.
    976 When UTF-8 is enabled, a positive value greater than 255 may be returned in
    977 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
    978 character of the escape sequence.
    979 
    980 Arguments:
    981   ptrptr         points to the pattern position pointer
    982   chptr          points to a returned data character
    983   errorcodeptr   points to the errorcode variable
    984   bracount       number of previous extracting brackets
    985   options        the options bits
    986   isclass        TRUE if inside a character class
    987 
    988 Returns:         zero => a data character
    989                  positive => a special escape sequence
    990                  negative => a back reference
    991                  on error, errorcodeptr is set
    992 */
    993 
    994 static int
    995 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
    996   int bracount, int options, BOOL isclass)
    997 {
    998 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
    999 BOOL utf = (options & PCRE_UTF8) != 0;
   1000 const pcre_uchar *ptr = *ptrptr + 1;
   1001 pcre_uint32 c;
   1002 int escape = 0;
   1003 int i;
   1004 
   1005 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
   1006 ptr--;                            /* Set pointer back to the last byte */
   1007 
   1008 /* If backslash is at the end of the pattern, it's an error. */
   1009 
   1010 if (c == CHAR_NULL) *errorcodeptr = ERR1;
   1011 
   1012 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
   1013 in a table. A non-zero result is something that can be returned immediately.
   1014 Otherwise further processing may be required. */
   1015 
   1016 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1017 /* Not alphanumeric */
   1018 else if (c < CHAR_0 || c > CHAR_z) {}
   1019 else if ((i = escapes[c - CHAR_0]) != 0)
   1020   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
   1021 
   1022 #else           /* EBCDIC coding */
   1023 /* Not alphanumeric */
   1024 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
   1025 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
   1026 #endif
   1027 
   1028 /* Escapes that need further processing, or are illegal. */
   1029 
   1030 else
   1031   {
   1032   const pcre_uchar *oldptr;
   1033   BOOL braced, negated, overflow;
   1034   int s;
   1035 
   1036   switch (c)
   1037     {
   1038     /* A number of Perl escapes are not handled by PCRE. We give an explicit
   1039     error. */
   1040 
   1041     case CHAR_l:
   1042     case CHAR_L:
   1043     *errorcodeptr = ERR37;
   1044     break;
   1045 
   1046     case CHAR_u:
   1047     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
   1048       {
   1049       /* In JavaScript, \u must be followed by four hexadecimal numbers.
   1050       Otherwise it is a lowercase u letter. */
   1051       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
   1052         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
   1053         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
   1054         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
   1055         {
   1056         c = 0;
   1057         for (i = 0; i < 4; ++i)
   1058           {
   1059           register pcre_uint32 cc = *(++ptr);
   1060 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1061           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
   1062           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1063 #else           /* EBCDIC coding */
   1064           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
   1065           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1066 #endif
   1067           }
   1068 
   1069 #if defined COMPILE_PCRE8
   1070         if (c > (utf ? 0x10ffffU : 0xffU))
   1071 #elif defined COMPILE_PCRE16
   1072         if (c > (utf ? 0x10ffffU : 0xffffU))
   1073 #elif defined COMPILE_PCRE32
   1074         if (utf && c > 0x10ffffU)
   1075 #endif
   1076           {
   1077           *errorcodeptr = ERR76;
   1078           }
   1079         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   1080         }
   1081       }
   1082     else
   1083       *errorcodeptr = ERR37;
   1084     break;
   1085 
   1086     case CHAR_U:
   1087     /* In JavaScript, \U is an uppercase U letter. */
   1088     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
   1089     break;
   1090 
   1091     /* In a character class, \g is just a literal "g". Outside a character
   1092     class, \g must be followed by one of a number of specific things:
   1093 
   1094     (1) A number, either plain or braced. If positive, it is an absolute
   1095     backreference. If negative, it is a relative backreference. This is a Perl
   1096     5.10 feature.
   1097 
   1098     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
   1099     is part of Perl's movement towards a unified syntax for back references. As
   1100     this is synonymous with \k{name}, we fudge it up by pretending it really
   1101     was \k.
   1102 
   1103     (3) For Oniguruma compatibility we also support \g followed by a name or a
   1104     number either in angle brackets or in single quotes. However, these are
   1105     (possibly recursive) subroutine calls, _not_ backreferences. Just return
   1106     the ESC_g code (cf \k). */
   1107 
   1108     case CHAR_g:
   1109     if (isclass) break;
   1110     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
   1111       {
   1112       escape = ESC_g;
   1113       break;
   1114       }
   1115 
   1116     /* Handle the Perl-compatible cases */
   1117 
   1118     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
   1119       {
   1120       const pcre_uchar *p;
   1121       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
   1122         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
   1123       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
   1124         {
   1125         escape = ESC_k;
   1126         break;
   1127         }
   1128       braced = TRUE;
   1129       ptr++;
   1130       }
   1131     else braced = FALSE;
   1132 
   1133     if (ptr[1] == CHAR_MINUS)
   1134       {
   1135       negated = TRUE;
   1136       ptr++;
   1137       }
   1138     else negated = FALSE;
   1139 
   1140     /* The integer range is limited by the machine's int representation. */
   1141     s = 0;
   1142     overflow = FALSE;
   1143     while (IS_DIGIT(ptr[1]))
   1144       {
   1145       if (s > INT_MAX / 10 - 1) /* Integer overflow */
   1146         {
   1147         overflow = TRUE;
   1148         break;
   1149         }
   1150       s = s * 10 + (int)(*(++ptr) - CHAR_0);
   1151       }
   1152     if (overflow) /* Integer overflow */
   1153       {
   1154       while (IS_DIGIT(ptr[1]))
   1155         ptr++;
   1156       *errorcodeptr = ERR61;
   1157       break;
   1158       }
   1159 
   1160     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
   1161       {
   1162       *errorcodeptr = ERR57;
   1163       break;
   1164       }
   1165 
   1166     if (s == 0)
   1167       {
   1168       *errorcodeptr = ERR58;
   1169       break;
   1170       }
   1171 
   1172     if (negated)
   1173       {
   1174       if (s > bracount)
   1175         {
   1176         *errorcodeptr = ERR15;
   1177         break;
   1178         }
   1179       s = bracount - (s - 1);
   1180       }
   1181 
   1182     escape = -s;
   1183     break;
   1184 
   1185     /* The handling of escape sequences consisting of a string of digits
   1186     starting with one that is not zero is not straightforward. Perl has changed
   1187     over the years. Nowadays \g{} for backreferences and \o{} for octal are
   1188     recommended to avoid the ambiguities in the old syntax.
   1189 
   1190     Outside a character class, the digits are read as a decimal number. If the
   1191     number is less than 8 (used to be 10), or if there are that many previous
   1192     extracting left brackets, then it is a back reference. Otherwise, up to
   1193     three octal digits are read to form an escaped byte. Thus \123 is likely to
   1194     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
   1195     the octal value is greater than 377, the least significant 8 bits are
   1196     taken. \8 and \9 are treated as the literal characters 8 and 9.
   1197 
   1198     Inside a character class, \ followed by a digit is always either a literal
   1199     8 or 9 or an octal number. */
   1200 
   1201     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
   1202     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
   1203 
   1204     if (!isclass)
   1205       {
   1206       oldptr = ptr;
   1207       /* The integer range is limited by the machine's int representation. */
   1208       s = (int)(c -CHAR_0);
   1209       overflow = FALSE;
   1210       while (IS_DIGIT(ptr[1]))
   1211         {
   1212         if (s > INT_MAX / 10 - 1) /* Integer overflow */
   1213           {
   1214           overflow = TRUE;
   1215           break;
   1216           }
   1217         s = s * 10 + (int)(*(++ptr) - CHAR_0);
   1218         }
   1219       if (overflow) /* Integer overflow */
   1220         {
   1221         while (IS_DIGIT(ptr[1]))
   1222           ptr++;
   1223         *errorcodeptr = ERR61;
   1224         break;
   1225         }
   1226       if (s < 8 || s <= bracount)  /* Check for back reference */
   1227         {
   1228         escape = -s;
   1229         break;
   1230         }
   1231       ptr = oldptr;      /* Put the pointer back and fall through */
   1232       }
   1233 
   1234     /* Handle a digit following \ when the number is not a back reference. If
   1235     the first digit is 8 or 9, Perl used to generate a binary zero byte and
   1236     then treat the digit as a following literal. At least by Perl 5.18 this
   1237     changed so as not to insert the binary zero. */
   1238 
   1239     if ((c = *ptr) >= CHAR_8) break;
   1240 
   1241     /* Fall through with a digit less than 8 */
   1242 
   1243     /* \0 always starts an octal number, but we may drop through to here with a
   1244     larger first octal digit. The original code used just to take the least
   1245     significant 8 bits of octal numbers (I think this is what early Perls used
   1246     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
   1247     but no more than 3 octal digits. */
   1248 
   1249     case CHAR_0:
   1250     c -= CHAR_0;
   1251     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
   1252         c = c * 8 + *(++ptr) - CHAR_0;
   1253 #ifdef COMPILE_PCRE8
   1254     if (!utf && c > 0xff) *errorcodeptr = ERR51;
   1255 #endif
   1256     break;
   1257 
   1258     /* \o is a relatively new Perl feature, supporting a more general way of
   1259     specifying character codes in octal. The only supported form is \o{ddd}. */
   1260 
   1261     case CHAR_o:
   1262     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
   1263     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
   1264       {
   1265       ptr += 2;
   1266       c = 0;
   1267       overflow = FALSE;
   1268       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
   1269         {
   1270         register pcre_uint32 cc = *ptr++;
   1271         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
   1272 #ifdef COMPILE_PCRE32
   1273         if (c >= 0x20000000l) { overflow = TRUE; break; }
   1274 #endif
   1275         c = (c << 3) + cc - CHAR_0 ;
   1276 #if defined COMPILE_PCRE8
   1277         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
   1278 #elif defined COMPILE_PCRE16
   1279         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
   1280 #elif defined COMPILE_PCRE32
   1281         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
   1282 #endif
   1283         }
   1284       if (overflow)
   1285         {
   1286         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
   1287         *errorcodeptr = ERR34;
   1288         }
   1289       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   1290         {
   1291         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   1292         }
   1293       else *errorcodeptr = ERR80;
   1294       }
   1295     break;
   1296 
   1297     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
   1298     numbers. Otherwise it is a lowercase x letter. */
   1299 
   1300     case CHAR_x:
   1301     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
   1302       {
   1303       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
   1304         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
   1305         {
   1306         c = 0;
   1307         for (i = 0; i < 2; ++i)
   1308           {
   1309           register pcre_uint32 cc = *(++ptr);
   1310 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1311           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
   1312           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1313 #else           /* EBCDIC coding */
   1314           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
   1315           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1316 #endif
   1317           }
   1318         }
   1319       }    /* End JavaScript handling */
   1320 
   1321     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
   1322     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
   1323     digits. If not, { used to be treated as a data character. However, Perl
   1324     seems to read hex digits up to the first non-such, and ignore the rest, so
   1325     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
   1326     now gives an error. */
   1327 
   1328     else
   1329       {
   1330       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
   1331         {
   1332         ptr += 2;
   1333         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   1334           {
   1335           *errorcodeptr = ERR86;
   1336           break;
   1337           }
   1338         c = 0;
   1339         overflow = FALSE;
   1340         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
   1341           {
   1342           register pcre_uint32 cc = *ptr++;
   1343           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
   1344 
   1345 #ifdef COMPILE_PCRE32
   1346           if (c >= 0x10000000l) { overflow = TRUE; break; }
   1347 #endif
   1348 
   1349 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1350           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
   1351           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1352 #else           /* EBCDIC coding */
   1353           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
   1354           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1355 #endif
   1356 
   1357 #if defined COMPILE_PCRE8
   1358           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
   1359 #elif defined COMPILE_PCRE16
   1360           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
   1361 #elif defined COMPILE_PCRE32
   1362           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
   1363 #endif
   1364           }
   1365 
   1366         if (overflow)
   1367           {
   1368           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
   1369           *errorcodeptr = ERR34;
   1370           }
   1371 
   1372         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   1373           {
   1374           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   1375           }
   1376 
   1377         /* If the sequence of hex digits does not end with '}', give an error.
   1378         We used just to recognize this construct and fall through to the normal
   1379         \x handling, but nowadays Perl gives an error, which seems much more
   1380         sensible, so we do too. */
   1381 
   1382         else *errorcodeptr = ERR79;
   1383         }   /* End of \x{} processing */
   1384 
   1385       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
   1386 
   1387       else
   1388         {
   1389         c = 0;
   1390         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
   1391           {
   1392           pcre_uint32 cc;                          /* Some compilers don't like */
   1393           cc = *(++ptr);                           /* ++ in initializers */
   1394 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1395           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
   1396           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1397 #else           /* EBCDIC coding */
   1398           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
   1399           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1400 #endif
   1401           }
   1402         }     /* End of \xdd handling */
   1403       }       /* End of Perl-style \x handling */
   1404     break;
   1405 
   1406     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
   1407     An error is given if the byte following \c is not an ASCII character. This
   1408     coding is ASCII-specific, but then the whole concept of \cx is
   1409     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
   1410 
   1411     case CHAR_c:
   1412     c = *(++ptr);
   1413     if (c == CHAR_NULL)
   1414       {
   1415       *errorcodeptr = ERR2;
   1416       break;
   1417       }
   1418 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
   1419     if (c > 127)  /* Excludes all non-ASCII in either mode */
   1420       {
   1421       *errorcodeptr = ERR68;
   1422       break;
   1423       }
   1424     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
   1425     c ^= 0x40;
   1426 #else             /* EBCDIC coding */
   1427     if (c >= CHAR_a && c <= CHAR_z) c += 64;
   1428     c ^= 0xC0;
   1429 #endif
   1430     break;
   1431 
   1432     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
   1433     other alphanumeric following \ is an error if PCRE_EXTRA was set;
   1434     otherwise, for Perl compatibility, it is a literal. This code looks a bit
   1435     odd, but there used to be some cases other than the default, and there may
   1436     be again in future, so I haven't "optimized" it. */
   1437 
   1438     default:
   1439     if ((options & PCRE_EXTRA) != 0) switch(c)
   1440       {
   1441       default:
   1442       *errorcodeptr = ERR3;
   1443       break;
   1444       }
   1445     break;
   1446     }
   1447   }
   1448 
   1449 /* Perl supports \N{name} for character names, as well as plain \N for "not
   1450 newline". PCRE does not support \N{name}. However, it does support
   1451 quantification such as \N{2,3}. */
   1452 
   1453 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
   1454      !is_counted_repeat(ptr+2))
   1455   *errorcodeptr = ERR37;
   1456 
   1457 /* If PCRE_UCP is set, we change the values for \d etc. */
   1458 
   1459 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
   1460   escape += (ESC_DU - ESC_D);
   1461 
   1462 /* Set the pointer to the final character before returning. */
   1463 
   1464 *ptrptr = ptr;
   1465 *chptr = c;
   1466 return escape;
   1467 }
   1468 
   1469 
   1470 
   1471 #ifdef SUPPORT_UCP
   1472 /*************************************************
   1473 *               Handle \P and \p                 *
   1474 *************************************************/
   1475 
   1476 /* This function is called after \P or \p has been encountered, provided that
   1477 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
   1478 pointing at the P or p. On exit, it is pointing at the final character of the
   1479 escape sequence.
   1480 
   1481 Argument:
   1482   ptrptr         points to the pattern position pointer
   1483   negptr         points to a boolean that is set TRUE for negation else FALSE
   1484   ptypeptr       points to an unsigned int that is set to the type value
   1485   pdataptr       points to an unsigned int that is set to the detailed property value
   1486   errorcodeptr   points to the error code variable
   1487 
   1488 Returns:         TRUE if the type value was found, or FALSE for an invalid type
   1489 */
   1490 
   1491 static BOOL
   1492 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
   1493   unsigned int *pdataptr, int *errorcodeptr)
   1494 {
   1495 pcre_uchar c;
   1496 int i, bot, top;
   1497 const pcre_uchar *ptr = *ptrptr;
   1498 pcre_uchar name[32];
   1499 
   1500 c = *(++ptr);
   1501 if (c == CHAR_NULL) goto ERROR_RETURN;
   1502 
   1503 *negptr = FALSE;
   1504 
   1505 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
   1506 negation. */
   1507 
   1508 if (c == CHAR_LEFT_CURLY_BRACKET)
   1509   {
   1510   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
   1511     {
   1512     *negptr = TRUE;
   1513     ptr++;
   1514     }
   1515   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
   1516     {
   1517     c = *(++ptr);
   1518     if (c == CHAR_NULL) goto ERROR_RETURN;
   1519     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
   1520     name[i] = c;
   1521     }
   1522   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
   1523   name[i] = 0;
   1524   }
   1525 
   1526 /* Otherwise there is just one following character */
   1527 
   1528 else
   1529   {
   1530   name[0] = c;
   1531   name[1] = 0;
   1532   }
   1533 
   1534 *ptrptr = ptr;
   1535 
   1536 /* Search for a recognized property name using binary chop */
   1537 
   1538 bot = 0;
   1539 top = PRIV(utt_size);
   1540 
   1541 while (bot < top)
   1542   {
   1543   int r;
   1544   i = (bot + top) >> 1;
   1545   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
   1546   if (r == 0)
   1547     {
   1548     *ptypeptr = PRIV(utt)[i].type;
   1549     *pdataptr = PRIV(utt)[i].value;
   1550     return TRUE;
   1551     }
   1552   if (r > 0) bot = i + 1; else top = i;
   1553   }
   1554 
   1555 *errorcodeptr = ERR47;
   1556 *ptrptr = ptr;
   1557 return FALSE;
   1558 
   1559 ERROR_RETURN:
   1560 *errorcodeptr = ERR46;
   1561 *ptrptr = ptr;
   1562 return FALSE;
   1563 }
   1564 #endif
   1565 
   1566 
   1567 
   1568 /*************************************************
   1569 *         Read repeat counts                     *
   1570 *************************************************/
   1571 
   1572 /* Read an item of the form {n,m} and return the values. This is called only
   1573 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
   1574 so the syntax is guaranteed to be correct, but we need to check the values.
   1575 
   1576 Arguments:
   1577   p              pointer to first char after '{'
   1578   minp           pointer to int for min
   1579   maxp           pointer to int for max
   1580                  returned as -1 if no max
   1581   errorcodeptr   points to error code variable
   1582 
   1583 Returns:         pointer to '}' on success;
   1584                  current ptr on error, with errorcodeptr set non-zero
   1585 */
   1586 
   1587 static const pcre_uchar *
   1588 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
   1589 {
   1590 int min = 0;
   1591 int max = -1;
   1592 
   1593 while (IS_DIGIT(*p))
   1594   {
   1595   min = min * 10 + (int)(*p++ - CHAR_0);
   1596   if (min > 65535)
   1597     {
   1598     *errorcodeptr = ERR5;
   1599     return p;
   1600     }
   1601   }
   1602 
   1603 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
   1604   {
   1605   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
   1606     {
   1607     max = 0;
   1608     while(IS_DIGIT(*p))
   1609       {
   1610       max = max * 10 + (int)(*p++ - CHAR_0);
   1611       if (max > 65535)
   1612         {
   1613         *errorcodeptr = ERR5;
   1614         return p;
   1615         }
   1616       }
   1617     if (max < min)
   1618       {
   1619       *errorcodeptr = ERR4;
   1620       return p;
   1621       }
   1622     }
   1623   }
   1624 
   1625 *minp = min;
   1626 *maxp = max;
   1627 return p;
   1628 }
   1629 
   1630 
   1631 
   1632 /*************************************************
   1633 *      Find first significant op code            *
   1634 *************************************************/
   1635 
   1636 /* This is called by several functions that scan a compiled expression looking
   1637 for a fixed first character, or an anchoring op code etc. It skips over things
   1638 that do not influence this. For some calls, it makes sense to skip negative
   1639 forward and all backward assertions, and also the \b assertion; for others it
   1640 does not.
   1641 
   1642 Arguments:
   1643   code         pointer to the start of the group
   1644   skipassert   TRUE if certain assertions are to be skipped
   1645 
   1646 Returns:       pointer to the first significant opcode
   1647 */
   1648 
   1649 static const pcre_uchar*
   1650 first_significant_code(const pcre_uchar *code, BOOL skipassert)
   1651 {
   1652 for (;;)
   1653   {
   1654   switch ((int)*code)
   1655     {
   1656     case OP_ASSERT_NOT:
   1657     case OP_ASSERTBACK:
   1658     case OP_ASSERTBACK_NOT:
   1659     if (!skipassert) return code;
   1660     do code += GET(code, 1); while (*code == OP_ALT);
   1661     code += PRIV(OP_lengths)[*code];
   1662     break;
   1663 
   1664     case OP_WORD_BOUNDARY:
   1665     case OP_NOT_WORD_BOUNDARY:
   1666     if (!skipassert) return code;
   1667     /* Fall through */
   1668 
   1669     case OP_CALLOUT:
   1670     case OP_CREF:
   1671     case OP_DNCREF:
   1672     case OP_RREF:
   1673     case OP_DNRREF:
   1674     case OP_DEF:
   1675     code += PRIV(OP_lengths)[*code];
   1676     break;
   1677 
   1678     default:
   1679     return code;
   1680     }
   1681   }
   1682 /* Control never reaches here */
   1683 }
   1684 
   1685 
   1686 
   1687 /*************************************************
   1688 *        Find the fixed length of a branch       *
   1689 *************************************************/
   1690 
   1691 /* Scan a branch and compute the fixed length of subject that will match it,
   1692 if the length is fixed. This is needed for dealing with backward assertions.
   1693 In UTF8 mode, the result is in characters rather than bytes. The branch is
   1694 temporarily terminated with OP_END when this function is called.
   1695 
   1696 This function is called when a backward assertion is encountered, so that if it
   1697 fails, the error message can point to the correct place in the pattern.
   1698 However, we cannot do this when the assertion contains subroutine calls,
   1699 because they can be forward references. We solve this by remembering this case
   1700 and doing the check at the end; a flag specifies which mode we are running in.
   1701 
   1702 Arguments:
   1703   code     points to the start of the pattern (the bracket)
   1704   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
   1705   atend    TRUE if called when the pattern is complete
   1706   cd       the "compile data" structure
   1707 
   1708 Returns:   the fixed length,
   1709              or -1 if there is no fixed length,
   1710              or -2 if \C was encountered (in UTF-8 mode only)
   1711              or -3 if an OP_RECURSE item was encountered and atend is FALSE
   1712              or -4 if an unknown opcode was encountered (internal error)
   1713 */
   1714 
   1715 static int
   1716 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
   1717 {
   1718 int length = -1;
   1719 
   1720 register int branchlength = 0;
   1721 register pcre_uchar *cc = code + 1 + LINK_SIZE;
   1722 
   1723 /* Scan along the opcodes for this branch. If we get to the end of the
   1724 branch, check the length against that of the other branches. */
   1725 
   1726 for (;;)
   1727   {
   1728   int d;
   1729   pcre_uchar *ce, *cs;
   1730   register pcre_uchar op = *cc;
   1731 
   1732   switch (op)
   1733     {
   1734     /* We only need to continue for OP_CBRA (normal capturing bracket) and
   1735     OP_BRA (normal non-capturing bracket) because the other variants of these
   1736     opcodes are all concerned with unlimited repeated groups, which of course
   1737     are not of fixed length. */
   1738 
   1739     case OP_CBRA:
   1740     case OP_BRA:
   1741     case OP_ONCE:
   1742     case OP_ONCE_NC:
   1743     case OP_COND:
   1744     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
   1745     if (d < 0) return d;
   1746     branchlength += d;
   1747     do cc += GET(cc, 1); while (*cc == OP_ALT);
   1748     cc += 1 + LINK_SIZE;
   1749     break;
   1750 
   1751     /* Reached end of a branch; if it's a ket it is the end of a nested call.
   1752     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
   1753     an ALT. If it is END it's the end of the outer call. All can be handled by
   1754     the same code. Note that we must not include the OP_KETRxxx opcodes here,
   1755     because they all imply an unlimited repeat. */
   1756 
   1757     case OP_ALT:
   1758     case OP_KET:
   1759     case OP_END:
   1760     case OP_ACCEPT:
   1761     case OP_ASSERT_ACCEPT:
   1762     if (length < 0) length = branchlength;
   1763       else if (length != branchlength) return -1;
   1764     if (*cc != OP_ALT) return length;
   1765     cc += 1 + LINK_SIZE;
   1766     branchlength = 0;
   1767     break;
   1768 
   1769     /* A true recursion implies not fixed length, but a subroutine call may
   1770     be OK. If the subroutine is a forward reference, we can't deal with
   1771     it until the end of the pattern, so return -3. */
   1772 
   1773     case OP_RECURSE:
   1774     if (!atend) return -3;
   1775     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
   1776     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
   1777     if (cc > cs && cc < ce) return -1;                    /* Recursion */
   1778     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
   1779     if (d < 0) return d;
   1780     branchlength += d;
   1781     cc += 1 + LINK_SIZE;
   1782     break;
   1783 
   1784     /* Skip over assertive subpatterns */
   1785 
   1786     case OP_ASSERT:
   1787     case OP_ASSERT_NOT:
   1788     case OP_ASSERTBACK:
   1789     case OP_ASSERTBACK_NOT:
   1790     do cc += GET(cc, 1); while (*cc == OP_ALT);
   1791     cc += PRIV(OP_lengths)[*cc];
   1792     break;
   1793 
   1794     /* Skip over things that don't match chars */
   1795 
   1796     case OP_MARK:
   1797     case OP_PRUNE_ARG:
   1798     case OP_SKIP_ARG:
   1799     case OP_THEN_ARG:
   1800     cc += cc[1] + PRIV(OP_lengths)[*cc];
   1801     break;
   1802 
   1803     case OP_CALLOUT:
   1804     case OP_CIRC:
   1805     case OP_CIRCM:
   1806     case OP_CLOSE:
   1807     case OP_COMMIT:
   1808     case OP_CREF:
   1809     case OP_DEF:
   1810     case OP_DNCREF:
   1811     case OP_DNRREF:
   1812     case OP_DOLL:
   1813     case OP_DOLLM:
   1814     case OP_EOD:
   1815     case OP_EODN:
   1816     case OP_FAIL:
   1817     case OP_NOT_WORD_BOUNDARY:
   1818     case OP_PRUNE:
   1819     case OP_REVERSE:
   1820     case OP_RREF:
   1821     case OP_SET_SOM:
   1822     case OP_SKIP:
   1823     case OP_SOD:
   1824     case OP_SOM:
   1825     case OP_THEN:
   1826     case OP_WORD_BOUNDARY:
   1827     cc += PRIV(OP_lengths)[*cc];
   1828     break;
   1829 
   1830     /* Handle literal characters */
   1831 
   1832     case OP_CHAR:
   1833     case OP_CHARI:
   1834     case OP_NOT:
   1835     case OP_NOTI:
   1836     branchlength++;
   1837     cc += 2;
   1838 #ifdef SUPPORT_UTF
   1839     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
   1840 #endif
   1841     break;
   1842 
   1843     /* Handle exact repetitions. The count is already in characters, but we
   1844     need to skip over a multibyte character in UTF8 mode.  */
   1845 
   1846     case OP_EXACT:
   1847     case OP_EXACTI:
   1848     case OP_NOTEXACT:
   1849     case OP_NOTEXACTI:
   1850     branchlength += (int)GET2(cc,1);
   1851     cc += 2 + IMM2_SIZE;
   1852 #ifdef SUPPORT_UTF
   1853     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
   1854 #endif
   1855     break;
   1856 
   1857     case OP_TYPEEXACT:
   1858     branchlength += GET2(cc,1);
   1859     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
   1860       cc += 2;
   1861     cc += 1 + IMM2_SIZE + 1;
   1862     break;
   1863 
   1864     /* Handle single-char matchers */
   1865 
   1866     case OP_PROP:
   1867     case OP_NOTPROP:
   1868     cc += 2;
   1869     /* Fall through */
   1870 
   1871     case OP_HSPACE:
   1872     case OP_VSPACE:
   1873     case OP_NOT_HSPACE:
   1874     case OP_NOT_VSPACE:
   1875     case OP_NOT_DIGIT:
   1876     case OP_DIGIT:
   1877     case OP_NOT_WHITESPACE:
   1878     case OP_WHITESPACE:
   1879     case OP_NOT_WORDCHAR:
   1880     case OP_WORDCHAR:
   1881     case OP_ANY:
   1882     case OP_ALLANY:
   1883     branchlength++;
   1884     cc++;
   1885     break;
   1886 
   1887     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
   1888     otherwise \C is coded as OP_ALLANY. */
   1889 
   1890     case OP_ANYBYTE:
   1891     return -2;
   1892 
   1893     /* Check a class for variable quantification */
   1894 
   1895     case OP_CLASS:
   1896     case OP_NCLASS:
   1897 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1898     case OP_XCLASS:
   1899     /* The original code caused an unsigned overflow in 64 bit systems,
   1900     so now we use a conditional statement. */
   1901     if (op == OP_XCLASS)
   1902       cc += GET(cc, 1);
   1903     else
   1904       cc += PRIV(OP_lengths)[OP_CLASS];
   1905 #else
   1906     cc += PRIV(OP_lengths)[OP_CLASS];
   1907 #endif
   1908 
   1909     switch (*cc)
   1910       {
   1911       case OP_CRSTAR:
   1912       case OP_CRMINSTAR:
   1913       case OP_CRPLUS:
   1914       case OP_CRMINPLUS:
   1915       case OP_CRQUERY:
   1916       case OP_CRMINQUERY:
   1917       case OP_CRPOSSTAR:
   1918       case OP_CRPOSPLUS:
   1919       case OP_CRPOSQUERY:
   1920       return -1;
   1921 
   1922       case OP_CRRANGE:
   1923       case OP_CRMINRANGE:
   1924       case OP_CRPOSRANGE:
   1925       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
   1926       branchlength += (int)GET2(cc,1);
   1927       cc += 1 + 2 * IMM2_SIZE;
   1928       break;
   1929 
   1930       default:
   1931       branchlength++;
   1932       }
   1933     break;
   1934 
   1935     /* Anything else is variable length */
   1936 
   1937     case OP_ANYNL:
   1938     case OP_BRAMINZERO:
   1939     case OP_BRAPOS:
   1940     case OP_BRAPOSZERO:
   1941     case OP_BRAZERO:
   1942     case OP_CBRAPOS:
   1943     case OP_EXTUNI:
   1944     case OP_KETRMAX:
   1945     case OP_KETRMIN:
   1946     case OP_KETRPOS:
   1947     case OP_MINPLUS:
   1948     case OP_MINPLUSI:
   1949     case OP_MINQUERY:
   1950     case OP_MINQUERYI:
   1951     case OP_MINSTAR:
   1952     case OP_MINSTARI:
   1953     case OP_MINUPTO:
   1954     case OP_MINUPTOI:
   1955     case OP_NOTMINPLUS:
   1956     case OP_NOTMINPLUSI:
   1957     case OP_NOTMINQUERY:
   1958     case OP_NOTMINQUERYI:
   1959     case OP_NOTMINSTAR:
   1960     case OP_NOTMINSTARI:
   1961     case OP_NOTMINUPTO:
   1962     case OP_NOTMINUPTOI:
   1963     case OP_NOTPLUS:
   1964     case OP_NOTPLUSI:
   1965     case OP_NOTPOSPLUS:
   1966     case OP_NOTPOSPLUSI:
   1967     case OP_NOTPOSQUERY:
   1968     case OP_NOTPOSQUERYI:
   1969     case OP_NOTPOSSTAR:
   1970     case OP_NOTPOSSTARI:
   1971     case OP_NOTPOSUPTO:
   1972     case OP_NOTPOSUPTOI:
   1973     case OP_NOTQUERY:
   1974     case OP_NOTQUERYI:
   1975     case OP_NOTSTAR:
   1976     case OP_NOTSTARI:
   1977     case OP_NOTUPTO:
   1978     case OP_NOTUPTOI:
   1979     case OP_PLUS:
   1980     case OP_PLUSI:
   1981     case OP_POSPLUS:
   1982     case OP_POSPLUSI:
   1983     case OP_POSQUERY:
   1984     case OP_POSQUERYI:
   1985     case OP_POSSTAR:
   1986     case OP_POSSTARI:
   1987     case OP_POSUPTO:
   1988     case OP_POSUPTOI:
   1989     case OP_QUERY:
   1990     case OP_QUERYI:
   1991     case OP_REF:
   1992     case OP_REFI:
   1993     case OP_DNREF:
   1994     case OP_DNREFI:
   1995     case OP_SBRA:
   1996     case OP_SBRAPOS:
   1997     case OP_SCBRA:
   1998     case OP_SCBRAPOS:
   1999     case OP_SCOND:
   2000     case OP_SKIPZERO:
   2001     case OP_STAR:
   2002     case OP_STARI:
   2003     case OP_TYPEMINPLUS:
   2004     case OP_TYPEMINQUERY:
   2005     case OP_TYPEMINSTAR:
   2006     case OP_TYPEMINUPTO:
   2007     case OP_TYPEPLUS:
   2008     case OP_TYPEPOSPLUS:
   2009     case OP_TYPEPOSQUERY:
   2010     case OP_TYPEPOSSTAR:
   2011     case OP_TYPEPOSUPTO:
   2012     case OP_TYPEQUERY:
   2013     case OP_TYPESTAR:
   2014     case OP_TYPEUPTO:
   2015     case OP_UPTO:
   2016     case OP_UPTOI:
   2017     return -1;
   2018 
   2019     /* Catch unrecognized opcodes so that when new ones are added they
   2020     are not forgotten, as has happened in the past. */
   2021 
   2022     default:
   2023     return -4;
   2024     }
   2025   }
   2026 /* Control never gets here */
   2027 }
   2028 
   2029 
   2030 
   2031 /*************************************************
   2032 *    Scan compiled regex for specific bracket    *
   2033 *************************************************/
   2034 
   2035 /* This little function scans through a compiled pattern until it finds a
   2036 capturing bracket with the given number, or, if the number is negative, an
   2037 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
   2038 so that it can be called from pcre_study() when finding the minimum matching
   2039 length.
   2040 
   2041 Arguments:
   2042   code        points to start of expression
   2043   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   2044   number      the required bracket number or negative to find a lookbehind
   2045 
   2046 Returns:      pointer to the opcode for the bracket, or NULL if not found
   2047 */
   2048 
   2049 const pcre_uchar *
   2050 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
   2051 {
   2052 for (;;)
   2053   {
   2054   register pcre_uchar c = *code;
   2055 
   2056   if (c == OP_END) return NULL;
   2057 
   2058   /* XCLASS is used for classes that cannot be represented just by a bit
   2059   map. This includes negated single high-valued characters. The length in
   2060   the table is zero; the actual length is stored in the compiled code. */
   2061 
   2062   if (c == OP_XCLASS) code += GET(code, 1);
   2063 
   2064   /* Handle recursion */
   2065 
   2066   else if (c == OP_REVERSE)
   2067     {
   2068     if (number < 0) return (pcre_uchar *)code;
   2069     code += PRIV(OP_lengths)[c];
   2070     }
   2071 
   2072   /* Handle capturing bracket */
   2073 
   2074   else if (c == OP_CBRA || c == OP_SCBRA ||
   2075            c == OP_CBRAPOS || c == OP_SCBRAPOS)
   2076     {
   2077     int n = (int)GET2(code, 1+LINK_SIZE);
   2078     if (n == number) return (pcre_uchar *)code;
   2079     code += PRIV(OP_lengths)[c];
   2080     }
   2081 
   2082   /* Otherwise, we can get the item's length from the table, except that for
   2083   repeated character types, we have to test for \p and \P, which have an extra
   2084   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
   2085   must add in its length. */
   2086 
   2087   else
   2088     {
   2089     switch(c)
   2090       {
   2091       case OP_TYPESTAR:
   2092       case OP_TYPEMINSTAR:
   2093       case OP_TYPEPLUS:
   2094       case OP_TYPEMINPLUS:
   2095       case OP_TYPEQUERY:
   2096       case OP_TYPEMINQUERY:
   2097       case OP_TYPEPOSSTAR:
   2098       case OP_TYPEPOSPLUS:
   2099       case OP_TYPEPOSQUERY:
   2100       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2101       break;
   2102 
   2103       case OP_TYPEUPTO:
   2104       case OP_TYPEMINUPTO:
   2105       case OP_TYPEEXACT:
   2106       case OP_TYPEPOSUPTO:
   2107       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   2108         code += 2;
   2109       break;
   2110 
   2111       case OP_MARK:
   2112       case OP_PRUNE_ARG:
   2113       case OP_SKIP_ARG:
   2114       case OP_THEN_ARG:
   2115       code += code[1];
   2116       break;
   2117       }
   2118 
   2119     /* Add in the fixed length from the table */
   2120 
   2121     code += PRIV(OP_lengths)[c];
   2122 
   2123   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
   2124   a multi-byte character. The length in the table is a minimum, so we have to
   2125   arrange to skip the extra bytes. */
   2126 
   2127 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   2128     if (utf) switch(c)
   2129       {
   2130       case OP_CHAR:
   2131       case OP_CHARI:
   2132       case OP_EXACT:
   2133       case OP_EXACTI:
   2134       case OP_UPTO:
   2135       case OP_UPTOI:
   2136       case OP_MINUPTO:
   2137       case OP_MINUPTOI:
   2138       case OP_POSUPTO:
   2139       case OP_POSUPTOI:
   2140       case OP_STAR:
   2141       case OP_STARI:
   2142       case OP_MINSTAR:
   2143       case OP_MINSTARI:
   2144       case OP_POSSTAR:
   2145       case OP_POSSTARI:
   2146       case OP_PLUS:
   2147       case OP_PLUSI:
   2148       case OP_MINPLUS:
   2149       case OP_MINPLUSI:
   2150       case OP_POSPLUS:
   2151       case OP_POSPLUSI:
   2152       case OP_QUERY:
   2153       case OP_QUERYI:
   2154       case OP_MINQUERY:
   2155       case OP_MINQUERYI:
   2156       case OP_POSQUERY:
   2157       case OP_POSQUERYI:
   2158       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   2159       break;
   2160       }
   2161 #else
   2162     (void)(utf);  /* Keep compiler happy by referencing function argument */
   2163 #endif
   2164     }
   2165   }
   2166 }
   2167 
   2168 
   2169 
   2170 /*************************************************
   2171 *   Scan compiled regex for recursion reference  *
   2172 *************************************************/
   2173 
   2174 /* This little function scans through a compiled pattern until it finds an
   2175 instance of OP_RECURSE.
   2176 
   2177 Arguments:
   2178   code        points to start of expression
   2179   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   2180 
   2181 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
   2182 */
   2183 
   2184 static const pcre_uchar *
   2185 find_recurse(const pcre_uchar *code, BOOL utf)
   2186 {
   2187 for (;;)
   2188   {
   2189   register pcre_uchar c = *code;
   2190   if (c == OP_END) return NULL;
   2191   if (c == OP_RECURSE) return code;
   2192 
   2193   /* XCLASS is used for classes that cannot be represented just by a bit
   2194   map. This includes negated single high-valued characters. The length in
   2195   the table is zero; the actual length is stored in the compiled code. */
   2196 
   2197   if (c == OP_XCLASS) code += GET(code, 1);
   2198 
   2199   /* Otherwise, we can get the item's length from the table, except that for
   2200   repeated character types, we have to test for \p and \P, which have an extra
   2201   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
   2202   must add in its length. */
   2203 
   2204   else
   2205     {
   2206     switch(c)
   2207       {
   2208       case OP_TYPESTAR:
   2209       case OP_TYPEMINSTAR:
   2210       case OP_TYPEPLUS:
   2211       case OP_TYPEMINPLUS:
   2212       case OP_TYPEQUERY:
   2213       case OP_TYPEMINQUERY:
   2214       case OP_TYPEPOSSTAR:
   2215       case OP_TYPEPOSPLUS:
   2216       case OP_TYPEPOSQUERY:
   2217       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2218       break;
   2219 
   2220       case OP_TYPEPOSUPTO:
   2221       case OP_TYPEUPTO:
   2222       case OP_TYPEMINUPTO:
   2223       case OP_TYPEEXACT:
   2224       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   2225         code += 2;
   2226       break;
   2227 
   2228       case OP_MARK:
   2229       case OP_PRUNE_ARG:
   2230       case OP_SKIP_ARG:
   2231       case OP_THEN_ARG:
   2232       code += code[1];
   2233       break;
   2234       }
   2235 
   2236     /* Add in the fixed length from the table */
   2237 
   2238     code += PRIV(OP_lengths)[c];
   2239 
   2240     /* In UTF-8 mode, opcodes that are followed by a character may be followed
   2241     by a multi-byte character. The length in the table is a minimum, so we have
   2242     to arrange to skip the extra bytes. */
   2243 
   2244 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   2245     if (utf) switch(c)
   2246       {
   2247       case OP_CHAR:
   2248       case OP_CHARI:
   2249       case OP_NOT:
   2250       case OP_NOTI:
   2251       case OP_EXACT:
   2252       case OP_EXACTI:
   2253       case OP_NOTEXACT:
   2254       case OP_NOTEXACTI:
   2255       case OP_UPTO:
   2256       case OP_UPTOI:
   2257       case OP_NOTUPTO:
   2258       case OP_NOTUPTOI:
   2259       case OP_MINUPTO:
   2260       case OP_MINUPTOI:
   2261       case OP_NOTMINUPTO:
   2262       case OP_NOTMINUPTOI:
   2263       case OP_POSUPTO:
   2264       case OP_POSUPTOI:
   2265       case OP_NOTPOSUPTO:
   2266       case OP_NOTPOSUPTOI:
   2267       case OP_STAR:
   2268       case OP_STARI:
   2269       case OP_NOTSTAR:
   2270       case OP_NOTSTARI:
   2271       case OP_MINSTAR:
   2272       case OP_MINSTARI:
   2273       case OP_NOTMINSTAR:
   2274       case OP_NOTMINSTARI:
   2275       case OP_POSSTAR:
   2276       case OP_POSSTARI:
   2277       case OP_NOTPOSSTAR:
   2278       case OP_NOTPOSSTARI:
   2279       case OP_PLUS:
   2280       case OP_PLUSI:
   2281       case OP_NOTPLUS:
   2282       case OP_NOTPLUSI:
   2283       case OP_MINPLUS:
   2284       case OP_MINPLUSI:
   2285       case OP_NOTMINPLUS:
   2286       case OP_NOTMINPLUSI:
   2287       case OP_POSPLUS:
   2288       case OP_POSPLUSI:
   2289       case OP_NOTPOSPLUS:
   2290       case OP_NOTPOSPLUSI:
   2291       case OP_QUERY:
   2292       case OP_QUERYI:
   2293       case OP_NOTQUERY:
   2294       case OP_NOTQUERYI:
   2295       case OP_MINQUERY:
   2296       case OP_MINQUERYI:
   2297       case OP_NOTMINQUERY:
   2298       case OP_NOTMINQUERYI:
   2299       case OP_POSQUERY:
   2300       case OP_POSQUERYI:
   2301       case OP_NOTPOSQUERY:
   2302       case OP_NOTPOSQUERYI:
   2303       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   2304       break;
   2305       }
   2306 #else
   2307     (void)(utf);  /* Keep compiler happy by referencing function argument */
   2308 #endif
   2309     }
   2310   }
   2311 }
   2312 
   2313 
   2314 
   2315 /*************************************************
   2316 *    Scan compiled branch for non-emptiness      *
   2317 *************************************************/
   2318 
   2319 /* This function scans through a branch of a compiled pattern to see whether it
   2320 can match the empty string or not. It is called from could_be_empty()
   2321 below and from compile_branch() when checking for an unlimited repeat of a
   2322 group that can match nothing. Note that first_significant_code() skips over
   2323 backward and negative forward assertions when its final argument is TRUE. If we
   2324 hit an unclosed bracket, we return "empty" - this means we've struck an inner
   2325 bracket whose current branch will already have been scanned.
   2326 
   2327 Arguments:
   2328   code        points to start of search
   2329   endcode     points to where to stop
   2330   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
   2331   cd          contains pointers to tables etc.
   2332   recurses    chain of recurse_check to catch mutual recursion
   2333 
   2334 Returns:      TRUE if what is matched could be empty
   2335 */
   2336 
   2337 typedef struct recurse_check {
   2338   struct recurse_check *prev;
   2339   const pcre_uchar *group;
   2340 } recurse_check;
   2341 
   2342 static BOOL
   2343 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
   2344   BOOL utf, compile_data *cd, recurse_check *recurses)
   2345 {
   2346 register pcre_uchar c;
   2347 recurse_check this_recurse;
   2348 
   2349 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
   2350      code < endcode;
   2351      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
   2352   {
   2353   const pcre_uchar *ccode;
   2354 
   2355   c = *code;
   2356 
   2357   /* Skip over forward assertions; the other assertions are skipped by
   2358   first_significant_code() with a TRUE final argument. */
   2359 
   2360   if (c == OP_ASSERT)
   2361     {
   2362     do code += GET(code, 1); while (*code == OP_ALT);
   2363     c = *code;
   2364     continue;
   2365     }
   2366 
   2367   /* For a recursion/subroutine call, if its end has been reached, which
   2368   implies a backward reference subroutine call, we can scan it. If it's a
   2369   forward reference subroutine call, we can't. To detect forward reference
   2370   we have to scan up the list that is kept in the workspace. This function is
   2371   called only when doing the real compile, not during the pre-compile that
   2372   measures the size of the compiled pattern. */
   2373 
   2374   if (c == OP_RECURSE)
   2375     {
   2376     const pcre_uchar *scode = cd->start_code + GET(code, 1);
   2377     const pcre_uchar *endgroup = scode;
   2378     BOOL empty_branch;
   2379 
   2380     /* Test for forward reference or uncompleted reference. This is disabled
   2381     when called to scan a completed pattern by setting cd->start_workspace to
   2382     NULL. */
   2383 
   2384     if (cd->start_workspace != NULL)
   2385       {
   2386       const pcre_uchar *tcode;
   2387       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
   2388         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
   2389       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
   2390       }
   2391 
   2392     /* If the reference is to a completed group, we need to detect whether this
   2393     is a recursive call, as otherwise there will be an infinite loop. If it is
   2394     a recursion, just skip over it. Simple recursions are easily detected. For
   2395     mutual recursions we keep a chain on the stack. */
   2396 
   2397     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
   2398     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
   2399     else
   2400       {
   2401       recurse_check *r = recurses;
   2402       for (r = recurses; r != NULL; r = r->prev)
   2403         if (r->group == scode) break;
   2404       if (r != NULL) continue;   /* Mutual recursion */
   2405       }
   2406 
   2407     /* Completed reference; scan the referenced group, remembering it on the
   2408     stack chain to detect mutual recursions. */
   2409 
   2410     empty_branch = FALSE;
   2411     this_recurse.prev = recurses;
   2412     this_recurse.group = scode;
   2413 
   2414     do
   2415       {
   2416       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
   2417         {
   2418         empty_branch = TRUE;
   2419         break;
   2420         }
   2421       scode += GET(scode, 1);
   2422       }
   2423     while (*scode == OP_ALT);
   2424 
   2425     if (!empty_branch) return FALSE;  /* All branches are non-empty */
   2426     continue;
   2427     }
   2428 
   2429   /* Groups with zero repeats can of course be empty; skip them. */
   2430 
   2431   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
   2432       c == OP_BRAPOSZERO)
   2433     {
   2434     code += PRIV(OP_lengths)[c];
   2435     do code += GET(code, 1); while (*code == OP_ALT);
   2436     c = *code;
   2437     continue;
   2438     }
   2439 
   2440   /* A nested group that is already marked as "could be empty" can just be
   2441   skipped. */
   2442 
   2443   if (c == OP_SBRA  || c == OP_SBRAPOS ||
   2444       c == OP_SCBRA || c == OP_SCBRAPOS)
   2445     {
   2446     do code += GET(code, 1); while (*code == OP_ALT);
   2447     c = *code;
   2448     continue;
   2449     }
   2450 
   2451   /* For other groups, scan the branches. */
   2452 
   2453   if (c == OP_BRA  || c == OP_BRAPOS ||
   2454       c == OP_CBRA || c == OP_CBRAPOS ||
   2455       c == OP_ONCE || c == OP_ONCE_NC ||
   2456       c == OP_COND)
   2457     {
   2458     BOOL empty_branch;
   2459     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
   2460 
   2461     /* If a conditional group has only one branch, there is a second, implied,
   2462     empty branch, so just skip over the conditional, because it could be empty.
   2463     Otherwise, scan the individual branches of the group. */
   2464 
   2465     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
   2466       code += GET(code, 1);
   2467     else
   2468       {
   2469       empty_branch = FALSE;
   2470       do
   2471         {
   2472         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
   2473           empty_branch = TRUE;
   2474         code += GET(code, 1);
   2475         }
   2476       while (*code == OP_ALT);
   2477       if (!empty_branch) return FALSE;   /* All branches are non-empty */
   2478       }
   2479 
   2480     c = *code;
   2481     continue;
   2482     }
   2483 
   2484   /* Handle the other opcodes */
   2485 
   2486   switch (c)
   2487     {
   2488     /* Check for quantifiers after a class. XCLASS is used for classes that
   2489     cannot be represented just by a bit map. This includes negated single
   2490     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
   2491     actual length is stored in the compiled code, so we must update "code"
   2492     here. */
   2493 
   2494 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   2495     case OP_XCLASS:
   2496     ccode = code += GET(code, 1);
   2497     goto CHECK_CLASS_REPEAT;
   2498 #endif
   2499 
   2500     case OP_CLASS:
   2501     case OP_NCLASS:
   2502     ccode = code + PRIV(OP_lengths)[OP_CLASS];
   2503 
   2504 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   2505     CHECK_CLASS_REPEAT:
   2506 #endif
   2507 
   2508     switch (*ccode)
   2509       {
   2510       case OP_CRSTAR:            /* These could be empty; continue */
   2511       case OP_CRMINSTAR:
   2512       case OP_CRQUERY:
   2513       case OP_CRMINQUERY:
   2514       case OP_CRPOSSTAR:
   2515       case OP_CRPOSQUERY:
   2516       break;
   2517 
   2518       default:                   /* Non-repeat => class must match */
   2519       case OP_CRPLUS:            /* These repeats aren't empty */
   2520       case OP_CRMINPLUS:
   2521       case OP_CRPOSPLUS:
   2522       return FALSE;
   2523 
   2524       case OP_CRRANGE:
   2525       case OP_CRMINRANGE:
   2526       case OP_CRPOSRANGE:
   2527       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
   2528       break;
   2529       }
   2530     break;
   2531 
   2532     /* Opcodes that must match a character */
   2533 
   2534     case OP_ANY:
   2535     case OP_ALLANY:
   2536     case OP_ANYBYTE:
   2537 
   2538     case OP_PROP:
   2539     case OP_NOTPROP:
   2540     case OP_ANYNL:
   2541 
   2542     case OP_NOT_HSPACE:
   2543     case OP_HSPACE:
   2544     case OP_NOT_VSPACE:
   2545     case OP_VSPACE:
   2546     case OP_EXTUNI:
   2547 
   2548     case OP_NOT_DIGIT:
   2549     case OP_DIGIT:
   2550     case OP_NOT_WHITESPACE:
   2551     case OP_WHITESPACE:
   2552     case OP_NOT_WORDCHAR:
   2553     case OP_WORDCHAR:
   2554 
   2555     case OP_CHAR:
   2556     case OP_CHARI:
   2557     case OP_NOT:
   2558     case OP_NOTI:
   2559 
   2560     case OP_PLUS:
   2561     case OP_PLUSI:
   2562     case OP_MINPLUS:
   2563     case OP_MINPLUSI:
   2564 
   2565     case OP_NOTPLUS:
   2566     case OP_NOTPLUSI:
   2567     case OP_NOTMINPLUS:
   2568     case OP_NOTMINPLUSI:
   2569 
   2570     case OP_POSPLUS:
   2571     case OP_POSPLUSI:
   2572     case OP_NOTPOSPLUS:
   2573     case OP_NOTPOSPLUSI:
   2574 
   2575     case OP_EXACT:
   2576     case OP_EXACTI:
   2577     case OP_NOTEXACT:
   2578     case OP_NOTEXACTI:
   2579 
   2580     case OP_TYPEPLUS:
   2581     case OP_TYPEMINPLUS:
   2582     case OP_TYPEPOSPLUS:
   2583     case OP_TYPEEXACT:
   2584 
   2585     return FALSE;
   2586 
   2587     /* These are going to continue, as they may be empty, but we have to
   2588     fudge the length for the \p and \P cases. */
   2589 
   2590     case OP_TYPESTAR:
   2591     case OP_TYPEMINSTAR:
   2592     case OP_TYPEPOSSTAR:
   2593     case OP_TYPEQUERY:
   2594     case OP_TYPEMINQUERY:
   2595     case OP_TYPEPOSQUERY:
   2596     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2597     break;
   2598 
   2599     /* Same for these */
   2600 
   2601     case OP_TYPEUPTO:
   2602     case OP_TYPEMINUPTO:
   2603     case OP_TYPEPOSUPTO:
   2604     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   2605       code += 2;
   2606     break;
   2607 
   2608     /* End of branch */
   2609 
   2610     case OP_KET:
   2611     case OP_KETRMAX:
   2612     case OP_KETRMIN:
   2613     case OP_KETRPOS:
   2614     case OP_ALT:
   2615     return TRUE;
   2616 
   2617     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
   2618     MINUPTO, and POSUPTO and their caseless and negative versions may be
   2619     followed by a multibyte character. */
   2620 
   2621 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   2622     case OP_STAR:
   2623     case OP_STARI:
   2624     case OP_NOTSTAR:
   2625     case OP_NOTSTARI:
   2626 
   2627     case OP_MINSTAR:
   2628     case OP_MINSTARI:
   2629     case OP_NOTMINSTAR:
   2630     case OP_NOTMINSTARI:
   2631 
   2632     case OP_POSSTAR:
   2633     case OP_POSSTARI:
   2634     case OP_NOTPOSSTAR:
   2635     case OP_NOTPOSSTARI:
   2636 
   2637     case OP_QUERY:
   2638     case OP_QUERYI:
   2639     case OP_NOTQUERY:
   2640     case OP_NOTQUERYI:
   2641 
   2642     case OP_MINQUERY:
   2643     case OP_MINQUERYI:
   2644     case OP_NOTMINQUERY:
   2645     case OP_NOTMINQUERYI:
   2646 
   2647     case OP_POSQUERY:
   2648     case OP_POSQUERYI:
   2649     case OP_NOTPOSQUERY:
   2650     case OP_NOTPOSQUERYI:
   2651 
   2652     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
   2653     break;
   2654 
   2655     case OP_UPTO:
   2656     case OP_UPTOI:
   2657     case OP_NOTUPTO:
   2658     case OP_NOTUPTOI:
   2659 
   2660     case OP_MINUPTO:
   2661     case OP_MINUPTOI:
   2662     case OP_NOTMINUPTO:
   2663     case OP_NOTMINUPTOI:
   2664 
   2665     case OP_POSUPTO:
   2666     case OP_POSUPTOI:
   2667     case OP_NOTPOSUPTO:
   2668     case OP_NOTPOSUPTOI:
   2669 
   2670     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
   2671     break;
   2672 #endif
   2673 
   2674     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
   2675     string. */
   2676 
   2677     case OP_MARK:
   2678     case OP_PRUNE_ARG:
   2679     case OP_SKIP_ARG:
   2680     case OP_THEN_ARG:
   2681     code += code[1];
   2682     break;
   2683 
   2684     /* None of the remaining opcodes are required to match a character. */
   2685 
   2686     default:
   2687     break;
   2688     }
   2689   }
   2690 
   2691 return TRUE;
   2692 }
   2693 
   2694 
   2695 
   2696 /*************************************************
   2697 *    Scan compiled regex for non-emptiness       *
   2698 *************************************************/
   2699 
   2700 /* This function is called to check for left recursive calls. We want to check
   2701 the current branch of the current pattern to see if it could match the empty
   2702 string. If it could, we must look outwards for branches at other levels,
   2703 stopping when we pass beyond the bracket which is the subject of the recursion.
   2704 This function is called only during the real compile, not during the
   2705 pre-compile.
   2706 
   2707 Arguments:
   2708   code        points to start of the recursion
   2709   endcode     points to where to stop (current RECURSE item)
   2710   bcptr       points to the chain of current (unclosed) branch starts
   2711   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
   2712   cd          pointers to tables etc
   2713 
   2714 Returns:      TRUE if what is matched could be empty
   2715 */
   2716 
   2717 static BOOL
   2718 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
   2719   branch_chain *bcptr, BOOL utf, compile_data *cd)
   2720 {
   2721 while (bcptr != NULL && bcptr->current_branch >= code)
   2722   {
   2723   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
   2724     return FALSE;
   2725   bcptr = bcptr->outer;
   2726   }
   2727 return TRUE;
   2728 }
   2729 
   2730 
   2731 
   2732 /*************************************************
   2733 *        Base opcode of repeated opcodes         *
   2734 *************************************************/
   2735 
   2736 /* Returns the base opcode for repeated single character type opcodes. If the
   2737 opcode is not a repeated character type, it returns with the original value.
   2738 
   2739 Arguments:  c opcode
   2740 Returns:    base opcode for the type
   2741 */
   2742 
   2743 static pcre_uchar
   2744 get_repeat_base(pcre_uchar c)
   2745 {
   2746 return (c > OP_TYPEPOSUPTO)? c :
   2747        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
   2748        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
   2749        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
   2750        (c >= OP_STARI)?      OP_STARI :
   2751                              OP_STAR;
   2752 }
   2753 
   2754 
   2755 
   2756 #ifdef SUPPORT_UCP
   2757 /*************************************************
   2758 *        Check a character and a property        *
   2759 *************************************************/
   2760 
   2761 /* This function is called by check_auto_possessive() when a property item
   2762 is adjacent to a fixed character.
   2763 
   2764 Arguments:
   2765   c            the character
   2766   ptype        the property type
   2767   pdata        the data for the type
   2768   negated      TRUE if it's a negated property (\P or \p{^)
   2769 
   2770 Returns:       TRUE if auto-possessifying is OK
   2771 */
   2772 
   2773 static BOOL
   2774 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
   2775   BOOL negated)
   2776 {
   2777 const pcre_uint32 *p;
   2778 const ucd_record *prop = GET_UCD(c);
   2779 
   2780 switch(ptype)
   2781   {
   2782   case PT_LAMP:
   2783   return (prop->chartype == ucp_Lu ||
   2784           prop->chartype == ucp_Ll ||
   2785           prop->chartype == ucp_Lt) == negated;
   2786 
   2787   case PT_GC:
   2788   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
   2789 
   2790   case PT_PC:
   2791   return (pdata == prop->chartype) == negated;
   2792 
   2793   case PT_SC:
   2794   return (pdata == prop->script) == negated;
   2795 
   2796   /* These are specials */
   2797 
   2798   case PT_ALNUM:
   2799   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
   2800           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
   2801 
   2802   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
   2803   means that Perl space and POSIX space are now identical. PCRE was changed
   2804   at release 8.34. */
   2805 
   2806   case PT_SPACE:    /* Perl space */
   2807   case PT_PXSPACE:  /* POSIX space */
   2808   switch(c)
   2809     {
   2810     HSPACE_CASES:
   2811     VSPACE_CASES:
   2812     return negated;
   2813 
   2814     default:
   2815     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
   2816     }
   2817   break;  /* Control never reaches here */
   2818 
   2819   case PT_WORD:
   2820   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
   2821           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
   2822           c == CHAR_UNDERSCORE) == negated;
   2823 
   2824   case PT_CLIST:
   2825   p = PRIV(ucd_caseless_sets) + prop->caseset;
   2826   for (;;)
   2827     {
   2828     if (c < *p) return !negated;
   2829     if (c == *p++) return negated;
   2830     }
   2831   break;  /* Control never reaches here */
   2832   }
   2833 
   2834 return FALSE;
   2835 }
   2836 #endif  /* SUPPORT_UCP */
   2837 
   2838 
   2839 
   2840 /*************************************************
   2841 *        Fill the character property list        *
   2842 *************************************************/
   2843 
   2844 /* Checks whether the code points to an opcode that can take part in auto-
   2845 possessification, and if so, fills a list with its properties.
   2846 
   2847 Arguments:
   2848   code        points to start of expression
   2849   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
   2850   fcc         points to case-flipping table
   2851   list        points to output list
   2852               list[0] will be filled with the opcode
   2853               list[1] will be non-zero if this opcode
   2854                 can match an empty character string
   2855               list[2..7] depends on the opcode
   2856 
   2857 Returns:      points to the start of the next opcode if *code is accepted
   2858               NULL if *code is not accepted
   2859 */
   2860 
   2861 static const pcre_uchar *
   2862 get_chr_property_list(const pcre_uchar *code, BOOL utf,
   2863   const pcre_uint8 *fcc, pcre_uint32 *list)
   2864 {
   2865 pcre_uchar c = *code;
   2866 pcre_uchar base;
   2867 const pcre_uchar *end;
   2868 pcre_uint32 chr;
   2869 
   2870 #ifdef SUPPORT_UCP
   2871 pcre_uint32 *clist_dest;
   2872 const pcre_uint32 *clist_src;
   2873 #else
   2874 utf = utf;  /* Suppress "unused parameter" compiler warning */
   2875 #endif
   2876 
   2877 list[0] = c;
   2878 list[1] = FALSE;
   2879 code++;
   2880 
   2881 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
   2882   {
   2883   base = get_repeat_base(c);
   2884   c -= (base - OP_STAR);
   2885 
   2886   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
   2887     code += IMM2_SIZE;
   2888 
   2889   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
   2890 
   2891   switch(base)
   2892     {
   2893     case OP_STAR:
   2894     list[0] = OP_CHAR;
   2895     break;
   2896 
   2897     case OP_STARI:
   2898     list[0] = OP_CHARI;
   2899     break;
   2900 
   2901     case OP_NOTSTAR:
   2902     list[0] = OP_NOT;
   2903     break;
   2904 
   2905     case OP_NOTSTARI:
   2906     list[0] = OP_NOTI;
   2907     break;
   2908 
   2909     case OP_TYPESTAR:
   2910     list[0] = *code;
   2911     code++;
   2912     break;
   2913     }
   2914   c = list[0];
   2915   }
   2916 
   2917 switch(c)
   2918   {
   2919   case OP_NOT_DIGIT:
   2920   case OP_DIGIT:
   2921   case OP_NOT_WHITESPACE:
   2922   case OP_WHITESPACE:
   2923   case OP_NOT_WORDCHAR:
   2924   case OP_WORDCHAR:
   2925   case OP_ANY:
   2926   case OP_ALLANY:
   2927   case OP_ANYNL:
   2928   case OP_NOT_HSPACE:
   2929   case OP_HSPACE:
   2930   case OP_NOT_VSPACE:
   2931   case OP_VSPACE:
   2932   case OP_EXTUNI:
   2933   case OP_EODN:
   2934   case OP_EOD:
   2935   case OP_DOLL:
   2936   case OP_DOLLM:
   2937   return code;
   2938 
   2939   case OP_CHAR:
   2940   case OP_NOT:
   2941   GETCHARINCTEST(chr, code);
   2942   list[2] = chr;
   2943   list[3] = NOTACHAR;
   2944   return code;
   2945 
   2946   case OP_CHARI:
   2947   case OP_NOTI:
   2948   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
   2949   GETCHARINCTEST(chr, code);
   2950   list[2] = chr;
   2951 
   2952 #ifdef SUPPORT_UCP
   2953   if (chr < 128 || (chr < 256 && !utf))
   2954     list[3] = fcc[chr];
   2955   else
   2956     list[3] = UCD_OTHERCASE(chr);
   2957 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
   2958   list[3] = (chr < 256) ? fcc[chr] : chr;
   2959 #else
   2960   list[3] = fcc[chr];
   2961 #endif
   2962 
   2963   /* The othercase might be the same value. */
   2964 
   2965   if (chr == list[3])
   2966     list[3] = NOTACHAR;
   2967   else
   2968     list[4] = NOTACHAR;
   2969   return code;
   2970 
   2971 #ifdef SUPPORT_UCP
   2972   case OP_PROP:
   2973   case OP_NOTPROP:
   2974   if (code[0] != PT_CLIST)
   2975     {
   2976     list[2] = code[0];
   2977     list[3] = code[1];
   2978     return code + 2;
   2979     }
   2980 
   2981   /* Convert only if we have enough space. */
   2982 
   2983   clist_src = PRIV(ucd_caseless_sets) + code[1];
   2984   clist_dest = list + 2;
   2985   code += 2;
   2986 
   2987   do {
   2988      if (clist_dest >= list + 8)
   2989        {
   2990        /* Early return if there is not enough space. This should never
   2991        happen, since all clists are shorter than 5 character now. */
   2992        list[2] = code[0];
   2993        list[3] = code[1];
   2994        return code;
   2995        }
   2996      *clist_dest++ = *clist_src;
   2997      }
   2998   while(*clist_src++ != NOTACHAR);
   2999 
   3000   /* All characters are stored. The terminating NOTACHAR
   3001   is copied form the clist itself. */
   3002 
   3003   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
   3004   return code;
   3005 #endif
   3006 
   3007   case OP_NCLASS:
   3008   case OP_CLASS:
   3009 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3010   case OP_XCLASS:
   3011   if (c == OP_XCLASS)
   3012     end = code + GET(code, 0) - 1;
   3013   else
   3014 #endif
   3015     end = code + 32 / sizeof(pcre_uchar);
   3016 
   3017   switch(*end)
   3018     {
   3019     case OP_CRSTAR:
   3020     case OP_CRMINSTAR:
   3021     case OP_CRQUERY:
   3022     case OP_CRMINQUERY:
   3023     case OP_CRPOSSTAR:
   3024     case OP_CRPOSQUERY:
   3025     list[1] = TRUE;
   3026     end++;
   3027     break;
   3028 
   3029     case OP_CRPLUS:
   3030     case OP_CRMINPLUS:
   3031     case OP_CRPOSPLUS:
   3032     end++;
   3033     break;
   3034 
   3035     case OP_CRRANGE:
   3036     case OP_CRMINRANGE:
   3037     case OP_CRPOSRANGE:
   3038     list[1] = (GET2(end, 1) == 0);
   3039     end += 1 + 2 * IMM2_SIZE;
   3040     break;
   3041     }
   3042   list[2] = (pcre_uint32)(end - code);
   3043   return end;
   3044   }
   3045 return NULL;    /* Opcode not accepted */
   3046 }
   3047 
   3048 
   3049 
   3050 /*************************************************
   3051 *    Scan further character sets for match       *
   3052 *************************************************/
   3053 
   3054 /* Checks whether the base and the current opcode have a common character, in
   3055 which case the base cannot be possessified.
   3056 
   3057 Arguments:
   3058   code        points to the byte code
   3059   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   3060   cd          static compile data
   3061   base_list   the data list of the base opcode
   3062 
   3063 Returns:      TRUE if the auto-possessification is possible
   3064 */
   3065 
   3066 static BOOL
   3067 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
   3068   const pcre_uint32 *base_list, const pcre_uchar *base_end)
   3069 {
   3070 pcre_uchar c;
   3071 pcre_uint32 list[8];
   3072 const pcre_uint32 *chr_ptr;
   3073 const pcre_uint32 *ochr_ptr;
   3074 const pcre_uint32 *list_ptr;
   3075 const pcre_uchar *next_code;
   3076 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3077 const pcre_uchar *xclass_flags;
   3078 #endif
   3079 const pcre_uint8 *class_bitset;
   3080 const pcre_uint8 *set1, *set2, *set_end;
   3081 pcre_uint32 chr;
   3082 BOOL accepted, invert_bits;
   3083 BOOL entered_a_group = FALSE;
   3084 
   3085 /* Note: the base_list[1] contains whether the current opcode has greedy
   3086 (represented by a non-zero value) quantifier. This is a different from
   3087 other character type lists, which stores here that the character iterator
   3088 matches to an empty string (also represented by a non-zero value). */
   3089 
   3090 for(;;)
   3091   {
   3092   /* All operations move the code pointer forward.
   3093   Therefore infinite recursions are not possible. */
   3094 
   3095   c = *code;
   3096 
   3097   /* Skip over callouts */
   3098 
   3099   if (c == OP_CALLOUT)
   3100     {
   3101     code += PRIV(OP_lengths)[c];
   3102     continue;
   3103     }
   3104 
   3105   if (c == OP_ALT)
   3106     {
   3107     do code += GET(code, 1); while (*code == OP_ALT);
   3108     c = *code;
   3109     }
   3110 
   3111   switch(c)
   3112     {
   3113     case OP_END:
   3114     case OP_KETRPOS:
   3115     /* TRUE only in greedy case. The non-greedy case could be replaced by
   3116     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
   3117     uses more memory, which we cannot get at this stage.) */
   3118 
   3119     return base_list[1] != 0;
   3120 
   3121     case OP_KET:
   3122     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
   3123     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
   3124     cannot be converted to a possessive form. */
   3125 
   3126     if (base_list[1] == 0) return FALSE;
   3127 
   3128     switch(*(code - GET(code, 1)))
   3129       {
   3130       case OP_ASSERT:
   3131       case OP_ASSERT_NOT:
   3132       case OP_ASSERTBACK:
   3133       case OP_ASSERTBACK_NOT:
   3134       case OP_ONCE:
   3135       case OP_ONCE_NC:
   3136       /* Atomic sub-patterns and assertions can always auto-possessify their
   3137       last iterator. However, if the group was entered as a result of checking
   3138       a previous iterator, this is not possible. */
   3139 
   3140       return !entered_a_group;
   3141       }
   3142 
   3143     code += PRIV(OP_lengths)[c];
   3144     continue;
   3145 
   3146     case OP_ONCE:
   3147     case OP_ONCE_NC:
   3148     case OP_BRA:
   3149     case OP_CBRA:
   3150     next_code = code + GET(code, 1);
   3151     code += PRIV(OP_lengths)[c];
   3152 
   3153     while (*next_code == OP_ALT)
   3154       {
   3155       if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
   3156       code = next_code + 1 + LINK_SIZE;
   3157       next_code += GET(next_code, 1);
   3158       }
   3159 
   3160     entered_a_group = TRUE;
   3161     continue;
   3162 
   3163     case OP_BRAZERO:
   3164     case OP_BRAMINZERO:
   3165 
   3166     next_code = code + 1;
   3167     if (*next_code != OP_BRA && *next_code != OP_CBRA
   3168         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
   3169 
   3170     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
   3171 
   3172     /* The bracket content will be checked by the
   3173     OP_BRA/OP_CBRA case above. */
   3174     next_code += 1 + LINK_SIZE;
   3175     if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
   3176       return FALSE;
   3177 
   3178     code += PRIV(OP_lengths)[c];
   3179     continue;
   3180 
   3181     default:
   3182     break;
   3183     }
   3184 
   3185   /* Check for a supported opcode, and load its properties. */
   3186 
   3187   code = get_chr_property_list(code, utf, cd->fcc, list);
   3188   if (code == NULL) return FALSE;    /* Unsupported */
   3189 
   3190   /* If either opcode is a small character list, set pointers for comparing
   3191   characters from that list with another list, or with a property. */
   3192 
   3193   if (base_list[0] == OP_CHAR)
   3194     {
   3195     chr_ptr = base_list + 2;
   3196     list_ptr = list;
   3197     }
   3198   else if (list[0] == OP_CHAR)
   3199     {
   3200     chr_ptr = list + 2;
   3201     list_ptr = base_list;
   3202     }
   3203 
   3204   /* Character bitsets can also be compared to certain opcodes. */
   3205 
   3206   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
   3207 #ifdef COMPILE_PCRE8
   3208       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
   3209       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
   3210 #endif
   3211       )
   3212     {
   3213 #ifdef COMPILE_PCRE8
   3214     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
   3215 #else
   3216     if (base_list[0] == OP_CLASS)
   3217 #endif
   3218       {
   3219       set1 = (pcre_uint8 *)(base_end - base_list[2]);
   3220       list_ptr = list;
   3221       }
   3222     else
   3223       {
   3224       set1 = (pcre_uint8 *)(code - list[2]);
   3225       list_ptr = base_list;
   3226       }
   3227 
   3228     invert_bits = FALSE;
   3229     switch(list_ptr[0])
   3230       {
   3231       case OP_CLASS:
   3232       case OP_NCLASS:
   3233       set2 = (pcre_uint8 *)
   3234         ((list_ptr == list ? code : base_end) - list_ptr[2]);
   3235       break;
   3236 
   3237 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3238       case OP_XCLASS:
   3239       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
   3240       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
   3241       if ((*xclass_flags & XCL_MAP) == 0)
   3242         {
   3243         /* No bits are set for characters < 256. */
   3244         if (list[1] == 0) return TRUE;
   3245         /* Might be an empty repeat. */
   3246         continue;
   3247         }
   3248       set2 = (pcre_uint8 *)(xclass_flags + 1);
   3249       break;
   3250 #endif
   3251 
   3252       case OP_NOT_DIGIT:
   3253       invert_bits = TRUE;
   3254       /* Fall through */
   3255       case OP_DIGIT:
   3256       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
   3257       break;
   3258 
   3259       case OP_NOT_WHITESPACE:
   3260       invert_bits = TRUE;
   3261       /* Fall through */
   3262       case OP_WHITESPACE:
   3263       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
   3264       break;
   3265 
   3266       case OP_NOT_WORDCHAR:
   3267       invert_bits = TRUE;
   3268       /* Fall through */
   3269       case OP_WORDCHAR:
   3270       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
   3271       break;
   3272 
   3273       default:
   3274       return FALSE;
   3275       }
   3276 
   3277     /* Because the sets are unaligned, we need
   3278     to perform byte comparison here. */
   3279     set_end = set1 + 32;
   3280     if (invert_bits)
   3281       {
   3282       do
   3283         {
   3284         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
   3285         }
   3286       while (set1 < set_end);
   3287       }
   3288     else
   3289       {
   3290       do
   3291         {
   3292         if ((*set1++ & *set2++) != 0) return FALSE;
   3293         }
   3294       while (set1 < set_end);
   3295       }
   3296 
   3297     if (list[1] == 0) return TRUE;
   3298     /* Might be an empty repeat. */
   3299     continue;
   3300     }
   3301 
   3302   /* Some property combinations also acceptable. Unicode property opcodes are
   3303   processed specially; the rest can be handled with a lookup table. */
   3304 
   3305   else
   3306     {
   3307     pcre_uint32 leftop, rightop;
   3308 
   3309     leftop = base_list[0];
   3310     rightop = list[0];
   3311 
   3312 #ifdef SUPPORT_UCP
   3313     accepted = FALSE; /* Always set in non-unicode case. */
   3314     if (leftop == OP_PROP || leftop == OP_NOTPROP)
   3315       {
   3316       if (rightop == OP_EOD)
   3317         accepted = TRUE;
   3318       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
   3319         {
   3320         int n;
   3321         const pcre_uint8 *p;
   3322         BOOL same = leftop == rightop;
   3323         BOOL lisprop = leftop == OP_PROP;
   3324         BOOL risprop = rightop == OP_PROP;
   3325         BOOL bothprop = lisprop && risprop;
   3326 
   3327         /* There's a table that specifies how each combination is to be
   3328         processed:
   3329           0   Always return FALSE (never auto-possessify)
   3330           1   Character groups are distinct (possessify if both are OP_PROP)
   3331           2   Check character categories in the same group (general or particular)
   3332           3   Return TRUE if the two opcodes are not the same
   3333           ... see comments below
   3334         */
   3335 
   3336         n = propposstab[base_list[2]][list[2]];
   3337         switch(n)
   3338           {
   3339           case 0: break;
   3340           case 1: accepted = bothprop; break;
   3341           case 2: accepted = (base_list[3] == list[3]) != same; break;
   3342           case 3: accepted = !same; break;
   3343 
   3344           case 4:  /* Left general category, right particular category */
   3345           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
   3346           break;
   3347 
   3348           case 5:  /* Right general category, left particular category */
   3349           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
   3350           break;
   3351 
   3352           /* This code is logically tricky. Think hard before fiddling with it.
   3353           The posspropstab table has four entries per row. Each row relates to
   3354           one of PCRE's special properties such as ALNUM or SPACE or WORD.
   3355           Only WORD actually needs all four entries, but using repeats for the
   3356           others means they can all use the same code below.
   3357 
   3358           The first two entries in each row are Unicode general categories, and
   3359           apply always, because all the characters they include are part of the
   3360           PCRE character set. The third and fourth entries are a general and a
   3361           particular category, respectively, that include one or more relevant
   3362           characters. One or the other is used, depending on whether the check
   3363           is for a general or a particular category. However, in both cases the
   3364           category contains more characters than the specials that are defined
   3365           for the property being tested against. Therefore, it cannot be used
   3366           in a NOTPROP case.
   3367 
   3368           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
   3369           Underscore is covered by ucp_P or ucp_Po. */
   3370 
   3371           case 6:  /* Left alphanum vs right general category */
   3372           case 7:  /* Left space vs right general category */
   3373           case 8:  /* Left word vs right general category */
   3374           p = posspropstab[n-6];
   3375           accepted = risprop && lisprop ==
   3376             (list[3] != p[0] &&
   3377              list[3] != p[1] &&
   3378             (list[3] != p[2] || !lisprop));
   3379           break;
   3380 
   3381           case 9:   /* Right alphanum vs left general category */
   3382           case 10:  /* Right space vs left general category */
   3383           case 11:  /* Right word vs left general category */
   3384           p = posspropstab[n-9];
   3385           accepted = lisprop && risprop ==
   3386             (base_list[3] != p[0] &&
   3387              base_list[3] != p[1] &&
   3388             (base_list[3] != p[2] || !risprop));
   3389           break;
   3390 
   3391           case 12:  /* Left alphanum vs right particular category */
   3392           case 13:  /* Left space vs right particular category */
   3393           case 14:  /* Left word vs right particular category */
   3394           p = posspropstab[n-12];
   3395           accepted = risprop && lisprop ==
   3396             (catposstab[p[0]][list[3]] &&
   3397              catposstab[p[1]][list[3]] &&
   3398             (list[3] != p[3] || !lisprop));
   3399           break;
   3400 
   3401           case 15:  /* Right alphanum vs left particular category */
   3402           case 16:  /* Right space vs left particular category */
   3403           case 17:  /* Right word vs left particular category */
   3404           p = posspropstab[n-15];
   3405           accepted = lisprop && risprop ==
   3406             (catposstab[p[0]][base_list[3]] &&
   3407              catposstab[p[1]][base_list[3]] &&
   3408             (base_list[3] != p[3] || !risprop));
   3409           break;
   3410           }
   3411         }
   3412       }
   3413 
   3414     else
   3415 #endif  /* SUPPORT_UCP */
   3416 
   3417     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
   3418            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
   3419            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
   3420 
   3421     if (!accepted) return FALSE;
   3422 
   3423     if (list[1] == 0) return TRUE;
   3424     /* Might be an empty repeat. */
   3425     continue;
   3426     }
   3427 
   3428   /* Control reaches here only if one of the items is a small character list.
   3429   All characters are checked against the other side. */
   3430 
   3431   do
   3432     {
   3433     chr = *chr_ptr;
   3434 
   3435     switch(list_ptr[0])
   3436       {
   3437       case OP_CHAR:
   3438       ochr_ptr = list_ptr + 2;
   3439       do
   3440         {
   3441         if (chr == *ochr_ptr) return FALSE;
   3442         ochr_ptr++;
   3443         }
   3444       while(*ochr_ptr != NOTACHAR);
   3445       break;
   3446 
   3447       case OP_NOT:
   3448       ochr_ptr = list_ptr + 2;
   3449       do
   3450         {
   3451         if (chr == *ochr_ptr)
   3452           break;
   3453         ochr_ptr++;
   3454         }
   3455       while(*ochr_ptr != NOTACHAR);
   3456       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
   3457       break;
   3458 
   3459       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
   3460       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
   3461 
   3462       case OP_DIGIT:
   3463       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
   3464       break;
   3465 
   3466       case OP_NOT_DIGIT:
   3467       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
   3468       break;
   3469 
   3470       case OP_WHITESPACE:
   3471       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
   3472       break;
   3473 
   3474       case OP_NOT_WHITESPACE:
   3475       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
   3476       break;
   3477 
   3478       case OP_WORDCHAR:
   3479       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
   3480       break;
   3481 
   3482       case OP_NOT_WORDCHAR:
   3483       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
   3484       break;
   3485 
   3486       case OP_HSPACE:
   3487       switch(chr)
   3488         {
   3489         HSPACE_CASES: return FALSE;
   3490         default: break;
   3491         }
   3492       break;
   3493 
   3494       case OP_NOT_HSPACE:
   3495       switch(chr)
   3496         {
   3497         HSPACE_CASES: break;
   3498         default: return FALSE;
   3499         }
   3500       break;
   3501 
   3502       case OP_ANYNL:
   3503       case OP_VSPACE:
   3504       switch(chr)
   3505         {
   3506         VSPACE_CASES: return FALSE;
   3507         default: break;
   3508         }
   3509       break;
   3510 
   3511       case OP_NOT_VSPACE:
   3512       switch(chr)
   3513         {
   3514         VSPACE_CASES: break;
   3515         default: return FALSE;
   3516         }
   3517       break;
   3518 
   3519       case OP_DOLL:
   3520       case OP_EODN:
   3521       switch (chr)
   3522         {
   3523         case CHAR_CR:
   3524         case CHAR_LF:
   3525         case CHAR_VT:
   3526         case CHAR_FF:
   3527         case CHAR_NEL:
   3528 #ifndef EBCDIC
   3529         case 0x2028:
   3530         case 0x2029:
   3531 #endif  /* Not EBCDIC */
   3532         return FALSE;
   3533         }
   3534       break;
   3535 
   3536       case OP_EOD:    /* Can always possessify before \z */
   3537       break;
   3538 
   3539 #ifdef SUPPORT_UCP
   3540       case OP_PROP:
   3541       case OP_NOTPROP:
   3542       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
   3543             list_ptr[0] == OP_NOTPROP))
   3544         return FALSE;
   3545       break;
   3546 #endif
   3547 
   3548       case OP_NCLASS:
   3549       if (chr > 255) return FALSE;
   3550       /* Fall through */
   3551 
   3552       case OP_CLASS:
   3553       if (chr > 255) break;
   3554       class_bitset = (pcre_uint8 *)
   3555         ((list_ptr == list ? code : base_end) - list_ptr[2]);
   3556       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
   3557       break;
   3558 
   3559 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3560       case OP_XCLASS:
   3561       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
   3562           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
   3563       break;
   3564 #endif
   3565 
   3566       default:
   3567       return FALSE;
   3568       }
   3569 
   3570     chr_ptr++;
   3571     }
   3572   while(*chr_ptr != NOTACHAR);
   3573 
   3574   /* At least one character must be matched from this opcode. */
   3575 
   3576   if (list[1] == 0) return TRUE;
   3577   }
   3578 
   3579 /* Control never reaches here. There used to be a fail-save return FALSE; here,
   3580 but some compilers complain about an unreachable statement. */
   3581 
   3582 }
   3583 
   3584 
   3585 
   3586 /*************************************************
   3587 *    Scan compiled regex for auto-possession     *
   3588 *************************************************/
   3589 
   3590 /* Replaces single character iterations with their possessive alternatives
   3591 if appropriate. This function modifies the compiled opcode!
   3592 
   3593 Arguments:
   3594   code        points to start of the byte code
   3595   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   3596   cd          static compile data
   3597 
   3598 Returns:      nothing
   3599 */
   3600 
   3601 static void
   3602 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
   3603 {
   3604 register pcre_uchar c;
   3605 const pcre_uchar *end;
   3606 pcre_uchar *repeat_opcode;
   3607 pcre_uint32 list[8];
   3608 
   3609 for (;;)
   3610   {
   3611   c = *code;
   3612 
   3613   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
   3614     {
   3615     c -= get_repeat_base(c) - OP_STAR;
   3616     end = (c <= OP_MINUPTO) ?
   3617       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
   3618     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
   3619 
   3620     if (end != NULL && compare_opcodes(end, utf, cd, list, end))
   3621       {
   3622       switch(c)
   3623         {
   3624         case OP_STAR:
   3625         *code += OP_POSSTAR - OP_STAR;
   3626         break;
   3627 
   3628         case OP_MINSTAR:
   3629         *code += OP_POSSTAR - OP_MINSTAR;
   3630         break;
   3631 
   3632         case OP_PLUS:
   3633         *code += OP_POSPLUS - OP_PLUS;
   3634         break;
   3635 
   3636         case OP_MINPLUS:
   3637         *code += OP_POSPLUS - OP_MINPLUS;
   3638         break;
   3639 
   3640         case OP_QUERY:
   3641         *code += OP_POSQUERY - OP_QUERY;
   3642         break;
   3643 
   3644         case OP_MINQUERY:
   3645         *code += OP_POSQUERY - OP_MINQUERY;
   3646         break;
   3647 
   3648         case OP_UPTO:
   3649         *code += OP_POSUPTO - OP_UPTO;
   3650         break;
   3651 
   3652         case OP_MINUPTO:
   3653         *code += OP_POSUPTO - OP_MINUPTO;
   3654         break;
   3655         }
   3656       }
   3657     c = *code;
   3658     }
   3659   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
   3660     {
   3661 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3662     if (c == OP_XCLASS)
   3663       repeat_opcode = code + GET(code, 1);
   3664     else
   3665 #endif
   3666       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
   3667 
   3668     c = *repeat_opcode;
   3669     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
   3670       {
   3671       /* end must not be NULL. */
   3672       end = get_chr_property_list(code, utf, cd->fcc, list);
   3673 
   3674       list[1] = (c & 1) == 0;
   3675 
   3676       if (compare_opcodes(end, utf, cd, list, end))
   3677         {
   3678         switch (c)
   3679           {
   3680           case OP_CRSTAR:
   3681           case OP_CRMINSTAR:
   3682           *repeat_opcode = OP_CRPOSSTAR;
   3683           break;
   3684 
   3685           case OP_CRPLUS:
   3686           case OP_CRMINPLUS:
   3687           *repeat_opcode = OP_CRPOSPLUS;
   3688           break;
   3689 
   3690           case OP_CRQUERY:
   3691           case OP_CRMINQUERY:
   3692           *repeat_opcode = OP_CRPOSQUERY;
   3693           break;
   3694 
   3695           case OP_CRRANGE:
   3696           case OP_CRMINRANGE:
   3697           *repeat_opcode = OP_CRPOSRANGE;
   3698           break;
   3699           }
   3700         }
   3701       }
   3702     c = *code;
   3703     }
   3704 
   3705   switch(c)
   3706     {
   3707     case OP_END:
   3708     return;
   3709 
   3710     case OP_TYPESTAR:
   3711     case OP_TYPEMINSTAR:
   3712     case OP_TYPEPLUS:
   3713     case OP_TYPEMINPLUS:
   3714     case OP_TYPEQUERY:
   3715     case OP_TYPEMINQUERY:
   3716     case OP_TYPEPOSSTAR:
   3717     case OP_TYPEPOSPLUS:
   3718     case OP_TYPEPOSQUERY:
   3719     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   3720     break;
   3721 
   3722     case OP_TYPEUPTO:
   3723     case OP_TYPEMINUPTO:
   3724     case OP_TYPEEXACT:
   3725     case OP_TYPEPOSUPTO:
   3726     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   3727       code += 2;
   3728     break;
   3729 
   3730 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3731     case OP_XCLASS:
   3732     code += GET(code, 1);
   3733     break;
   3734 #endif
   3735 
   3736     case OP_MARK:
   3737     case OP_PRUNE_ARG:
   3738     case OP_SKIP_ARG:
   3739     case OP_THEN_ARG:
   3740     code += code[1];
   3741     break;
   3742     }
   3743 
   3744   /* Add in the fixed length from the table */
   3745 
   3746   code += PRIV(OP_lengths)[c];
   3747 
   3748   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
   3749   a multi-byte character. The length in the table is a minimum, so we have to
   3750   arrange to skip the extra bytes. */
   3751 
   3752 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   3753   if (utf) switch(c)
   3754     {
   3755     case OP_CHAR:
   3756     case OP_CHARI:
   3757     case OP_NOT:
   3758     case OP_NOTI:
   3759     case OP_STAR:
   3760     case OP_MINSTAR:
   3761     case OP_PLUS:
   3762     case OP_MINPLUS:
   3763     case OP_QUERY:
   3764     case OP_MINQUERY:
   3765     case OP_UPTO:
   3766     case OP_MINUPTO:
   3767     case OP_EXACT:
   3768     case OP_POSSTAR:
   3769     case OP_POSPLUS:
   3770     case OP_POSQUERY:
   3771     case OP_POSUPTO:
   3772     case OP_STARI:
   3773     case OP_MINSTARI:
   3774     case OP_PLUSI:
   3775     case OP_MINPLUSI:
   3776     case OP_QUERYI:
   3777     case OP_MINQUERYI:
   3778     case OP_UPTOI:
   3779     case OP_MINUPTOI:
   3780     case OP_EXACTI:
   3781     case OP_POSSTARI:
   3782     case OP_POSPLUSI:
   3783     case OP_POSQUERYI:
   3784     case OP_POSUPTOI:
   3785     case OP_NOTSTAR:
   3786     case OP_NOTMINSTAR:
   3787     case OP_NOTPLUS:
   3788     case OP_NOTMINPLUS:
   3789     case OP_NOTQUERY:
   3790     case OP_NOTMINQUERY:
   3791     case OP_NOTUPTO:
   3792     case OP_NOTMINUPTO:
   3793     case OP_NOTEXACT:
   3794     case OP_NOTPOSSTAR:
   3795     case OP_NOTPOSPLUS:
   3796     case OP_NOTPOSQUERY:
   3797     case OP_NOTPOSUPTO:
   3798     case OP_NOTSTARI:
   3799     case OP_NOTMINSTARI:
   3800     case OP_NOTPLUSI:
   3801     case OP_NOTMINPLUSI:
   3802     case OP_NOTQUERYI:
   3803     case OP_NOTMINQUERYI:
   3804     case OP_NOTUPTOI:
   3805     case OP_NOTMINUPTOI:
   3806     case OP_NOTEXACTI:
   3807     case OP_NOTPOSSTARI:
   3808     case OP_NOTPOSPLUSI:
   3809     case OP_NOTPOSQUERYI:
   3810     case OP_NOTPOSUPTOI:
   3811     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   3812     break;
   3813     }
   3814 #else
   3815   (void)(utf);  /* Keep compiler happy by referencing function argument */
   3816 #endif
   3817   }
   3818 }
   3819 
   3820 
   3821 
   3822 /*************************************************
   3823 *           Check for POSIX class syntax         *
   3824 *************************************************/
   3825 
   3826 /* This function is called when the sequence "[:" or "[." or "[=" is
   3827 encountered in a character class. It checks whether this is followed by a
   3828 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
   3829 reach an unescaped ']' without the special preceding character, return FALSE.
   3830 
   3831 Originally, this function only recognized a sequence of letters between the
   3832 terminators, but it seems that Perl recognizes any sequence of characters,
   3833 though of course unknown POSIX names are subsequently rejected. Perl gives an
   3834 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
   3835 didn't consider this to be a POSIX class. Likewise for [:1234:].
   3836 
   3837 The problem in trying to be exactly like Perl is in the handling of escapes. We
   3838 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
   3839 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
   3840 below handles the special case of \], but does not try to do any other escape
   3841 processing. This makes it different from Perl for cases such as [:l\ower:]
   3842 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
   3843 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
   3844 I think.
   3845 
   3846 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
   3847 It seems that the appearance of a nested POSIX class supersedes an apparent
   3848 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
   3849 a digit.
   3850 
   3851 In Perl, unescaped square brackets may also appear as part of class names. For
   3852 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
   3853 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
   3854 seem right at all. PCRE does not allow closing square brackets in POSIX class
   3855 names.
   3856 
   3857 Arguments:
   3858   ptr      pointer to the initial [
   3859   endptr   where to return the end pointer
   3860 
   3861 Returns:   TRUE or FALSE
   3862 */
   3863 
   3864 static BOOL
   3865 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
   3866 {
   3867 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
   3868 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
   3869 for (++ptr; *ptr != CHAR_NULL; ptr++)
   3870   {
   3871   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   3872     ptr++;
   3873   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
   3874   else
   3875     {
   3876     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   3877       {
   3878       *endptr = ptr;
   3879       return TRUE;
   3880       }
   3881     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
   3882          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   3883           ptr[1] == CHAR_EQUALS_SIGN) &&
   3884         check_posix_syntax(ptr, endptr))
   3885       return FALSE;
   3886     }
   3887   }
   3888 return FALSE;
   3889 }
   3890 
   3891 
   3892 
   3893 
   3894 /*************************************************
   3895 *          Check POSIX class name                *
   3896 *************************************************/
   3897 
   3898 /* This function is called to check the name given in a POSIX-style class entry
   3899 such as [:alnum:].
   3900 
   3901 Arguments:
   3902   ptr        points to the first letter
   3903   len        the length of the name
   3904 
   3905 Returns:     a value representing the name, or -1 if unknown
   3906 */
   3907 
   3908 static int
   3909 check_posix_name(const pcre_uchar *ptr, int len)
   3910 {
   3911 const char *pn = posix_names;
   3912 register int yield = 0;
   3913 while (posix_name_lengths[yield] != 0)
   3914   {
   3915   if (len == posix_name_lengths[yield] &&
   3916     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
   3917   pn += posix_name_lengths[yield] + 1;
   3918   yield++;
   3919   }
   3920 return -1;
   3921 }
   3922 
   3923 
   3924 /*************************************************
   3925 *    Adjust OP_RECURSE items in repeated group   *
   3926 *************************************************/
   3927 
   3928 /* OP_RECURSE items contain an offset from the start of the regex to the group
   3929 that is referenced. This means that groups can be replicated for fixed
   3930 repetition simply by copying (because the recursion is allowed to refer to
   3931 earlier groups that are outside the current group). However, when a group is
   3932 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
   3933 inserted before it, after it has been compiled. This means that any OP_RECURSE
   3934 items within it that refer to the group itself or any contained groups have to
   3935 have their offsets adjusted. That one of the jobs of this function. Before it
   3936 is called, the partially compiled regex must be temporarily terminated with
   3937 OP_END.
   3938 
   3939 This function has been extended with the possibility of forward references for
   3940 recursions and subroutine calls. It must also check the list of such references
   3941 for the group we are dealing with. If it finds that one of the recursions in
   3942 the current group is on this list, it adjusts the offset in the list, not the
   3943 value in the reference (which is a group number).
   3944 
   3945 Arguments:
   3946   group      points to the start of the group
   3947   adjust     the amount by which the group is to be moved
   3948   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
   3949   cd         contains pointers to tables etc.
   3950   save_hwm   the hwm forward reference pointer at the start of the group
   3951 
   3952 Returns:     nothing
   3953 */
   3954 
   3955 static void
   3956 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
   3957   pcre_uchar *save_hwm)
   3958 {
   3959 pcre_uchar *ptr = group;
   3960 
   3961 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
   3962   {
   3963   int offset;
   3964   pcre_uchar *hc;
   3965 
   3966   /* See if this recursion is on the forward reference list. If so, adjust the
   3967   reference. */
   3968 
   3969   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
   3970     {
   3971     offset = (int)GET(hc, 0);
   3972     if (cd->start_code + offset == ptr + 1)
   3973       {
   3974       PUT(hc, 0, offset + adjust);
   3975       break;
   3976       }
   3977     }
   3978 
   3979   /* Otherwise, adjust the recursion offset if it's after the start of this
   3980   group. */
   3981 
   3982   if (hc >= cd->hwm)
   3983     {
   3984     offset = (int)GET(ptr, 1);
   3985     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
   3986     }
   3987 
   3988   ptr += 1 + LINK_SIZE;
   3989   }
   3990 }
   3991 
   3992 
   3993 
   3994 /*************************************************
   3995 *        Insert an automatic callout point       *
   3996 *************************************************/
   3997 
   3998 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
   3999 callout points before each pattern item.
   4000 
   4001 Arguments:
   4002   code           current code pointer
   4003   ptr            current pattern pointer
   4004   cd             pointers to tables etc
   4005 
   4006 Returns:         new code pointer
   4007 */
   4008 
   4009 static pcre_uchar *
   4010 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
   4011 {
   4012 *code++ = OP_CALLOUT;
   4013 *code++ = 255;
   4014 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
   4015 PUT(code, LINK_SIZE, 0);                       /* Default length */
   4016 return code + 2 * LINK_SIZE;
   4017 }
   4018 
   4019 
   4020 
   4021 /*************************************************
   4022 *         Complete a callout item                *
   4023 *************************************************/
   4024 
   4025 /* A callout item contains the length of the next item in the pattern, which
   4026 we can't fill in till after we have reached the relevant point. This is used
   4027 for both automatic and manual callouts.
   4028 
   4029 Arguments:
   4030   previous_callout   points to previous callout item
   4031   ptr                current pattern pointer
   4032   cd                 pointers to tables etc
   4033 
   4034 Returns:             nothing
   4035 */
   4036 
   4037 static void
   4038 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
   4039 {
   4040 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
   4041 PUT(previous_callout, 2 + LINK_SIZE, length);
   4042 }
   4043 
   4044 
   4045 
   4046 #ifdef SUPPORT_UCP
   4047 /*************************************************
   4048 *           Get othercase range                  *
   4049 *************************************************/
   4050 
   4051 /* This function is passed the start and end of a class range, in UTF-8 mode
   4052 with UCP support. It searches up the characters, looking for ranges of
   4053 characters in the "other" case. Each call returns the next one, updating the
   4054 start address. A character with multiple other cases is returned on its own
   4055 with a special return value.
   4056 
   4057 Arguments:
   4058   cptr        points to starting character value; updated
   4059   d           end value
   4060   ocptr       where to put start of othercase range
   4061   odptr       where to put end of othercase range
   4062 
   4063 Yield:        -1 when no more
   4064                0 when a range is returned
   4065               >0 the CASESET offset for char with multiple other cases
   4066                 in this case, ocptr contains the original
   4067 */
   4068 
   4069 static int
   4070 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
   4071   pcre_uint32 *odptr)
   4072 {
   4073 pcre_uint32 c, othercase, next;
   4074 unsigned int co;
   4075 
   4076 /* Find the first character that has an other case. If it has multiple other
   4077 cases, return its case offset value. */
   4078 
   4079 for (c = *cptr; c <= d; c++)
   4080   {
   4081   if ((co = UCD_CASESET(c)) != 0)
   4082     {
   4083     *ocptr = c++;   /* Character that has the set */
   4084     *cptr = c;      /* Rest of input range */
   4085     return (int)co;
   4086     }
   4087   if ((othercase = UCD_OTHERCASE(c)) != c) break;
   4088   }
   4089 
   4090 if (c > d) return -1;  /* Reached end of range */
   4091 
   4092 /* Found a character that has a single other case. Search for the end of the
   4093 range, which is either the end of the input range, or a character that has zero
   4094 or more than one other cases. */
   4095 
   4096 *ocptr = othercase;
   4097 next = othercase + 1;
   4098 
   4099 for (++c; c <= d; c++)
   4100   {
   4101   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
   4102   next++;
   4103   }
   4104 
   4105 *odptr = next - 1;     /* End of othercase range */
   4106 *cptr = c;             /* Rest of input range */
   4107 return 0;
   4108 }
   4109 #endif  /* SUPPORT_UCP */
   4110 
   4111 
   4112 
   4113 /*************************************************
   4114 *        Add a character or range to a class     *
   4115 *************************************************/
   4116 
   4117 /* This function packages up the logic of adding a character or range of
   4118 characters to a class. The character values in the arguments will be within the
   4119 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
   4120 mutually recursive with the function immediately below.
   4121 
   4122 Arguments:
   4123   classbits     the bit map for characters < 256
   4124   uchardptr     points to the pointer for extra data
   4125   options       the options word
   4126   cd            contains pointers to tables etc.
   4127   start         start of range character
   4128   end           end of range character
   4129 
   4130 Returns:        the number of < 256 characters added
   4131                 the pointer to extra data is updated
   4132 */
   4133 
   4134 static int
   4135 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
   4136   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
   4137 {
   4138 pcre_uint32 c;
   4139 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
   4140 int n8 = 0;
   4141 
   4142 /* If caseless matching is required, scan the range and process alternate
   4143 cases. In Unicode, there are 8-bit characters that have alternate cases that
   4144 are greater than 255 and vice-versa. Sometimes we can just extend the original
   4145 range. */
   4146 
   4147 if ((options & PCRE_CASELESS) != 0)
   4148   {
   4149 #ifdef SUPPORT_UCP
   4150   if ((options & PCRE_UTF8) != 0)
   4151     {
   4152     int rc;
   4153     pcre_uint32 oc, od;
   4154 
   4155     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
   4156     c = start;
   4157 
   4158     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
   4159       {
   4160       /* Handle a single character that has more than one other case. */
   4161 
   4162       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
   4163         PRIV(ucd_caseless_sets) + rc, oc);
   4164 
   4165       /* Do nothing if the other case range is within the original range. */
   4166 
   4167       else if (oc >= start && od <= end) continue;
   4168 
   4169       /* Extend the original range if there is overlap, noting that if oc < c, we
   4170       can't have od > end because a subrange is always shorter than the basic
   4171       range. Otherwise, use a recursive call to add the additional range. */
   4172 
   4173       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
   4174       else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
   4175       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
   4176       }
   4177     }
   4178   else
   4179 #endif  /* SUPPORT_UCP */
   4180 
   4181   /* Not UTF-mode, or no UCP */
   4182 
   4183   for (c = start; c <= classbits_end; c++)
   4184     {
   4185     SETBIT(classbits, cd->fcc[c]);
   4186     n8++;
   4187     }
   4188   }
   4189 
   4190 /* Now handle the original range. Adjust the final value according to the bit
   4191 length - this means that the same lists of (e.g.) horizontal spaces can be used
   4192 in all cases. */
   4193 
   4194 #if defined COMPILE_PCRE8
   4195 #ifdef SUPPORT_UTF
   4196   if ((options & PCRE_UTF8) == 0)
   4197 #endif
   4198   if (end > 0xff) end = 0xff;
   4199 
   4200 #elif defined COMPILE_PCRE16
   4201 #ifdef SUPPORT_UTF
   4202   if ((options & PCRE_UTF16) == 0)
   4203 #endif
   4204   if (end > 0xffff) end = 0xffff;
   4205 
   4206 #endif /* COMPILE_PCRE[8|16] */
   4207 
   4208 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
   4209 
   4210 for (c = start; c <= classbits_end; c++)
   4211   {
   4212   /* Regardless of start, c will always be <= 255. */
   4213   SETBIT(classbits, c);
   4214   n8++;
   4215   }
   4216 
   4217 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4218 if (start <= 0xff) start = 0xff + 1;
   4219 
   4220 if (end >= start)
   4221   {
   4222   pcre_uchar *uchardata = *uchardptr;
   4223 #ifdef SUPPORT_UTF
   4224   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
   4225     {
   4226     if (start < end)
   4227       {
   4228       *uchardata++ = XCL_RANGE;
   4229       uchardata += PRIV(ord2utf)(start, uchardata);
   4230       uchardata += PRIV(ord2utf)(end, uchardata);
   4231       }
   4232     else if (start == end)
   4233       {
   4234       *uchardata++ = XCL_SINGLE;
   4235       uchardata += PRIV(ord2utf)(start, uchardata);
   4236       }
   4237     }
   4238   else
   4239 #endif  /* SUPPORT_UTF */
   4240 
   4241   /* Without UTF support, character values are constrained by the bit length,
   4242   and can only be > 256 for 16-bit and 32-bit libraries. */
   4243 
   4244 #ifdef COMPILE_PCRE8
   4245     {}
   4246 #else
   4247   if (start < end)
   4248     {
   4249     *uchardata++ = XCL_RANGE;
   4250     *uchardata++ = start;
   4251     *uchardata++ = end;
   4252     }
   4253   else if (start == end)
   4254     {
   4255     *uchardata++ = XCL_SINGLE;
   4256     *uchardata++ = start;
   4257     }
   4258 #endif
   4259 
   4260   *uchardptr = uchardata;   /* Updata extra data pointer */
   4261   }
   4262 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
   4263 
   4264 return n8;    /* Number of 8-bit characters */
   4265 }
   4266 
   4267 
   4268 
   4269 
   4270 /*************************************************
   4271 *        Add a list of characters to a class     *
   4272 *************************************************/
   4273 
   4274 /* This function is used for adding a list of case-equivalent characters to a
   4275 class, and also for adding a list of horizontal or vertical whitespace. If the
   4276 list is in order (which it should be), ranges of characters are detected and
   4277 handled appropriately. This function is mutually recursive with the function
   4278 above.
   4279 
   4280 Arguments:
   4281   classbits     the bit map for characters < 256
   4282   uchardptr     points to the pointer for extra data
   4283   options       the options word
   4284   cd            contains pointers to tables etc.
   4285   p             points to row of 32-bit values, terminated by NOTACHAR
   4286   except        character to omit; this is used when adding lists of
   4287                   case-equivalent characters to avoid including the one we
   4288                   already know about
   4289 
   4290 Returns:        the number of < 256 characters added
   4291                 the pointer to extra data is updated
   4292 */
   4293 
   4294 static int
   4295 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
   4296   compile_data *cd, const pcre_uint32 *p, unsigned int except)
   4297 {
   4298 int n8 = 0;
   4299 while (p[0] < NOTACHAR)
   4300   {
   4301   int n = 0;
   4302   if (p[0] != except)
   4303     {
   4304     while(p[n+1] == p[0] + n + 1) n++;
   4305     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
   4306     }
   4307   p += n + 1;
   4308   }
   4309 return n8;
   4310 }
   4311 
   4312 
   4313 
   4314 /*************************************************
   4315 *    Add characters not in a list to a class     *
   4316 *************************************************/
   4317 
   4318 /* This function is used for adding the complement of a list of horizontal or
   4319 vertical whitespace to a class. The list must be in order.
   4320 
   4321 Arguments:
   4322   classbits     the bit map for characters < 256
   4323   uchardptr     points to the pointer for extra data
   4324   options       the options word
   4325   cd            contains pointers to tables etc.
   4326   p             points to row of 32-bit values, terminated by NOTACHAR
   4327 
   4328 Returns:        the number of < 256 characters added
   4329                 the pointer to extra data is updated
   4330 */
   4331 
   4332 static int
   4333 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
   4334   int options, compile_data *cd, const pcre_uint32 *p)
   4335 {
   4336 BOOL utf = (options & PCRE_UTF8) != 0;
   4337 int n8 = 0;
   4338 if (p[0] > 0)
   4339   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
   4340 while (p[0] < NOTACHAR)
   4341   {
   4342   while (p[1] == p[0] + 1) p++;
   4343   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
   4344     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
   4345   p++;
   4346   }
   4347 return n8;
   4348 }
   4349 
   4350 
   4351 
   4352 /*************************************************
   4353 *           Compile one branch                   *
   4354 *************************************************/
   4355 
   4356 /* Scan the pattern, compiling it into a vector. If the options are
   4357 changed during the branch, the pointer is used to change the external options
   4358 bits. This function is used during the pre-compile phase when we are trying
   4359 to find out the amount of memory needed, as well as during the real compile
   4360 phase. The value of lengthptr distinguishes the two phases.
   4361 
   4362 Arguments:
   4363   optionsptr        pointer to the option bits
   4364   codeptr           points to the pointer to the current code point
   4365   ptrptr            points to the current pattern pointer
   4366   errorcodeptr      points to error code variable
   4367   firstcharptr      place to put the first required character
   4368   firstcharflagsptr place to put the first character flags, or a negative number
   4369   reqcharptr        place to put the last required character
   4370   reqcharflagsptr   place to put the last required character flags, or a negative number
   4371   bcptr             points to current branch chain
   4372   cond_depth        conditional nesting depth
   4373   cd                contains pointers to tables etc.
   4374   lengthptr         NULL during the real compile phase
   4375                     points to length accumulator during pre-compile phase
   4376 
   4377 Returns:            TRUE on success
   4378                     FALSE, with *errorcodeptr set non-zero on error
   4379 */
   4380 
   4381 static BOOL
   4382 compile_branch(int *optionsptr, pcre_uchar **codeptr,
   4383   const pcre_uchar **ptrptr, int *errorcodeptr,
   4384   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
   4385   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
   4386   branch_chain *bcptr, int cond_depth,
   4387   compile_data *cd, int *lengthptr)
   4388 {
   4389 int repeat_type, op_type;
   4390 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
   4391 int bravalue = 0;
   4392 int greedy_default, greedy_non_default;
   4393 pcre_uint32 firstchar, reqchar;
   4394 pcre_int32 firstcharflags, reqcharflags;
   4395 pcre_uint32 zeroreqchar, zerofirstchar;
   4396 pcre_int32 zeroreqcharflags, zerofirstcharflags;
   4397 pcre_int32 req_caseopt, reqvary, tempreqvary;
   4398 int options = *optionsptr;               /* May change dynamically */
   4399 int after_manual_callout = 0;
   4400 int length_prevgroup = 0;
   4401 register pcre_uint32 c;
   4402 int escape;
   4403 register pcre_uchar *code = *codeptr;
   4404 pcre_uchar *last_code = code;
   4405 pcre_uchar *orig_code = code;
   4406 pcre_uchar *tempcode;
   4407 BOOL inescq = FALSE;
   4408 BOOL groupsetfirstchar = FALSE;
   4409 const pcre_uchar *ptr = *ptrptr;
   4410 const pcre_uchar *tempptr;
   4411 const pcre_uchar *nestptr = NULL;
   4412 pcre_uchar *previous = NULL;
   4413 pcre_uchar *previous_callout = NULL;
   4414 pcre_uchar *save_hwm = NULL;
   4415 pcre_uint8 classbits[32];
   4416 
   4417 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
   4418 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
   4419 dynamically as we process the pattern. */
   4420 
   4421 #ifdef SUPPORT_UTF
   4422 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
   4423 BOOL utf = (options & PCRE_UTF8) != 0;
   4424 #ifndef COMPILE_PCRE32
   4425 pcre_uchar utf_chars[6];
   4426 #endif
   4427 #else
   4428 BOOL utf = FALSE;
   4429 #endif
   4430 
   4431 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
   4432 class_uchardata always so that it can be passed to add_to_class() always,
   4433 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
   4434 alternative calls for the different cases. */
   4435 
   4436 pcre_uchar *class_uchardata;
   4437 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4438 BOOL xclass;
   4439 pcre_uchar *class_uchardata_base;
   4440 #endif
   4441 
   4442 #ifdef PCRE_DEBUG
   4443 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
   4444 #endif
   4445 
   4446 /* Set up the default and non-default settings for greediness */
   4447 
   4448 greedy_default = ((options & PCRE_UNGREEDY) != 0);
   4449 greedy_non_default = greedy_default ^ 1;
   4450 
   4451 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
   4452 matching encountered yet". It gets changed to REQ_NONE if we hit something that
   4453 matches a non-fixed char first char; reqchar just remains unset if we never
   4454 find one.
   4455 
   4456 When we hit a repeat whose minimum is zero, we may have to adjust these values
   4457 to take the zero repeat into account. This is implemented by setting them to
   4458 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
   4459 item types that can be repeated set these backoff variables appropriately. */
   4460 
   4461 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
   4462 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
   4463 
   4464 /* The variable req_caseopt contains either the REQ_CASELESS value
   4465 or zero, according to the current setting of the caseless flag. The
   4466 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
   4467 firstchar or reqchar variables to record the case status of the
   4468 value. This is used only for ASCII characters. */
   4469 
   4470 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
   4471 
   4472 /* Switch on next character until the end of the branch */
   4473 
   4474 for (;; ptr++)
   4475   {
   4476   BOOL negate_class;
   4477   BOOL should_flip_negation;
   4478   BOOL possessive_quantifier;
   4479   BOOL is_quantifier;
   4480   BOOL is_recurse;
   4481   BOOL reset_bracount;
   4482   int class_has_8bitchar;
   4483   int class_one_char;
   4484 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4485   BOOL xclass_has_prop;
   4486 #endif
   4487   int newoptions;
   4488   int recno;
   4489   int refsign;
   4490   int skipbytes;
   4491   pcre_uint32 subreqchar, subfirstchar;
   4492   pcre_int32 subreqcharflags, subfirstcharflags;
   4493   int terminator;
   4494   unsigned int mclength;
   4495   unsigned int tempbracount;
   4496   pcre_uint32 ec;
   4497   pcre_uchar mcbuffer[8];
   4498 
   4499   /* Get next character in the pattern */
   4500 
   4501   c = *ptr;
   4502 
   4503   /* If we are at the end of a nested substitution, revert to the outer level
   4504   string. Nesting only happens one level deep. */
   4505 
   4506   if (c == CHAR_NULL && nestptr != NULL)
   4507     {
   4508     ptr = nestptr;
   4509     nestptr = NULL;
   4510     c = *ptr;
   4511     }
   4512 
   4513   /* If we are in the pre-compile phase, accumulate the length used for the
   4514   previous cycle of this loop. */
   4515 
   4516   if (lengthptr != NULL)
   4517     {
   4518 #ifdef PCRE_DEBUG
   4519     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
   4520 #endif
   4521     if (code > cd->start_workspace + cd->workspace_size -
   4522         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
   4523       {
   4524       *errorcodeptr = ERR52;
   4525       goto FAILED;
   4526       }
   4527 
   4528     /* There is at least one situation where code goes backwards: this is the
   4529     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
   4530     the class is simply eliminated. However, it is created first, so we have to
   4531     allow memory for it. Therefore, don't ever reduce the length at this point.
   4532     */
   4533 
   4534     if (code < last_code) code = last_code;
   4535 
   4536     /* Paranoid check for integer overflow */
   4537 
   4538     if (OFLOW_MAX - *lengthptr < code - last_code)
   4539       {
   4540       *errorcodeptr = ERR20;
   4541       goto FAILED;
   4542       }
   4543 
   4544     *lengthptr += (int)(code - last_code);
   4545     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
   4546       (int)(code - last_code), c, c));
   4547 
   4548     /* If "previous" is set and it is not at the start of the work space, move
   4549     it back to there, in order to avoid filling up the work space. Otherwise,
   4550     if "previous" is NULL, reset the current code pointer to the start. */
   4551 
   4552     if (previous != NULL)
   4553       {
   4554       if (previous > orig_code)
   4555         {
   4556         memmove(orig_code, previous, IN_UCHARS(code - previous));
   4557         code -= previous - orig_code;
   4558         previous = orig_code;
   4559         }
   4560       }
   4561     else code = orig_code;
   4562 
   4563     /* Remember where this code item starts so we can pick up the length
   4564     next time round. */
   4565 
   4566     last_code = code;
   4567     }
   4568 
   4569   /* In the real compile phase, just check the workspace used by the forward
   4570   reference list. */
   4571 
   4572   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
   4573            WORK_SIZE_SAFETY_MARGIN)
   4574     {
   4575     *errorcodeptr = ERR52;
   4576     goto FAILED;
   4577     }
   4578 
   4579   /* If in \Q...\E, check for the end; if not, we have a literal */
   4580 
   4581   if (inescq && c != CHAR_NULL)
   4582     {
   4583     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   4584       {
   4585       inescq = FALSE;
   4586       ptr++;
   4587       continue;
   4588       }
   4589     else
   4590       {
   4591       if (previous_callout != NULL)
   4592         {
   4593         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
   4594           complete_callout(previous_callout, ptr, cd);
   4595         previous_callout = NULL;
   4596         }
   4597       if ((options & PCRE_AUTO_CALLOUT) != 0)
   4598         {
   4599         previous_callout = code;
   4600         code = auto_callout(code, ptr, cd);
   4601         }
   4602       goto NORMAL_CHAR;
   4603       }
   4604     /* Control does not reach here. */
   4605     }
   4606 
   4607   /* In extended mode, skip white space and comments. We need a loop in order
   4608   to check for more white space and more comments after a comment. */
   4609 
   4610   if ((options & PCRE_EXTENDED) != 0)
   4611     {
   4612     for (;;)
   4613       {
   4614       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
   4615       if (c != CHAR_NUMBER_SIGN) break;
   4616       ptr++;
   4617       while (*ptr != CHAR_NULL)
   4618         {
   4619         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
   4620           {                          /* IS_NEWLINE sets cd->nllen. */
   4621           ptr += cd->nllen;
   4622           break;
   4623           }
   4624         ptr++;
   4625 #ifdef SUPPORT_UTF
   4626         if (utf) FORWARDCHAR(ptr);
   4627 #endif
   4628         }
   4629       c = *ptr;     /* Either NULL or the char after a newline */
   4630       }
   4631     }
   4632 
   4633   /* See if the next thing is a quantifier. */
   4634 
   4635   is_quantifier =
   4636     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
   4637     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
   4638 
   4639   /* Fill in length of a previous callout, except when the next thing is a
   4640   quantifier or when processing a property substitution string in UCP mode. */
   4641 
   4642   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
   4643        after_manual_callout-- <= 0)
   4644     {
   4645     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
   4646       complete_callout(previous_callout, ptr, cd);
   4647     previous_callout = NULL;
   4648     }
   4649 
   4650   /* Create auto callout, except for quantifiers, or while processing property
   4651   strings that are substituted for \w etc in UCP mode. */
   4652 
   4653   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
   4654     {
   4655     previous_callout = code;
   4656     code = auto_callout(code, ptr, cd);
   4657     }
   4658 
   4659   /* Process the next pattern item. */
   4660 
   4661   switch(c)
   4662     {
   4663     /* ===================================================================*/
   4664     case CHAR_NULL:                /* The branch terminates at string end */
   4665     case CHAR_VERTICAL_LINE:       /* or | or ) */
   4666     case CHAR_RIGHT_PARENTHESIS:
   4667     *firstcharptr = firstchar;
   4668     *firstcharflagsptr = firstcharflags;
   4669     *reqcharptr = reqchar;
   4670     *reqcharflagsptr = reqcharflags;
   4671     *codeptr = code;
   4672     *ptrptr = ptr;
   4673     if (lengthptr != NULL)
   4674       {
   4675       if (OFLOW_MAX - *lengthptr < code - last_code)
   4676         {
   4677         *errorcodeptr = ERR20;
   4678         goto FAILED;
   4679         }
   4680       *lengthptr += (int)(code - last_code);   /* To include callout length */
   4681       DPRINTF((">> end branch\n"));
   4682       }
   4683     return TRUE;
   4684 
   4685 
   4686     /* ===================================================================*/
   4687     /* Handle single-character metacharacters. In multiline mode, ^ disables
   4688     the setting of any following char as a first character. */
   4689 
   4690     case CHAR_CIRCUMFLEX_ACCENT:
   4691     previous = NULL;
   4692     if ((options & PCRE_MULTILINE) != 0)
   4693       {
   4694       if (firstcharflags == REQ_UNSET)
   4695         zerofirstcharflags = firstcharflags = REQ_NONE;
   4696       *code++ = OP_CIRCM;
   4697       }
   4698     else *code++ = OP_CIRC;
   4699     break;
   4700 
   4701     case CHAR_DOLLAR_SIGN:
   4702     previous = NULL;
   4703     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
   4704     break;
   4705 
   4706     /* There can never be a first char if '.' is first, whatever happens about
   4707     repeats. The value of reqchar doesn't change either. */
   4708 
   4709     case CHAR_DOT:
   4710     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   4711     zerofirstchar = firstchar;
   4712     zerofirstcharflags = firstcharflags;
   4713     zeroreqchar = reqchar;
   4714     zeroreqcharflags = reqcharflags;
   4715     previous = code;
   4716     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
   4717     break;
   4718 
   4719 
   4720     /* ===================================================================*/
   4721     /* Character classes. If the included characters are all < 256, we build a
   4722     32-byte bitmap of the permitted characters, except in the special case
   4723     where there is only one such character. For negated classes, we build the
   4724     map as usual, then invert it at the end. However, we use a different opcode
   4725     so that data characters > 255 can be handled correctly.
   4726 
   4727     If the class contains characters outside the 0-255 range, a different
   4728     opcode is compiled. It may optionally have a bit map for characters < 256,
   4729 but those above are explicitly listed afterwards. A flag byte tells
   4730     whether the bitmap is present, and whether this is a negated class or not.
   4731 
   4732     In JavaScript compatibility mode, an isolated ']' causes an error. In
   4733     default (Perl) mode, it is treated as a data character. */
   4734 
   4735     case CHAR_RIGHT_SQUARE_BRACKET:
   4736     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
   4737       {
   4738       *errorcodeptr = ERR64;
   4739       goto FAILED;
   4740       }
   4741     goto NORMAL_CHAR;
   4742 
   4743     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
   4744     used for "start of word" and "end of word". As these are otherwise illegal
   4745     sequences, we don't break anything by recognizing them. They are replaced
   4746     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
   4747     erroneous and are handled by the normal code below. */
   4748 
   4749     case CHAR_LEFT_SQUARE_BRACKET:
   4750     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
   4751       {
   4752       nestptr = ptr + 7;
   4753       ptr = sub_start_of_word - 1;
   4754       continue;
   4755       }
   4756 
   4757     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
   4758       {
   4759       nestptr = ptr + 7;
   4760       ptr = sub_end_of_word - 1;
   4761       continue;
   4762       }
   4763 
   4764     /* Handle a real character class. */
   4765 
   4766     previous = code;
   4767 
   4768     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
   4769     they are encountered at the top level, so we'll do that too. */
   4770 
   4771     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   4772          ptr[1] == CHAR_EQUALS_SIGN) &&
   4773         check_posix_syntax(ptr, &tempptr))
   4774       {
   4775       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
   4776       goto FAILED;
   4777       }
   4778 
   4779     /* If the first character is '^', set the negation flag and skip it. Also,
   4780     if the first few characters (either before or after ^) are \Q\E or \E we
   4781     skip them too. This makes for compatibility with Perl. */
   4782 
   4783     negate_class = FALSE;
   4784     for (;;)
   4785       {
   4786       c = *(++ptr);
   4787       if (c == CHAR_BACKSLASH)
   4788         {
   4789         if (ptr[1] == CHAR_E)
   4790           ptr++;
   4791         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
   4792           ptr += 3;
   4793         else
   4794           break;
   4795         }
   4796       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
   4797         negate_class = TRUE;
   4798       else break;
   4799       }
   4800 
   4801     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
   4802     an initial ']' is taken as a data character -- the code below handles
   4803     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
   4804     [^] must match any character, so generate OP_ALLANY. */
   4805 
   4806     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
   4807         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
   4808       {
   4809       *code++ = negate_class? OP_ALLANY : OP_FAIL;
   4810       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   4811       zerofirstchar = firstchar;
   4812       zerofirstcharflags = firstcharflags;
   4813       break;
   4814       }
   4815 
   4816     /* If a class contains a negative special such as \S, we need to flip the
   4817     negation flag at the end, so that support for characters > 255 works
   4818     correctly (they are all included in the class). */
   4819 
   4820     should_flip_negation = FALSE;
   4821 
   4822     /* Extended class (xclass) will be used when characters > 255
   4823     might match. */
   4824 
   4825 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4826     xclass = FALSE;
   4827     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
   4828     class_uchardata_base = class_uchardata;   /* Save the start */
   4829 #endif
   4830 
   4831     /* For optimization purposes, we track some properties of the class:
   4832     class_has_8bitchar will be non-zero if the class contains at least one <
   4833     256 character; class_one_char will be 1 if the class contains just one
   4834     character; xclass_has_prop will be TRUE if unicode property checks
   4835     are present in the class. */
   4836 
   4837     class_has_8bitchar = 0;
   4838     class_one_char = 0;
   4839 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4840     xclass_has_prop = FALSE;
   4841 #endif
   4842 
   4843     /* Initialize the 32-char bit map to all zeros. We build the map in a
   4844     temporary bit of memory, in case the class contains fewer than two
   4845     8-bit characters because in that case the compiled code doesn't use the bit
   4846     map. */
   4847 
   4848     memset(classbits, 0, 32 * sizeof(pcre_uint8));
   4849 
   4850     /* Process characters until ] is reached. By writing this as a "do" it
   4851     means that an initial ] is taken as a data character. At the start of the
   4852     loop, c contains the first byte of the character. */
   4853 
   4854     if (c != CHAR_NULL) do
   4855       {
   4856       const pcre_uchar *oldptr;
   4857 
   4858 #ifdef SUPPORT_UTF
   4859       if (utf && HAS_EXTRALEN(c))
   4860         {                           /* Braces are required because the */
   4861         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
   4862         }
   4863 #endif
   4864 
   4865 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4866       /* In the pre-compile phase, accumulate the length of any extra
   4867       data and reset the pointer. This is so that very large classes that
   4868       contain a zillion > 255 characters no longer overwrite the work space
   4869       (which is on the stack). We have to remember that there was XCLASS data,
   4870       however. */
   4871 
   4872       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
   4873         {
   4874         xclass = TRUE;
   4875         *lengthptr += (int)(class_uchardata - class_uchardata_base);
   4876         class_uchardata = class_uchardata_base;
   4877         }
   4878 #endif
   4879 
   4880       /* Inside \Q...\E everything is literal except \E */
   4881 
   4882       if (inescq)
   4883         {
   4884         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
   4885           {
   4886           inescq = FALSE;                   /* Reset literal state */
   4887           ptr++;                            /* Skip the 'E' */
   4888           continue;                         /* Carry on with next */
   4889           }
   4890         goto CHECK_RANGE;                   /* Could be range if \E follows */
   4891         }
   4892 
   4893       /* Handle POSIX class names. Perl allows a negation extension of the
   4894       form [:^name:]. A square bracket that doesn't match the syntax is
   4895       treated as a literal. We also recognize the POSIX constructions
   4896       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
   4897       5.6 and 5.8 do. */
   4898 
   4899       if (c == CHAR_LEFT_SQUARE_BRACKET &&
   4900           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   4901            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
   4902         {
   4903         BOOL local_negate = FALSE;
   4904         int posix_class, taboffset, tabopt;
   4905         register const pcre_uint8 *cbits = cd->cbits;
   4906         pcre_uint8 pbits[32];
   4907 
   4908         if (ptr[1] != CHAR_COLON)
   4909           {
   4910           *errorcodeptr = ERR31;
   4911           goto FAILED;
   4912           }
   4913 
   4914         ptr += 2;
   4915         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
   4916           {
   4917           local_negate = TRUE;
   4918           should_flip_negation = TRUE;  /* Note negative special */
   4919           ptr++;
   4920           }
   4921 
   4922         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
   4923         if (posix_class < 0)
   4924           {
   4925           *errorcodeptr = ERR30;
   4926           goto FAILED;
   4927           }
   4928 
   4929         /* If matching is caseless, upper and lower are converted to
   4930         alpha. This relies on the fact that the class table starts with
   4931         alpha, lower, upper as the first 3 entries. */
   4932 
   4933         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
   4934           posix_class = 0;
   4935 
   4936         /* When PCRE_UCP is set, some of the POSIX classes are converted to
   4937         different escape sequences that use Unicode properties \p or \P. Others
   4938         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
   4939         directly. */
   4940 
   4941 #ifdef SUPPORT_UCP
   4942         if ((options & PCRE_UCP) != 0)
   4943           {
   4944           unsigned int ptype = 0;
   4945           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
   4946 
   4947           /* The posix_substitutes table specifies which POSIX classes can be
   4948           converted to \p or \P items. */
   4949 
   4950           if (posix_substitutes[pc] != NULL)
   4951             {
   4952             nestptr = tempptr + 1;
   4953             ptr = posix_substitutes[pc] - 1;
   4954             continue;
   4955             }
   4956 
   4957           /* There are three other classes that generate special property calls
   4958           that are recognized only in an XCLASS. */
   4959 
   4960           else switch(posix_class)
   4961             {
   4962             case PC_GRAPH:
   4963             ptype = PT_PXGRAPH;
   4964             /* Fall through */
   4965             case PC_PRINT:
   4966             if (ptype == 0) ptype = PT_PXPRINT;
   4967             /* Fall through */
   4968             case PC_PUNCT:
   4969             if (ptype == 0) ptype = PT_PXPUNCT;
   4970             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
   4971             *class_uchardata++ = ptype;
   4972             *class_uchardata++ = 0;
   4973             xclass_has_prop = TRUE;
   4974             ptr = tempptr + 1;
   4975             continue;
   4976 
   4977             /* For all other POSIX classes, no special action is taken in UCP
   4978             mode. Fall through to the non_UCP case. */
   4979 
   4980             default:
   4981             break;
   4982             }
   4983           }
   4984 #endif
   4985         /* In the non-UCP case, or when UCP makes no difference, we build the
   4986         bit map for the POSIX class in a chunk of local store because we may be
   4987         adding and subtracting from it, and we don't want to subtract bits that
   4988         may be in the main map already. At the end we or the result into the
   4989         bit map that is being built. */
   4990 
   4991         posix_class *= 3;
   4992 
   4993         /* Copy in the first table (always present) */
   4994 
   4995         memcpy(pbits, cbits + posix_class_maps[posix_class],
   4996           32 * sizeof(pcre_uint8));
   4997 
   4998         /* If there is a second table, add or remove it as required. */
   4999 
   5000         taboffset = posix_class_maps[posix_class + 1];
   5001         tabopt = posix_class_maps[posix_class + 2];
   5002 
   5003         if (taboffset >= 0)
   5004           {
   5005           if (tabopt >= 0)
   5006             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
   5007           else
   5008             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
   5009           }
   5010 
   5011         /* Now see if we need to remove any special characters. An option
   5012         value of 1 removes vertical space and 2 removes underscore. */
   5013 
   5014         if (tabopt < 0) tabopt = -tabopt;
   5015         if (tabopt == 1) pbits[1] &= ~0x3c;
   5016           else if (tabopt == 2) pbits[11] &= 0x7f;
   5017 
   5018         /* Add the POSIX table or its complement into the main table that is
   5019         being built and we are done. */
   5020 
   5021         if (local_negate)
   5022           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
   5023         else
   5024           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
   5025 
   5026         ptr = tempptr + 1;
   5027         /* Every class contains at least one < 256 character. */
   5028         class_has_8bitchar = 1;
   5029         /* Every class contains at least two characters. */
   5030         class_one_char = 2;
   5031         continue;    /* End of POSIX syntax handling */
   5032         }
   5033 
   5034       /* Backslash may introduce a single character, or it may introduce one
   5035       of the specials, which just set a flag. The sequence \b is a special
   5036       case. Inside a class (and only there) it is treated as backspace. We
   5037       assume that other escapes have more than one character in them, so
   5038       speculatively set both class_has_8bitchar and class_one_char bigger
   5039       than one. Unrecognized escapes fall through and are either treated
   5040       as literal characters (by default), or are faulted if
   5041       PCRE_EXTRA is set. */
   5042 
   5043       if (c == CHAR_BACKSLASH)
   5044         {
   5045         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
   5046           TRUE);
   5047         if (*errorcodeptr != 0) goto FAILED;
   5048         if (escape == 0) c = ec;
   5049         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
   5050         else if (escape == ESC_N)          /* \N is not supported in a class */
   5051           {
   5052           *errorcodeptr = ERR71;
   5053           goto FAILED;
   5054           }
   5055         else if (escape == ESC_Q)            /* Handle start of quoted string */
   5056           {
   5057           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   5058             {
   5059             ptr += 2; /* avoid empty string */
   5060             }
   5061           else inescq = TRUE;
   5062           continue;
   5063           }
   5064         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
   5065 
   5066         else
   5067           {
   5068           register const pcre_uint8 *cbits = cd->cbits;
   5069           /* Every class contains at least two < 256 characters. */
   5070           class_has_8bitchar++;
   5071           /* Every class contains at least two characters. */
   5072           class_one_char += 2;
   5073 
   5074           switch (escape)
   5075             {
   5076 #ifdef SUPPORT_UCP
   5077             case ESC_du:     /* These are the values given for \d etc */
   5078             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
   5079             case ESC_wu:     /* escape sequence with an appropriate \p */
   5080             case ESC_WU:     /* or \P to test Unicode properties instead */
   5081             case ESC_su:     /* of the default ASCII testing. */
   5082             case ESC_SU:
   5083             nestptr = ptr;
   5084             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
   5085             class_has_8bitchar--;                /* Undo! */
   5086             continue;
   5087 #endif
   5088             case ESC_d:
   5089             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
   5090             continue;
   5091 
   5092             case ESC_D:
   5093             should_flip_negation = TRUE;
   5094             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
   5095             continue;
   5096 
   5097             case ESC_w:
   5098             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
   5099             continue;
   5100 
   5101             case ESC_W:
   5102             should_flip_negation = TRUE;
   5103             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
   5104             continue;
   5105 
   5106             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
   5107             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
   5108             previously set by something earlier in the character class.
   5109             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
   5110             we could just adjust the appropriate bit. From PCRE 8.34 we no
   5111             longer treat \s and \S specially. */
   5112 
   5113             case ESC_s:
   5114             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
   5115             continue;
   5116 
   5117             case ESC_S:
   5118             should_flip_negation = TRUE;
   5119             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
   5120             continue;
   5121 
   5122             /* The rest apply in both UCP and non-UCP cases. */
   5123 
   5124             case ESC_h:
   5125             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
   5126               PRIV(hspace_list), NOTACHAR);
   5127             continue;
   5128 
   5129             case ESC_H:
   5130             (void)add_not_list_to_class(classbits, &class_uchardata, options,
   5131               cd, PRIV(hspace_list));
   5132             continue;
   5133 
   5134             case ESC_v:
   5135             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
   5136               PRIV(vspace_list), NOTACHAR);
   5137             continue;
   5138 
   5139             case ESC_V:
   5140             (void)add_not_list_to_class(classbits, &class_uchardata, options,
   5141               cd, PRIV(vspace_list));
   5142             continue;
   5143 
   5144 #ifdef SUPPORT_UCP
   5145             case ESC_p:
   5146             case ESC_P:
   5147               {
   5148               BOOL negated;
   5149               unsigned int ptype = 0, pdata = 0;
   5150               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
   5151                 goto FAILED;
   5152               *class_uchardata++ = ((escape == ESC_p) != negated)?
   5153                 XCL_PROP : XCL_NOTPROP;
   5154               *class_uchardata++ = ptype;
   5155               *class_uchardata++ = pdata;
   5156               xclass_has_prop = TRUE;
   5157               class_has_8bitchar--;                /* Undo! */
   5158               continue;
   5159               }
   5160 #endif
   5161             /* Unrecognized escapes are faulted if PCRE is running in its
   5162             strict mode. By default, for compatibility with Perl, they are
   5163             treated as literals. */
   5164 
   5165             default:
   5166             if ((options & PCRE_EXTRA) != 0)
   5167               {
   5168               *errorcodeptr = ERR7;
   5169               goto FAILED;
   5170               }
   5171             class_has_8bitchar--;    /* Undo the speculative increase. */
   5172             class_one_char -= 2;     /* Undo the speculative increase. */
   5173             c = *ptr;                /* Get the final character and fall through */
   5174             break;
   5175             }
   5176           }
   5177 
   5178         /* Fall through if the escape just defined a single character (c >= 0).
   5179         This may be greater than 256. */
   5180 
   5181         escape = 0;
   5182 
   5183         }   /* End of backslash handling */
   5184 
   5185       /* A character may be followed by '-' to form a range. However, Perl does
   5186       not permit ']' to be the end of the range. A '-' character at the end is
   5187       treated as a literal. Perl ignores orphaned \E sequences entirely. The
   5188       code for handling \Q and \E is messy. */
   5189 
   5190       CHECK_RANGE:
   5191       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   5192         {
   5193         inescq = FALSE;
   5194         ptr += 2;
   5195         }
   5196       oldptr = ptr;
   5197 
   5198       /* Remember if \r or \n were explicitly used */
   5199 
   5200       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
   5201 
   5202       /* Check for range */
   5203 
   5204       if (!inescq && ptr[1] == CHAR_MINUS)
   5205         {
   5206         pcre_uint32 d;
   5207         ptr += 2;
   5208         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
   5209 
   5210         /* If we hit \Q (not followed by \E) at this point, go into escaped
   5211         mode. */
   5212 
   5213         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
   5214           {
   5215           ptr += 2;
   5216           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   5217             { ptr += 2; continue; }
   5218           inescq = TRUE;
   5219           break;
   5220           }
   5221 
   5222         /* Minus (hyphen) at the end of a class is treated as a literal, so put
   5223         back the pointer and jump to handle the character that preceded it. */
   5224 
   5225         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
   5226           {
   5227           ptr = oldptr;
   5228           goto CLASS_SINGLE_CHARACTER;
   5229           }
   5230 
   5231         /* Otherwise, we have a potential range; pick up the next character */
   5232 
   5233 #ifdef SUPPORT_UTF
   5234         if (utf)
   5235           {                           /* Braces are required because the */
   5236           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
   5237           }
   5238         else
   5239 #endif
   5240         d = *ptr;  /* Not UTF-8 mode */
   5241 
   5242         /* The second part of a range can be a single-character escape
   5243         sequence, but not any of the other escapes. Perl treats a hyphen as a
   5244         literal in such circumstances. However, in Perl's warning mode, a
   5245         warning is given, so PCRE now faults it as it is almost certainly a
   5246         mistake on the user's part. */
   5247 
   5248         if (!inescq)
   5249           {
   5250           if (d == CHAR_BACKSLASH)
   5251             {
   5252             int descape;
   5253             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
   5254             if (*errorcodeptr != 0) goto FAILED;
   5255 
   5256             /* 0 means a character was put into d; \b is backspace; any other
   5257             special causes an error. */
   5258 
   5259             if (descape != 0)
   5260               {
   5261               if (descape == ESC_b) d = CHAR_BS; else
   5262                 {
   5263                 *errorcodeptr = ERR83;
   5264                 goto FAILED;
   5265                 }
   5266               }
   5267             }
   5268 
   5269           /* A hyphen followed by a POSIX class is treated in the same way. */
   5270 
   5271           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
   5272                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   5273                     ptr[1] == CHAR_EQUALS_SIGN) &&
   5274                    check_posix_syntax(ptr, &tempptr))
   5275             {
   5276             *errorcodeptr = ERR83;
   5277             goto FAILED;
   5278             }
   5279           }
   5280 
   5281         /* Check that the two values are in the correct order. Optimize
   5282         one-character ranges. */
   5283 
   5284         if (d < c)
   5285           {
   5286           *errorcodeptr = ERR8;
   5287           goto FAILED;
   5288           }
   5289         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
   5290 
   5291         /* We have found a character range, so single character optimizations
   5292         cannot be done anymore. Any value greater than 1 indicates that there
   5293         is more than one character. */
   5294 
   5295         class_one_char = 2;
   5296 
   5297         /* Remember an explicit \r or \n, and add the range to the class. */
   5298 
   5299         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
   5300 
   5301         class_has_8bitchar +=
   5302           add_to_class(classbits, &class_uchardata, options, cd, c, d);
   5303 
   5304         continue;   /* Go get the next char in the class */
   5305         }
   5306 
   5307       /* Handle a single character - we can get here for a normal non-escape
   5308       char, or after \ that introduces a single character or for an apparent
   5309       range that isn't. Only the value 1 matters for class_one_char, so don't
   5310       increase it if it is already 2 or more ... just in case there's a class
   5311       with a zillion characters in it. */
   5312 
   5313       CLASS_SINGLE_CHARACTER:
   5314       if (class_one_char < 2) class_one_char++;
   5315 
   5316       /* If class_one_char is 1, we have the first single character in the
   5317       class, and there have been no prior ranges, or XCLASS items generated by
   5318       escapes. If this is the final character in the class, we can optimize by
   5319       turning the item into a 1-character OP_CHAR[I] if it's positive, or
   5320       OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
   5321       to be set. Otherwise, there can be no first char if this item is first,
   5322       whatever repeat count may follow. In the case of reqchar, save the
   5323       previous value for reinstating. */
   5324 
   5325       if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   5326         {
   5327         ptr++;
   5328         zeroreqchar = reqchar;
   5329         zeroreqcharflags = reqcharflags;
   5330 
   5331         if (negate_class)
   5332           {
   5333 #ifdef SUPPORT_UCP
   5334           int d;
   5335 #endif
   5336           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   5337           zerofirstchar = firstchar;
   5338           zerofirstcharflags = firstcharflags;
   5339 
   5340           /* For caseless UTF-8 mode when UCP support is available, check
   5341           whether this character has more than one other case. If so, generate
   5342           a special OP_NOTPROP item instead of OP_NOTI. */
   5343 
   5344 #ifdef SUPPORT_UCP
   5345           if (utf && (options & PCRE_CASELESS) != 0 &&
   5346               (d = UCD_CASESET(c)) != 0)
   5347             {
   5348             *code++ = OP_NOTPROP;
   5349             *code++ = PT_CLIST;
   5350             *code++ = d;
   5351             }
   5352           else
   5353 #endif
   5354           /* Char has only one other case, or UCP not available */
   5355 
   5356             {
   5357             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
   5358 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5359             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
   5360               code += PRIV(ord2utf)(c, code);
   5361             else
   5362 #endif
   5363               *code++ = c;
   5364             }
   5365 
   5366           /* We are finished with this character class */
   5367 
   5368           goto END_CLASS;
   5369           }
   5370 
   5371         /* For a single, positive character, get the value into mcbuffer, and
   5372         then we can handle this with the normal one-character code. */
   5373 
   5374 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5375         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
   5376           mclength = PRIV(ord2utf)(c, mcbuffer);
   5377         else
   5378 #endif
   5379           {
   5380           mcbuffer[0] = c;
   5381           mclength = 1;
   5382           }
   5383         goto ONE_CHAR;
   5384         }       /* End of 1-char optimization */
   5385 
   5386       /* There is more than one character in the class, or an XCLASS item
   5387       has been generated. Add this character to the class. */
   5388 
   5389       class_has_8bitchar +=
   5390         add_to_class(classbits, &class_uchardata, options, cd, c, c);
   5391       }
   5392 
   5393     /* Loop until ']' reached. This "while" is the end of the "do" far above.
   5394     If we are at the end of an internal nested string, revert to the outer
   5395     string. */
   5396 
   5397     while (((c = *(++ptr)) != CHAR_NULL ||
   5398            (nestptr != NULL &&
   5399              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
   5400            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
   5401 
   5402     /* Check for missing terminating ']' */
   5403 
   5404     if (c == CHAR_NULL)
   5405       {
   5406       *errorcodeptr = ERR6;
   5407       goto FAILED;
   5408       }
   5409 
   5410     /* We will need an XCLASS if data has been placed in class_uchardata. In
   5411     the second phase this is a sufficient test. However, in the pre-compile
   5412     phase, class_uchardata gets emptied to prevent workspace overflow, so
   5413     only if the very last character in the class needs XCLASS will it contain
   5414     anything at this point. For this reason, xclass gets set TRUE above when
   5415     class_uchardata is emptied, and that's why this code is the way it is here
   5416     instead of just doing a test on class_uchardata below. */
   5417 
   5418 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   5419     if (class_uchardata > class_uchardata_base) xclass = TRUE;
   5420 #endif
   5421 
   5422     /* If this is the first thing in the branch, there can be no first char
   5423     setting, whatever the repeat count. Any reqchar setting must remain
   5424     unchanged after any kind of repeat. */
   5425 
   5426     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   5427     zerofirstchar = firstchar;
   5428     zerofirstcharflags = firstcharflags;
   5429     zeroreqchar = reqchar;
   5430     zeroreqcharflags = reqcharflags;
   5431 
   5432     /* If there are characters with values > 255, we have to compile an
   5433     extended class, with its own opcode, unless there was a negated special
   5434     such as \S in the class, and PCRE_UCP is not set, because in that case all
   5435     characters > 255 are in the class, so any that were explicitly given as
   5436     well can be ignored. If (when there are explicit characters > 255 that must
   5437     be listed) there are no characters < 256, we can omit the bitmap in the
   5438     actual compiled code. */
   5439 
   5440 #ifdef SUPPORT_UTF
   5441     if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
   5442 #elif !defined COMPILE_PCRE8
   5443     if (xclass && !should_flip_negation)
   5444 #endif
   5445 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   5446       {
   5447       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
   5448       *code++ = OP_XCLASS;
   5449       code += LINK_SIZE;
   5450       *code = negate_class? XCL_NOT:0;
   5451       if (xclass_has_prop) *code |= XCL_HASPROP;
   5452 
   5453       /* If the map is required, move up the extra data to make room for it;
   5454       otherwise just move the code pointer to the end of the extra data. */
   5455 
   5456       if (class_has_8bitchar > 0)
   5457         {
   5458         *code++ |= XCL_MAP;
   5459         memmove(code + (32 / sizeof(pcre_uchar)), code,
   5460           IN_UCHARS(class_uchardata - code));
   5461         if (negate_class && !xclass_has_prop)
   5462           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
   5463         memcpy(code, classbits, 32);
   5464         code = class_uchardata + (32 / sizeof(pcre_uchar));
   5465         }
   5466       else code = class_uchardata;
   5467 
   5468       /* Now fill in the complete length of the item */
   5469 
   5470       PUT(previous, 1, (int)(code - previous));
   5471       break;   /* End of class handling */
   5472       }
   5473 #endif
   5474 
   5475     /* If there are no characters > 255, or they are all to be included or
   5476     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
   5477     whole class was negated and whether there were negative specials such as \S
   5478     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
   5479     negating it if necessary. */
   5480 
   5481     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
   5482     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
   5483       {
   5484       if (negate_class)
   5485         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
   5486       memcpy(code, classbits, 32);
   5487       }
   5488     code += 32 / sizeof(pcre_uchar);
   5489 
   5490     END_CLASS:
   5491     break;
   5492 
   5493 
   5494     /* ===================================================================*/
   5495     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
   5496     has been tested above. */
   5497 
   5498     case CHAR_LEFT_CURLY_BRACKET:
   5499     if (!is_quantifier) goto NORMAL_CHAR;
   5500     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
   5501     if (*errorcodeptr != 0) goto FAILED;
   5502     goto REPEAT;
   5503 
   5504     case CHAR_ASTERISK:
   5505     repeat_min = 0;
   5506     repeat_max = -1;
   5507     goto REPEAT;
   5508 
   5509     case CHAR_PLUS:
   5510     repeat_min = 1;
   5511     repeat_max = -1;
   5512     goto REPEAT;
   5513 
   5514     case CHAR_QUESTION_MARK:
   5515     repeat_min = 0;
   5516     repeat_max = 1;
   5517 
   5518     REPEAT:
   5519     if (previous == NULL)
   5520       {
   5521       *errorcodeptr = ERR9;
   5522       goto FAILED;
   5523       }
   5524 
   5525     if (repeat_min == 0)
   5526       {
   5527       firstchar = zerofirstchar;    /* Adjust for zero repeat */
   5528       firstcharflags = zerofirstcharflags;
   5529       reqchar = zeroreqchar;        /* Ditto */
   5530       reqcharflags = zeroreqcharflags;
   5531       }
   5532 
   5533     /* Remember whether this is a variable length repeat */
   5534 
   5535     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
   5536 
   5537     op_type = 0;                    /* Default single-char op codes */
   5538     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
   5539 
   5540     /* Save start of previous item, in case we have to move it up in order to
   5541     insert something before it. */
   5542 
   5543     tempcode = previous;
   5544 
   5545     /* Before checking for a possessive quantifier, we must skip over
   5546     whitespace and comments in extended mode because Perl allows white space at
   5547     this point. */
   5548 
   5549     if ((options & PCRE_EXTENDED) != 0)
   5550       {
   5551       const pcre_uchar *p = ptr + 1;
   5552       for (;;)
   5553         {
   5554         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
   5555         if (*p != CHAR_NUMBER_SIGN) break;
   5556         p++;
   5557         while (*p != CHAR_NULL)
   5558           {
   5559           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
   5560             {                        /* IS_NEWLINE sets cd->nllen. */
   5561             p += cd->nllen;
   5562             break;
   5563             }
   5564           p++;
   5565 #ifdef SUPPORT_UTF
   5566           if (utf) FORWARDCHAR(p);
   5567 #endif
   5568           }           /* Loop for comment characters */
   5569         }             /* Loop for multiple comments */
   5570       ptr = p - 1;    /* Character before the next significant one. */
   5571       }
   5572 
   5573     /* If the next character is '+', we have a possessive quantifier. This
   5574     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
   5575     If the next character is '?' this is a minimizing repeat, by default,
   5576     but if PCRE_UNGREEDY is set, it works the other way round. We change the
   5577     repeat type to the non-default. */
   5578 
   5579     if (ptr[1] == CHAR_PLUS)
   5580       {
   5581       repeat_type = 0;                  /* Force greedy */
   5582       possessive_quantifier = TRUE;
   5583       ptr++;
   5584       }
   5585     else if (ptr[1] == CHAR_QUESTION_MARK)
   5586       {
   5587       repeat_type = greedy_non_default;
   5588       ptr++;
   5589       }
   5590     else repeat_type = greedy_default;
   5591 
   5592     /* If previous was a recursion call, wrap it in atomic brackets so that
   5593     previous becomes the atomic group. All recursions were so wrapped in the
   5594     past, but it no longer happens for non-repeated recursions. In fact, the
   5595     repeated ones could be re-implemented independently so as not to need this,
   5596     but for the moment we rely on the code for repeating groups. */
   5597 
   5598     if (*previous == OP_RECURSE)
   5599       {
   5600       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
   5601       *previous = OP_ONCE;
   5602       PUT(previous, 1, 2 + 2*LINK_SIZE);
   5603       previous[2 + 2*LINK_SIZE] = OP_KET;
   5604       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
   5605       code += 2 + 2 * LINK_SIZE;
   5606       length_prevgroup = 3 + 3*LINK_SIZE;
   5607 
   5608       /* When actually compiling, we need to check whether this was a forward
   5609       reference, and if so, adjust the offset. */
   5610 
   5611       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
   5612         {
   5613         int offset = GET(cd->hwm, -LINK_SIZE);
   5614         if (offset == previous + 1 - cd->start_code)
   5615           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
   5616         }
   5617       }
   5618 
   5619     /* Now handle repetition for the different types of item. */
   5620 
   5621     /* If previous was a character or negated character match, abolish the item
   5622     and generate a repeat item instead. If a char item has a minimum of more
   5623     than one, ensure that it is set in reqchar - it might not be if a sequence
   5624     such as x{3} is the first thing in a branch because the x will have gone
   5625     into firstchar instead.  */
   5626 
   5627     if (*previous == OP_CHAR || *previous == OP_CHARI
   5628         || *previous == OP_NOT || *previous == OP_NOTI)
   5629       {
   5630       switch (*previous)
   5631         {
   5632         default: /* Make compiler happy. */
   5633         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
   5634         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
   5635         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
   5636         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
   5637         }
   5638 
   5639       /* Deal with UTF characters that take up more than one character. It's
   5640       easier to write this out separately than try to macrify it. Use c to
   5641       hold the length of the character in bytes, plus UTF_LENGTH to flag that
   5642       it's a length rather than a small character. */
   5643 
   5644 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5645       if (utf && NOT_FIRSTCHAR(code[-1]))
   5646         {
   5647         pcre_uchar *lastchar = code - 1;
   5648         BACKCHAR(lastchar);
   5649         c = (int)(code - lastchar);     /* Length of UTF-8 character */
   5650         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
   5651         c |= UTF_LENGTH;                /* Flag c as a length */
   5652         }
   5653       else
   5654 #endif /* SUPPORT_UTF */
   5655 
   5656       /* Handle the case of a single character - either with no UTF support, or
   5657       with UTF disabled, or for a single character UTF character. */
   5658         {
   5659         c = code[-1];
   5660         if (*previous <= OP_CHARI && repeat_min > 1)
   5661           {
   5662           reqchar = c;
   5663           reqcharflags = req_caseopt | cd->req_varyopt;
   5664           }
   5665         }
   5666 
   5667       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
   5668       }
   5669 
   5670     /* If previous was a character type match (\d or similar), abolish it and
   5671     create a suitable repeat item. The code is shared with single-character
   5672     repeats by setting op_type to add a suitable offset into repeat_type. Note
   5673     that the Unicode property types will be present only when SUPPORT_UCP is
   5674     defined, but we don't wrap the little bits of code here because it just
   5675     makes it horribly messy. */
   5676 
   5677     else if (*previous < OP_EODN)
   5678       {
   5679       pcre_uchar *oldcode;
   5680       int prop_type, prop_value;
   5681       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
   5682       c = *previous;
   5683 
   5684       OUTPUT_SINGLE_REPEAT:
   5685       if (*previous == OP_PROP || *previous == OP_NOTPROP)
   5686         {
   5687         prop_type = previous[1];
   5688         prop_value = previous[2];
   5689         }
   5690       else prop_type = prop_value = -1;
   5691 
   5692       oldcode = code;
   5693       code = previous;                  /* Usually overwrite previous item */
   5694 
   5695       /* If the maximum is zero then the minimum must also be zero; Perl allows
   5696       this case, so we do too - by simply omitting the item altogether. */
   5697 
   5698       if (repeat_max == 0) goto END_REPEAT;
   5699 
   5700       /* Combine the op_type with the repeat_type */
   5701 
   5702       repeat_type += op_type;
   5703 
   5704       /* A minimum of zero is handled either as the special case * or ?, or as
   5705       an UPTO, with the maximum given. */
   5706 
   5707       if (repeat_min == 0)
   5708         {
   5709         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
   5710           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
   5711         else
   5712           {
   5713           *code++ = OP_UPTO + repeat_type;
   5714           PUT2INC(code, 0, repeat_max);
   5715           }
   5716         }
   5717 
   5718       /* A repeat minimum of 1 is optimized into some special cases. If the
   5719       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
   5720       left in place and, if the maximum is greater than 1, we use OP_UPTO with
   5721       one less than the maximum. */
   5722 
   5723       else if (repeat_min == 1)
   5724         {
   5725         if (repeat_max == -1)
   5726           *code++ = OP_PLUS + repeat_type;
   5727         else
   5728           {
   5729           code = oldcode;                 /* leave previous item in place */
   5730           if (repeat_max == 1) goto END_REPEAT;
   5731           *code++ = OP_UPTO + repeat_type;
   5732           PUT2INC(code, 0, repeat_max - 1);
   5733           }
   5734         }
   5735 
   5736       /* The case {n,n} is just an EXACT, while the general case {n,m} is
   5737       handled as an EXACT followed by an UPTO. */
   5738 
   5739       else
   5740         {
   5741         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
   5742         PUT2INC(code, 0, repeat_min);
   5743 
   5744         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
   5745         we have to insert the character for the previous code. For a repeated
   5746         Unicode property match, there are two extra bytes that define the
   5747         required property. In UTF-8 mode, long characters have their length in
   5748         c, with the UTF_LENGTH bit as a flag. */
   5749 
   5750         if (repeat_max < 0)
   5751           {
   5752 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5753           if (utf && (c & UTF_LENGTH) != 0)
   5754             {
   5755             memcpy(code, utf_chars, IN_UCHARS(c & 7));
   5756             code += c & 7;
   5757             }
   5758           else
   5759 #endif
   5760             {
   5761             *code++ = c;
   5762             if (prop_type >= 0)
   5763               {
   5764               *code++ = prop_type;
   5765               *code++ = prop_value;
   5766               }
   5767             }
   5768           *code++ = OP_STAR + repeat_type;
   5769           }
   5770 
   5771         /* Else insert an UPTO if the max is greater than the min, again
   5772         preceded by the character, for the previously inserted code. If the
   5773         UPTO is just for 1 instance, we can use QUERY instead. */
   5774 
   5775         else if (repeat_max != repeat_min)
   5776           {
   5777 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5778           if (utf && (c & UTF_LENGTH) != 0)
   5779             {
   5780             memcpy(code, utf_chars, IN_UCHARS(c & 7));
   5781             code += c & 7;
   5782             }
   5783           else
   5784 #endif
   5785           *code++ = c;
   5786           if (prop_type >= 0)
   5787             {
   5788             *code++ = prop_type;
   5789             *code++ = prop_value;
   5790             }
   5791           repeat_max -= repeat_min;
   5792 
   5793           if (repeat_max == 1)
   5794             {
   5795             *code++ = OP_QUERY + repeat_type;
   5796             }
   5797           else
   5798             {
   5799             *code++ = OP_UPTO + repeat_type;
   5800             PUT2INC(code, 0, repeat_max);
   5801             }
   5802           }
   5803         }
   5804 
   5805       /* The character or character type itself comes last in all cases. */
   5806 
   5807 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5808       if (utf && (c & UTF_LENGTH) != 0)
   5809         {
   5810         memcpy(code, utf_chars, IN_UCHARS(c & 7));
   5811         code += c & 7;
   5812         }
   5813       else
   5814 #endif
   5815       *code++ = c;
   5816 
   5817       /* For a repeated Unicode property match, there are two extra bytes that
   5818       define the required property. */
   5819 
   5820 #ifdef SUPPORT_UCP
   5821       if (prop_type >= 0)
   5822         {
   5823         *code++ = prop_type;
   5824         *code++ = prop_value;
   5825         }
   5826 #endif
   5827       }
   5828 
   5829     /* If previous was a character class or a back reference, we put the repeat
   5830     stuff after it, but just skip the item if the repeat was {0,0}. */
   5831 
   5832     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
   5833 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   5834              *previous == OP_XCLASS ||
   5835 #endif
   5836              *previous == OP_REF   || *previous == OP_REFI ||
   5837              *previous == OP_DNREF || *previous == OP_DNREFI)
   5838       {
   5839       if (repeat_max == 0)
   5840         {
   5841         code = previous;
   5842         goto END_REPEAT;
   5843         }
   5844 
   5845       if (repeat_min == 0 && repeat_max == -1)
   5846         *code++ = OP_CRSTAR + repeat_type;
   5847       else if (repeat_min == 1 && repeat_max == -1)
   5848         *code++ = OP_CRPLUS + repeat_type;
   5849       else if (repeat_min == 0 && repeat_max == 1)
   5850         *code++ = OP_CRQUERY + repeat_type;
   5851       else
   5852         {
   5853         *code++ = OP_CRRANGE + repeat_type;
   5854         PUT2INC(code, 0, repeat_min);
   5855         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
   5856         PUT2INC(code, 0, repeat_max);
   5857         }
   5858       }
   5859 
   5860     /* If previous was a bracket group, we may have to replicate it in certain
   5861     cases. Note that at this point we can encounter only the "basic" bracket
   5862     opcodes such as BRA and CBRA, as this is the place where they get converted
   5863     into the more special varieties such as BRAPOS and SBRA. A test for >=
   5864     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
   5865     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
   5866     Originally, PCRE did not allow repetition of assertions, but now it does,
   5867     for Perl compatibility. */
   5868 
   5869     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
   5870       {
   5871       register int i;
   5872       int len = (int)(code - previous);
   5873       pcre_uchar *bralink = NULL;
   5874       pcre_uchar *brazeroptr = NULL;
   5875 
   5876       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
   5877       we just ignore the repeat. */
   5878 
   5879       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
   5880         goto END_REPEAT;
   5881 
   5882       /* There is no sense in actually repeating assertions. The only potential
   5883       use of repetition is in cases when the assertion is optional. Therefore,
   5884       if the minimum is greater than zero, just ignore the repeat. If the
   5885       maximum is not zero or one, set it to 1. */
   5886 
   5887       if (*previous < OP_ONCE)    /* Assertion */
   5888         {
   5889         if (repeat_min > 0) goto END_REPEAT;
   5890         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
   5891         }
   5892 
   5893       /* The case of a zero minimum is special because of the need to stick
   5894       OP_BRAZERO in front of it, and because the group appears once in the
   5895       data, whereas in other cases it appears the minimum number of times. For
   5896       this reason, it is simplest to treat this case separately, as otherwise
   5897       the code gets far too messy. There are several special subcases when the
   5898       minimum is zero. */
   5899 
   5900       if (repeat_min == 0)
   5901         {
   5902         /* If the maximum is also zero, we used to just omit the group from the
   5903         output altogether, like this:
   5904 
   5905         ** if (repeat_max == 0)
   5906         **   {
   5907         **   code = previous;
   5908         **   goto END_REPEAT;
   5909         **   }
   5910 
   5911         However, that fails when a group or a subgroup within it is referenced
   5912         as a subroutine from elsewhere in the pattern, so now we stick in
   5913         OP_SKIPZERO in front of it so that it is skipped on execution. As we
   5914         don't have a list of which groups are referenced, we cannot do this
   5915         selectively.
   5916 
   5917         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
   5918         and do no more at this point. However, we do need to adjust any
   5919         OP_RECURSE calls inside the group that refer to the group itself or any
   5920         internal or forward referenced group, because the offset is from the
   5921         start of the whole regex. Temporarily terminate the pattern while doing
   5922         this. */
   5923 
   5924         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
   5925           {
   5926           *code = OP_END;
   5927           adjust_recurse(previous, 1, utf, cd, save_hwm);
   5928           memmove(previous + 1, previous, IN_UCHARS(len));
   5929           code++;
   5930           if (repeat_max == 0)
   5931             {
   5932             *previous++ = OP_SKIPZERO;
   5933             goto END_REPEAT;
   5934             }
   5935           brazeroptr = previous;    /* Save for possessive optimizing */
   5936           *previous++ = OP_BRAZERO + repeat_type;
   5937           }
   5938 
   5939         /* If the maximum is greater than 1 and limited, we have to replicate
   5940         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
   5941         The first one has to be handled carefully because it's the original
   5942         copy, which has to be moved up. The remainder can be handled by code
   5943         that is common with the non-zero minimum case below. We have to
   5944         adjust the value of repeat_max, since one less copy is required. Once
   5945         again, we may have to adjust any OP_RECURSE calls inside the group. */
   5946 
   5947         else
   5948           {
   5949           int offset;
   5950           *code = OP_END;
   5951           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
   5952           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
   5953           code += 2 + LINK_SIZE;
   5954           *previous++ = OP_BRAZERO + repeat_type;
   5955           *previous++ = OP_BRA;
   5956 
   5957           /* We chain together the bracket offset fields that have to be
   5958           filled in later when the ends of the brackets are reached. */
   5959 
   5960           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
   5961           bralink = previous;
   5962           PUTINC(previous, 0, offset);
   5963           }
   5964 
   5965         repeat_max--;
   5966         }
   5967 
   5968       /* If the minimum is greater than zero, replicate the group as many
   5969       times as necessary, and adjust the maximum to the number of subsequent
   5970       copies that we need. If we set a first char from the group, and didn't
   5971       set a required char, copy the latter from the former. If there are any
   5972       forward reference subroutine calls in the group, there will be entries on
   5973       the workspace list; replicate these with an appropriate increment. */
   5974 
   5975       else
   5976         {
   5977         if (repeat_min > 1)
   5978           {
   5979           /* In the pre-compile phase, we don't actually do the replication. We
   5980           just adjust the length as if we had. Do some paranoid checks for
   5981           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
   5982           integer type when available, otherwise double. */
   5983 
   5984           if (lengthptr != NULL)
   5985             {
   5986             int delta = (repeat_min - 1)*length_prevgroup;
   5987             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
   5988                   (INT64_OR_DOUBLE)length_prevgroup >
   5989                     (INT64_OR_DOUBLE)INT_MAX ||
   5990                 OFLOW_MAX - *lengthptr < delta)
   5991               {
   5992               *errorcodeptr = ERR20;
   5993               goto FAILED;
   5994               }
   5995             *lengthptr += delta;
   5996             }
   5997 
   5998           /* This is compiling for real. If there is a set first byte for
   5999           the group, and we have not yet set a "required byte", set it. Make
   6000           sure there is enough workspace for copying forward references before
   6001           doing the copy. */
   6002 
   6003           else
   6004             {
   6005             if (groupsetfirstchar && reqcharflags < 0)
   6006               {
   6007               reqchar = firstchar;
   6008               reqcharflags = firstcharflags;
   6009               }
   6010 
   6011             for (i = 1; i < repeat_min; i++)
   6012               {
   6013               pcre_uchar *hc;
   6014               pcre_uchar *this_hwm = cd->hwm;
   6015               memcpy(code, previous, IN_UCHARS(len));
   6016 
   6017               while (cd->hwm > cd->start_workspace + cd->workspace_size -
   6018                      WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
   6019                 {
   6020                 size_t save_offset = save_hwm - cd->start_workspace;
   6021                 size_t this_offset = this_hwm - cd->start_workspace;
   6022                 *errorcodeptr = expand_workspace(cd);
   6023                 if (*errorcodeptr != 0) goto FAILED;
   6024                 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
   6025                 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
   6026                 }
   6027 
   6028               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
   6029                 {
   6030                 PUT(cd->hwm, 0, GET(hc, 0) + len);
   6031                 cd->hwm += LINK_SIZE;
   6032                 }
   6033               save_hwm = this_hwm;
   6034               code += len;
   6035               }
   6036             }
   6037           }
   6038 
   6039         if (repeat_max > 0) repeat_max -= repeat_min;
   6040         }
   6041 
   6042       /* This code is common to both the zero and non-zero minimum cases. If
   6043       the maximum is limited, it replicates the group in a nested fashion,
   6044       remembering the bracket starts on a stack. In the case of a zero minimum,
   6045       the first one was set up above. In all cases the repeat_max now specifies
   6046       the number of additional copies needed. Again, we must remember to
   6047       replicate entries on the forward reference list. */
   6048 
   6049       if (repeat_max >= 0)
   6050         {
   6051         /* In the pre-compile phase, we don't actually do the replication. We
   6052         just adjust the length as if we had. For each repetition we must add 1
   6053         to the length for BRAZERO and for all but the last repetition we must
   6054         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
   6055         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
   6056         a 64-bit integer type when available, otherwise double. */
   6057 
   6058         if (lengthptr != NULL && repeat_max > 0)
   6059           {
   6060           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
   6061                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
   6062           if ((INT64_OR_DOUBLE)repeat_max *
   6063                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
   6064                   > (INT64_OR_DOUBLE)INT_MAX ||
   6065               OFLOW_MAX - *lengthptr < delta)
   6066             {
   6067             *errorcodeptr = ERR20;
   6068             goto FAILED;
   6069             }
   6070           *lengthptr += delta;
   6071           }
   6072 
   6073         /* This is compiling for real */
   6074 
   6075         else for (i = repeat_max - 1; i >= 0; i--)
   6076           {
   6077           pcre_uchar *hc;
   6078           pcre_uchar *this_hwm = cd->hwm;
   6079 
   6080           *code++ = OP_BRAZERO + repeat_type;
   6081 
   6082           /* All but the final copy start a new nesting, maintaining the
   6083           chain of brackets outstanding. */
   6084 
   6085           if (i != 0)
   6086             {
   6087             int offset;
   6088             *code++ = OP_BRA;
   6089             offset = (bralink == NULL)? 0 : (int)(code - bralink);
   6090             bralink = code;
   6091             PUTINC(code, 0, offset);
   6092             }
   6093 
   6094           memcpy(code, previous, IN_UCHARS(len));
   6095 
   6096           /* Ensure there is enough workspace for forward references before
   6097           copying them. */
   6098 
   6099           while (cd->hwm > cd->start_workspace + cd->workspace_size -
   6100                  WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
   6101             {
   6102             size_t save_offset = save_hwm - cd->start_workspace;
   6103             size_t this_offset = this_hwm - cd->start_workspace;
   6104             *errorcodeptr = expand_workspace(cd);
   6105             if (*errorcodeptr != 0) goto FAILED;
   6106             save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
   6107             this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
   6108             }
   6109 
   6110           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
   6111             {
   6112             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
   6113             cd->hwm += LINK_SIZE;
   6114             }
   6115           save_hwm = this_hwm;
   6116           code += len;
   6117           }
   6118 
   6119         /* Now chain through the pending brackets, and fill in their length
   6120         fields (which are holding the chain links pro tem). */
   6121 
   6122         while (bralink != NULL)
   6123           {
   6124           int oldlinkoffset;
   6125           int offset = (int)(code - bralink + 1);
   6126           pcre_uchar *bra = code - offset;
   6127           oldlinkoffset = GET(bra, 1);
   6128           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
   6129           *code++ = OP_KET;
   6130           PUTINC(code, 0, offset);
   6131           PUT(bra, 1, offset);
   6132           }
   6133         }
   6134 
   6135       /* If the maximum is unlimited, set a repeater in the final copy. For
   6136       ONCE brackets, that's all we need to do. However, possessively repeated
   6137       ONCE brackets can be converted into non-capturing brackets, as the
   6138       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
   6139       deal with possessive ONCEs specially.
   6140 
   6141       Otherwise, when we are doing the actual compile phase, check to see
   6142       whether this group is one that could match an empty string. If so,
   6143       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
   6144       that runtime checking can be done. [This check is also applied to ONCE
   6145       groups at runtime, but in a different way.]
   6146 
   6147       Then, if the quantifier was possessive and the bracket is not a
   6148       conditional, we convert the BRA code to the POS form, and the KET code to
   6149       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
   6150       subpattern at both the start and at the end.) The use of special opcodes
   6151       makes it possible to reduce greatly the stack usage in pcre_exec(). If
   6152       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
   6153 
   6154       Then, if the minimum number of matches is 1 or 0, cancel the possessive
   6155       flag so that the default action below, of wrapping everything inside
   6156       atomic brackets, does not happen. When the minimum is greater than 1,
   6157       there will be earlier copies of the group, and so we still have to wrap
   6158       the whole thing. */
   6159 
   6160       else
   6161         {
   6162         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
   6163         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
   6164 
   6165         /* Convert possessive ONCE brackets to non-capturing */
   6166 
   6167         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
   6168             possessive_quantifier) *bracode = OP_BRA;
   6169 
   6170         /* For non-possessive ONCE brackets, all we need to do is to
   6171         set the KET. */
   6172 
   6173         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
   6174           *ketcode = OP_KETRMAX + repeat_type;
   6175 
   6176         /* Handle non-ONCE brackets and possessive ONCEs (which have been
   6177         converted to non-capturing above). */
   6178 
   6179         else
   6180           {
   6181           /* In the compile phase, check for empty string matching. */
   6182 
   6183           if (lengthptr == NULL)
   6184             {
   6185             pcre_uchar *scode = bracode;
   6186             do
   6187               {
   6188               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
   6189                 {
   6190                 *bracode += OP_SBRA - OP_BRA;
   6191                 break;
   6192                 }
   6193               scode += GET(scode, 1);
   6194               }
   6195             while (*scode == OP_ALT);
   6196             }
   6197 
   6198           /* Handle possessive quantifiers. */
   6199 
   6200           if (possessive_quantifier)
   6201             {
   6202             /* For COND brackets, we wrap the whole thing in a possessively
   6203             repeated non-capturing bracket, because we have not invented POS
   6204             versions of the COND opcodes. Because we are moving code along, we
   6205             must ensure that any pending recursive references are updated. */
   6206 
   6207             if (*bracode == OP_COND || *bracode == OP_SCOND)
   6208               {
   6209               int nlen = (int)(code - bracode);
   6210               *code = OP_END;
   6211               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
   6212               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
   6213               code += 1 + LINK_SIZE;
   6214               nlen += 1 + LINK_SIZE;
   6215               *bracode = OP_BRAPOS;
   6216               *code++ = OP_KETRPOS;
   6217               PUTINC(code, 0, nlen);
   6218               PUT(bracode, 1, nlen);
   6219               }
   6220 
   6221             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
   6222 
   6223             else
   6224               {
   6225               *bracode += 1;              /* Switch to xxxPOS opcodes */
   6226               *ketcode = OP_KETRPOS;
   6227               }
   6228 
   6229             /* If the minimum is zero, mark it as possessive, then unset the
   6230             possessive flag when the minimum is 0 or 1. */
   6231 
   6232             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
   6233             if (repeat_min < 2) possessive_quantifier = FALSE;
   6234             }
   6235 
   6236           /* Non-possessive quantifier */
   6237 
   6238           else *ketcode = OP_KETRMAX + repeat_type;
   6239           }
   6240         }
   6241       }
   6242 
   6243     /* If previous is OP_FAIL, it was generated by an empty class [] in
   6244     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
   6245     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
   6246     error above. We can just ignore the repeat in JS case. */
   6247 
   6248     else if (*previous == OP_FAIL) goto END_REPEAT;
   6249 
   6250     /* Else there's some kind of shambles */
   6251 
   6252     else
   6253       {
   6254       *errorcodeptr = ERR11;
   6255       goto FAILED;
   6256       }
   6257 
   6258     /* If the character following a repeat is '+', possessive_quantifier is
   6259     TRUE. For some opcodes, there are special alternative opcodes for this
   6260     case. For anything else, we wrap the entire repeated item inside OP_ONCE
   6261     brackets. Logically, the '+' notation is just syntactic sugar, taken from
   6262     Sun's Java package, but the special opcodes can optimize it.
   6263 
   6264     Some (but not all) possessively repeated subpatterns have already been
   6265     completely handled in the code just above. For them, possessive_quantifier
   6266     is always FALSE at this stage. Note that the repeated item starts at
   6267     tempcode, not at previous, which might be the first part of a string whose
   6268     (former) last char we repeated. */
   6269 
   6270     if (possessive_quantifier)
   6271       {
   6272       int len;
   6273 
   6274       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
   6275       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
   6276       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
   6277       remains is greater than zero, there's a further opcode that can be
   6278       handled. If not, do nothing, leaving the EXACT alone. */
   6279 
   6280       switch(*tempcode)
   6281         {
   6282         case OP_TYPEEXACT:
   6283         tempcode += PRIV(OP_lengths)[*tempcode] +
   6284           ((tempcode[1 + IMM2_SIZE] == OP_PROP
   6285           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
   6286         break;
   6287 
   6288         /* CHAR opcodes are used for exacts whose count is 1. */
   6289 
   6290         case OP_CHAR:
   6291         case OP_CHARI:
   6292         case OP_NOT:
   6293         case OP_NOTI:
   6294         case OP_EXACT:
   6295         case OP_EXACTI:
   6296         case OP_NOTEXACT:
   6297         case OP_NOTEXACTI:
   6298         tempcode += PRIV(OP_lengths)[*tempcode];
   6299 #ifdef SUPPORT_UTF
   6300         if (utf && HAS_EXTRALEN(tempcode[-1]))
   6301           tempcode += GET_EXTRALEN(tempcode[-1]);
   6302 #endif
   6303         break;
   6304 
   6305         /* For the class opcodes, the repeat operator appears at the end;
   6306         adjust tempcode to point to it. */
   6307 
   6308         case OP_CLASS:
   6309         case OP_NCLASS:
   6310         tempcode += 1 + 32/sizeof(pcre_uchar);
   6311         break;
   6312 
   6313 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   6314         case OP_XCLASS:
   6315         tempcode += GET(tempcode, 1);
   6316         break;
   6317 #endif
   6318         }
   6319 
   6320       /* If tempcode is equal to code (which points to the end of the repeated
   6321       item), it means we have skipped an EXACT item but there is no following
   6322       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
   6323       all other cases, tempcode will be pointing to the repeat opcode, and will
   6324       be less than code, so the value of len will be greater than 0. */
   6325 
   6326       len = (int)(code - tempcode);
   6327       if (len > 0)
   6328         {
   6329         unsigned int repcode = *tempcode;
   6330 
   6331         /* There is a table for possessifying opcodes, all of which are less
   6332         than OP_CALLOUT. A zero entry means there is no possessified version.
   6333         */
   6334 
   6335         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
   6336           *tempcode = opcode_possessify[repcode];
   6337 
   6338         /* For opcodes without a special possessified version, wrap the item in
   6339         ONCE brackets. Because we are moving code along, we must ensure that any
   6340         pending recursive references are updated. */
   6341 
   6342         else
   6343           {
   6344           *code = OP_END;
   6345           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
   6346           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
   6347           code += 1 + LINK_SIZE;
   6348           len += 1 + LINK_SIZE;
   6349           tempcode[0] = OP_ONCE;
   6350           *code++ = OP_KET;
   6351           PUTINC(code, 0, len);
   6352           PUT(tempcode, 1, len);
   6353           }
   6354         }
   6355 
   6356 #ifdef NEVER
   6357       if (len > 0) switch (*tempcode)
   6358         {
   6359         case OP_STAR:  *tempcode = OP_POSSTAR; break;
   6360         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
   6361         case OP_QUERY: *tempcode = OP_POSQUERY; break;
   6362         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
   6363 
   6364         case OP_STARI:  *tempcode = OP_POSSTARI; break;
   6365         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
   6366         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
   6367         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
   6368 
   6369         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
   6370         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
   6371         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
   6372         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
   6373 
   6374         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
   6375         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
   6376         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
   6377         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
   6378 
   6379         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
   6380         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
   6381         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
   6382         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
   6383 
   6384         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
   6385         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
   6386         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
   6387         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
   6388 
   6389         /* Because we are moving code along, we must ensure that any
   6390         pending recursive references are updated. */
   6391 
   6392         default:
   6393         *code = OP_END;
   6394         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
   6395         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
   6396         code += 1 + LINK_SIZE;
   6397         len += 1 + LINK_SIZE;
   6398         tempcode[0] = OP_ONCE;
   6399         *code++ = OP_KET;
   6400         PUTINC(code, 0, len);
   6401         PUT(tempcode, 1, len);
   6402         break;
   6403         }
   6404 #endif
   6405       }
   6406 
   6407     /* In all cases we no longer have a previous item. We also set the
   6408     "follows varying string" flag for subsequently encountered reqchars if
   6409     it isn't already set and we have just passed a varying length item. */
   6410 
   6411     END_REPEAT:
   6412     previous = NULL;
   6413     cd->req_varyopt |= reqvary;
   6414     break;
   6415 
   6416 
   6417     /* ===================================================================*/
   6418     /* Start of nested parenthesized sub-expression, or comment or lookahead or
   6419     lookbehind or option setting or condition or all the other extended
   6420     parenthesis forms.  */
   6421 
   6422     case CHAR_LEFT_PARENTHESIS:
   6423     newoptions = options;
   6424     skipbytes = 0;
   6425     bravalue = OP_CBRA;
   6426     save_hwm = cd->hwm;
   6427     reset_bracount = FALSE;
   6428 
   6429     /* First deal with various "verbs" that can be introduced by '*'. */
   6430 
   6431     ptr++;
   6432     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
   6433          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
   6434       {
   6435       int i, namelen;
   6436       int arglen = 0;
   6437       const char *vn = verbnames;
   6438       const pcre_uchar *name = ptr + 1;
   6439       const pcre_uchar *arg = NULL;
   6440       previous = NULL;
   6441       ptr++;
   6442       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
   6443       namelen = (int)(ptr - name);
   6444 
   6445       /* It appears that Perl allows any characters whatsoever, other than
   6446       a closing parenthesis, to appear in arguments, so we no longer insist on
   6447       letters, digits, and underscores. */
   6448 
   6449       if (*ptr == CHAR_COLON)
   6450         {
   6451         arg = ++ptr;
   6452         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
   6453         arglen = (int)(ptr - arg);
   6454         if ((unsigned int)arglen > MAX_MARK)
   6455           {
   6456           *errorcodeptr = ERR75;
   6457           goto FAILED;
   6458           }
   6459         }
   6460 
   6461       if (*ptr != CHAR_RIGHT_PARENTHESIS)
   6462         {
   6463         *errorcodeptr = ERR60;
   6464         goto FAILED;
   6465         }
   6466 
   6467       /* Scan the table of verb names */
   6468 
   6469       for (i = 0; i < verbcount; i++)
   6470         {
   6471         if (namelen == verbs[i].len &&
   6472             STRNCMP_UC_C8(name, vn, namelen) == 0)
   6473           {
   6474           int setverb;
   6475 
   6476           /* Check for open captures before ACCEPT and convert it to
   6477           ASSERT_ACCEPT if in an assertion. */
   6478 
   6479           if (verbs[i].op == OP_ACCEPT)
   6480             {
   6481             open_capitem *oc;
   6482             if (arglen != 0)
   6483               {
   6484               *errorcodeptr = ERR59;
   6485               goto FAILED;
   6486               }
   6487             cd->had_accept = TRUE;
   6488             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
   6489               {
   6490               *code++ = OP_CLOSE;
   6491               PUT2INC(code, 0, oc->number);
   6492               }
   6493             setverb = *code++ =
   6494               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
   6495 
   6496             /* Do not set firstchar after *ACCEPT */
   6497             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   6498             }
   6499 
   6500           /* Handle other cases with/without an argument */
   6501 
   6502           else if (arglen == 0)
   6503             {
   6504             if (verbs[i].op < 0)   /* Argument is mandatory */
   6505               {
   6506               *errorcodeptr = ERR66;
   6507               goto FAILED;
   6508               }
   6509             setverb = *code++ = verbs[i].op;
   6510             }
   6511 
   6512           else
   6513             {
   6514             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
   6515               {
   6516               *errorcodeptr = ERR59;
   6517               goto FAILED;
   6518               }
   6519             setverb = *code++ = verbs[i].op_arg;
   6520             *code++ = arglen;
   6521             memcpy(code, arg, IN_UCHARS(arglen));
   6522             code += arglen;
   6523             *code++ = 0;
   6524             }
   6525 
   6526           switch (setverb)
   6527             {
   6528             case OP_THEN:
   6529             case OP_THEN_ARG:
   6530             cd->external_flags |= PCRE_HASTHEN;
   6531             break;
   6532 
   6533             case OP_PRUNE:
   6534             case OP_PRUNE_ARG:
   6535             case OP_SKIP:
   6536             case OP_SKIP_ARG:
   6537             cd->had_pruneorskip = TRUE;
   6538             break;
   6539             }
   6540 
   6541           break;  /* Found verb, exit loop */
   6542           }
   6543 
   6544         vn += verbs[i].len + 1;
   6545         }
   6546 
   6547       if (i < verbcount) continue;    /* Successfully handled a verb */
   6548       *errorcodeptr = ERR60;          /* Verb not recognized */
   6549       goto FAILED;
   6550       }
   6551 
   6552     /* Deal with the extended parentheses; all are introduced by '?', and the
   6553     appearance of any of them means that this is not a capturing group. */
   6554 
   6555     else if (*ptr == CHAR_QUESTION_MARK)
   6556       {
   6557       int i, set, unset, namelen;
   6558       int *optset;
   6559       const pcre_uchar *name;
   6560       pcre_uchar *slot;
   6561 
   6562       switch (*(++ptr))
   6563         {
   6564         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
   6565         ptr++;
   6566         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
   6567         if (*ptr == CHAR_NULL)
   6568           {
   6569           *errorcodeptr = ERR18;
   6570           goto FAILED;
   6571           }
   6572         continue;
   6573 
   6574 
   6575         /* ------------------------------------------------------------ */
   6576         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
   6577         reset_bracount = TRUE;
   6578         /* Fall through */
   6579 
   6580         /* ------------------------------------------------------------ */
   6581         case CHAR_COLON:          /* Non-capturing bracket */
   6582         bravalue = OP_BRA;
   6583         ptr++;
   6584         break;
   6585 
   6586 
   6587         /* ------------------------------------------------------------ */
   6588         case CHAR_LEFT_PARENTHESIS:
   6589         bravalue = OP_COND;       /* Conditional group */
   6590         tempptr = ptr;
   6591 
   6592         /* A condition can be an assertion, a number (referring to a numbered
   6593         group's having been set), a name (referring to a named group), or 'R',
   6594         referring to recursion. R<digits> and R&name are also permitted for
   6595         recursion tests.
   6596 
   6597         There are several ways of testing a named group: (?(name)) is used by Python;
   6598         Perl 5.10 onwards uses (?(<name>) or (?('name')).
   6599 
   6600         There is one unfortunate ambiguity, caused by history. 'R' can be the
   6601         recursive thing or the name 'R' (and similarly for 'R' followed by
   6602         digits). We look for a name first; if not found, we try the other case.
   6603 
   6604         For compatibility with auto-callouts, we allow a callout to be
   6605         specified before a condition that is an assertion. First, check for the
   6606         syntax of a callout; if found, adjust the temporary pointer that is
   6607         used to check for an assertion condition. That's all that is needed! */
   6608 
   6609         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
   6610           {
   6611           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
   6612           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
   6613             tempptr += i + 1;
   6614           }
   6615 
   6616         /* For conditions that are assertions, check the syntax, and then exit
   6617         the switch. This will take control down to where bracketed groups,
   6618         including assertions, are processed. */
   6619 
   6620         if (tempptr[1] == CHAR_QUESTION_MARK &&
   6621               (tempptr[2] == CHAR_EQUALS_SIGN ||
   6622                tempptr[2] == CHAR_EXCLAMATION_MARK ||
   6623                tempptr[2] == CHAR_LESS_THAN_SIGN))
   6624           break;
   6625 
   6626         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
   6627         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
   6628 
   6629         code[1+LINK_SIZE] = OP_CREF;
   6630         skipbytes = 1+IMM2_SIZE;
   6631         refsign = -1;     /* => not a number */
   6632         namelen = -1;     /* => not a name; must set to avoid warning */
   6633         name = NULL;      /* Always set to avoid warning */
   6634         recno = 0;        /* Always set to avoid warning */
   6635 
   6636         /* Check for a test for recursion in a named group. */
   6637 
   6638         ptr++;
   6639         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
   6640           {
   6641           terminator = -1;
   6642           ptr += 2;
   6643           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
   6644           }
   6645 
   6646         /* Check for a test for a named group's having been set, using the Perl
   6647         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
   6648         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
   6649 
   6650         else if (*ptr == CHAR_LESS_THAN_SIGN)
   6651           {
   6652           terminator = CHAR_GREATER_THAN_SIGN;
   6653           ptr++;
   6654           }
   6655         else if (*ptr == CHAR_APOSTROPHE)
   6656           {
   6657           terminator = CHAR_APOSTROPHE;
   6658           ptr++;
   6659           }
   6660         else
   6661           {
   6662           terminator = CHAR_NULL;
   6663           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
   6664             else if (IS_DIGIT(*ptr)) refsign = 0;
   6665           }
   6666 
   6667         /* Handle a number */
   6668 
   6669         if (refsign >= 0)
   6670           {
   6671           while (IS_DIGIT(*ptr))
   6672             {
   6673             recno = recno * 10 + (int)(*ptr - CHAR_0);
   6674             ptr++;
   6675             }
   6676           }
   6677 
   6678         /* Otherwise we expect to read a name; anything else is an error. When
   6679         a name is one of a number of duplicates, a different opcode is used and
   6680         it needs more memory. Unfortunately we cannot tell whether a name is a
   6681         duplicate in the first pass, so we have to allow for more memory. */
   6682 
   6683         else
   6684           {
   6685           if (IS_DIGIT(*ptr))
   6686             {
   6687             *errorcodeptr = ERR84;
   6688             goto FAILED;
   6689             }
   6690           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
   6691             {
   6692             *errorcodeptr = ERR28;   /* Assertion expected */
   6693             goto FAILED;
   6694             }
   6695           name = ptr++;
   6696           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
   6697             {
   6698             ptr++;
   6699             }
   6700           namelen = (int)(ptr - name);
   6701           if (lengthptr != NULL && (options & PCRE_DUPNAMES) != 0)
   6702             *lengthptr += IMM2_SIZE;
   6703           }
   6704 
   6705         /* Check the terminator */
   6706 
   6707         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
   6708             *ptr++ != CHAR_RIGHT_PARENTHESIS)
   6709           {
   6710           ptr--;                  /* Error offset */
   6711           *errorcodeptr = ERR26;  /* Malformed number or name */
   6712           goto FAILED;
   6713           }
   6714 
   6715         /* Do no further checking in the pre-compile phase. */
   6716 
   6717         if (lengthptr != NULL) break;
   6718 
   6719         /* In the real compile we do the work of looking for the actual
   6720         reference. If refsign is not negative, it means we have a number in
   6721         recno. */
   6722 
   6723         if (refsign >= 0)
   6724           {
   6725           if (recno <= 0)
   6726             {
   6727             *errorcodeptr = ERR35;
   6728             goto FAILED;
   6729             }
   6730           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
   6731             cd->bracount - recno + 1 : recno + cd->bracount;
   6732           if (recno <= 0 || recno > cd->final_bracount)
   6733             {
   6734             *errorcodeptr = ERR15;
   6735             goto FAILED;
   6736             }
   6737           PUT2(code, 2+LINK_SIZE, recno);
   6738           break;
   6739           }
   6740 
   6741         /* Otherwise look for the name. */
   6742 
   6743         slot = cd->name_table;
   6744         for (i = 0; i < cd->names_found; i++)
   6745           {
   6746           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
   6747           slot += cd->name_entry_size;
   6748           }
   6749 
   6750         /* Found the named subpattern. If the name is duplicated, add one to
   6751         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
   6752         appropriate data values. Otherwise, just insert the unique subpattern
   6753         number. */
   6754 
   6755         if (i < cd->names_found)
   6756           {
   6757           int offset = i++;
   6758           int count = 1;
   6759           recno = GET2(slot, 0);   /* Number from first found */
   6760           for (; i < cd->names_found; i++)
   6761             {
   6762             slot += cd->name_entry_size;
   6763             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
   6764               (slot+IMM2_SIZE)[namelen] != 0) break;
   6765             count++;
   6766             }
   6767 
   6768           if (count > 1)
   6769             {
   6770             PUT2(code, 2+LINK_SIZE, offset);
   6771             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
   6772             skipbytes += IMM2_SIZE;
   6773             code[1+LINK_SIZE]++;
   6774             }
   6775           else  /* Not a duplicated name */
   6776             {
   6777             PUT2(code, 2+LINK_SIZE, recno);
   6778             }
   6779           }
   6780 
   6781         /* If terminator == CHAR_NULL it means that the name followed directly
   6782         after the opening parenthesis [e.g. (?(abc)...] and in this case there
   6783         are some further alternatives to try. For the cases where terminator !=
   6784         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
   6785         we have now checked all the possibilities, so give an error. */
   6786 
   6787         else if (terminator != CHAR_NULL)
   6788           {
   6789           *errorcodeptr = ERR15;
   6790           goto FAILED;
   6791           }
   6792 
   6793         /* Check for (?(R) for recursion. Allow digits after R to specify a
   6794         specific group number. */
   6795 
   6796         else if (*name == CHAR_R)
   6797           {
   6798           recno = 0;
   6799           for (i = 1; i < namelen; i++)
   6800             {
   6801             if (!IS_DIGIT(name[i]))
   6802               {
   6803               *errorcodeptr = ERR15;
   6804               goto FAILED;
   6805               }
   6806             recno = recno * 10 + name[i] - CHAR_0;
   6807             }
   6808           if (recno == 0) recno = RREF_ANY;
   6809           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
   6810           PUT2(code, 2+LINK_SIZE, recno);
   6811           }
   6812 
   6813         /* Similarly, check for the (?(DEFINE) "condition", which is always
   6814         false. */
   6815 
   6816         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
   6817           {
   6818           code[1+LINK_SIZE] = OP_DEF;
   6819           skipbytes = 1;
   6820           }
   6821 
   6822         /* Reference to an unidentified subpattern. */
   6823 
   6824         else
   6825           {
   6826           *errorcodeptr = ERR15;
   6827           goto FAILED;
   6828           }
   6829         break;
   6830 
   6831 
   6832         /* ------------------------------------------------------------ */
   6833         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
   6834         bravalue = OP_ASSERT;
   6835         cd->assert_depth += 1;
   6836         ptr++;
   6837         break;
   6838 
   6839         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
   6840         thing to do, but Perl allows all assertions to be quantified, and when
   6841         they contain capturing parentheses there may be a potential use for
   6842         this feature. Not that that applies to a quantified (?!) but we allow
   6843         it for uniformity. */
   6844 
   6845         /* ------------------------------------------------------------ */
   6846         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
   6847         ptr++;
   6848         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
   6849              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
   6850             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
   6851           {
   6852           *code++ = OP_FAIL;
   6853           previous = NULL;
   6854           continue;
   6855           }
   6856         bravalue = OP_ASSERT_NOT;
   6857         cd->assert_depth += 1;
   6858         break;
   6859 
   6860 
   6861         /* ------------------------------------------------------------ */
   6862         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
   6863         switch (ptr[1])
   6864           {
   6865           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
   6866           bravalue = OP_ASSERTBACK;
   6867           cd->assert_depth += 1;
   6868           ptr += 2;
   6869           break;
   6870 
   6871           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
   6872           bravalue = OP_ASSERTBACK_NOT;
   6873           cd->assert_depth += 1;
   6874           ptr += 2;
   6875           break;
   6876 
   6877           default:                /* Could be name define, else bad */
   6878           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
   6879             goto DEFINE_NAME;
   6880           ptr++;                  /* Correct offset for error */
   6881           *errorcodeptr = ERR24;
   6882           goto FAILED;
   6883           }
   6884         break;
   6885 
   6886 
   6887         /* ------------------------------------------------------------ */
   6888         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
   6889         bravalue = OP_ONCE;
   6890         ptr++;
   6891         break;
   6892 
   6893 
   6894         /* ------------------------------------------------------------ */
   6895         case CHAR_C:                 /* Callout - may be followed by digits; */
   6896         previous_callout = code;     /* Save for later completion */
   6897         after_manual_callout = 1;    /* Skip one item before completing */
   6898         *code++ = OP_CALLOUT;
   6899           {
   6900           int n = 0;
   6901           ptr++;
   6902           while(IS_DIGIT(*ptr))
   6903             n = n * 10 + *ptr++ - CHAR_0;
   6904           if (*ptr != CHAR_RIGHT_PARENTHESIS)
   6905             {
   6906             *errorcodeptr = ERR39;
   6907             goto FAILED;
   6908             }
   6909           if (n > 255)
   6910             {
   6911             *errorcodeptr = ERR38;
   6912             goto FAILED;
   6913             }
   6914           *code++ = n;
   6915           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
   6916           PUT(code, LINK_SIZE, 0);                          /* Default length */
   6917           code += 2 * LINK_SIZE;
   6918           }
   6919         previous = NULL;
   6920         continue;
   6921 
   6922 
   6923         /* ------------------------------------------------------------ */
   6924         case CHAR_P:              /* Python-style named subpattern handling */
   6925         if (*(++ptr) == CHAR_EQUALS_SIGN ||
   6926             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
   6927           {
   6928           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
   6929           terminator = CHAR_RIGHT_PARENTHESIS;
   6930           goto NAMED_REF_OR_RECURSE;
   6931           }
   6932         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
   6933           {
   6934           *errorcodeptr = ERR41;
   6935           goto FAILED;
   6936           }
   6937         /* Fall through to handle (?P< as (?< is handled */
   6938 
   6939 
   6940         /* ------------------------------------------------------------ */
   6941         DEFINE_NAME:    /* Come here from (?< handling */
   6942         case CHAR_APOSTROPHE:
   6943         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
   6944           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
   6945         name = ++ptr;
   6946         if (IS_DIGIT(*ptr))
   6947           {
   6948           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
   6949           goto FAILED;
   6950           }
   6951         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
   6952         namelen = (int)(ptr - name);
   6953 
   6954         /* In the pre-compile phase, do a syntax check, remember the longest
   6955         name, and then remember the group in a vector, expanding it if
   6956         necessary. Duplicates for the same number are skipped; other duplicates
   6957         are checked for validity. In the actual compile, there is nothing to
   6958         do. */
   6959 
   6960         if (lengthptr != NULL)
   6961           {
   6962           named_group *ng;
   6963           pcre_uint32 number = cd->bracount + 1;
   6964 
   6965           if (*ptr != (pcre_uchar)terminator)
   6966             {
   6967             *errorcodeptr = ERR42;
   6968             goto FAILED;
   6969             }
   6970 
   6971           if (cd->names_found >= MAX_NAME_COUNT)
   6972             {
   6973             *errorcodeptr = ERR49;
   6974             goto FAILED;
   6975             }
   6976 
   6977           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
   6978             {
   6979             cd->name_entry_size = namelen + IMM2_SIZE + 1;
   6980             if (namelen > MAX_NAME_SIZE)
   6981               {
   6982               *errorcodeptr = ERR48;
   6983               goto FAILED;
   6984               }
   6985             }
   6986 
   6987           /* Scan the list to check for duplicates. For duplicate names, if the
   6988           number is the same, break the loop, which causes the name to be
   6989           discarded; otherwise, if DUPNAMES is not set, give an error.
   6990           If it is set, allow the name with a different number, but continue
   6991           scanning in case this is a duplicate with the same number. For
   6992           non-duplicate names, give an error if the number is duplicated. */
   6993 
   6994           ng = cd->named_groups;
   6995           for (i = 0; i < cd->names_found; i++, ng++)
   6996             {
   6997             if (namelen == ng->length &&
   6998                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
   6999               {
   7000               if (ng->number == number) break;
   7001               if ((options & PCRE_DUPNAMES) == 0)
   7002                 {
   7003                 *errorcodeptr = ERR43;
   7004                 goto FAILED;
   7005                 }
   7006               cd->dupnames = TRUE;  /* Duplicate names exist */
   7007               }
   7008             else if (ng->number == number)
   7009               {
   7010               *errorcodeptr = ERR65;
   7011               goto FAILED;
   7012               }
   7013             }
   7014 
   7015           if (i >= cd->names_found)     /* Not a duplicate with same number */
   7016             {
   7017             /* Increase the list size if necessary */
   7018 
   7019             if (cd->names_found >= cd->named_group_list_size)
   7020               {
   7021               int newsize = cd->named_group_list_size * 2;
   7022               named_group *newspace = (PUBL(malloc))
   7023                 (newsize * sizeof(named_group));
   7024 
   7025               if (newspace == NULL)
   7026                 {
   7027                 *errorcodeptr = ERR21;
   7028                 goto FAILED;
   7029                 }
   7030 
   7031               memcpy(newspace, cd->named_groups,
   7032                 cd->named_group_list_size * sizeof(named_group));
   7033               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
   7034                 (PUBL(free))((void *)cd->named_groups);
   7035               cd->named_groups = newspace;
   7036               cd->named_group_list_size = newsize;
   7037               }
   7038 
   7039             cd->named_groups[cd->names_found].name = name;
   7040             cd->named_groups[cd->names_found].length = namelen;
   7041             cd->named_groups[cd->names_found].number = number;
   7042             cd->names_found++;
   7043             }
   7044           }
   7045 
   7046         ptr++;                    /* Move past > or ' in both passes. */
   7047         goto NUMBERED_GROUP;
   7048 
   7049 
   7050         /* ------------------------------------------------------------ */
   7051         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
   7052         terminator = CHAR_RIGHT_PARENTHESIS;
   7053         is_recurse = TRUE;
   7054         /* Fall through */
   7055 
   7056         /* We come here from the Python syntax above that handles both
   7057         references (?P=name) and recursion (?P>name), as well as falling
   7058         through from the Perl recursion syntax (?&name). We also come here from
   7059         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
   7060         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
   7061 
   7062         NAMED_REF_OR_RECURSE:
   7063         name = ++ptr;
   7064         if (IS_DIGIT(*ptr))
   7065           {
   7066           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
   7067           goto FAILED;
   7068           }
   7069         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
   7070         namelen = (int)(ptr - name);
   7071 
   7072         /* In the pre-compile phase, do a syntax check. We used to just set
   7073         a dummy reference number, because it was not used in the first pass.
   7074         However, with the change of recursive back references to be atomic,
   7075         we have to look for the number so that this state can be identified, as
   7076         otherwise the incorrect length is computed. If it's not a backwards
   7077         reference, the dummy number will do. */
   7078 
   7079         if (lengthptr != NULL)
   7080           {
   7081           named_group *ng;
   7082 
   7083           if (namelen == 0)
   7084             {
   7085             *errorcodeptr = ERR62;
   7086             goto FAILED;
   7087             }
   7088           if (*ptr != (pcre_uchar)terminator)
   7089             {
   7090             *errorcodeptr = ERR42;
   7091             goto FAILED;
   7092             }
   7093           if (namelen > MAX_NAME_SIZE)
   7094             {
   7095             *errorcodeptr = ERR48;
   7096             goto FAILED;
   7097             }
   7098 
   7099           /* The name table does not exist in the first pass; instead we must
   7100           scan the list of names encountered so far in order to get the
   7101           number. If the name is not found, set the value to 0 for a forward
   7102           reference. */
   7103 
   7104           ng = cd->named_groups;
   7105           for (i = 0; i < cd->names_found; i++, ng++)
   7106             {
   7107             if (namelen == ng->length &&
   7108                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
   7109               break;
   7110             }
   7111           recno = (i < cd->names_found)? ng->number : 0;
   7112 
   7113           /* Count named back references. */
   7114 
   7115           if (!is_recurse) cd->namedrefcount++;
   7116 
   7117           /* If duplicate names are permitted, we have to allow for a named
   7118           reference to a duplicated name (this cannot be determined until the
   7119           second pass). This needs an extra 16-bit data item. */
   7120 
   7121           if ((options & PCRE_DUPNAMES) != 0) *lengthptr += IMM2_SIZE;
   7122           }
   7123 
   7124         /* In the real compile, search the name table. We check the name
   7125         first, and then check that we have reached the end of the name in the
   7126         table. That way, if the name is longer than any in the table, the
   7127         comparison will fail without reading beyond the table entry. */
   7128 
   7129         else
   7130           {
   7131           slot = cd->name_table;
   7132           for (i = 0; i < cd->names_found; i++)
   7133             {
   7134             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
   7135                 slot[IMM2_SIZE+namelen] == 0)
   7136               break;
   7137             slot += cd->name_entry_size;
   7138             }
   7139 
   7140           if (i < cd->names_found)
   7141             {
   7142             recno = GET2(slot, 0);
   7143             }
   7144           else
   7145             {
   7146             *errorcodeptr = ERR15;
   7147             goto FAILED;
   7148             }
   7149           }
   7150 
   7151         /* In both phases, for recursions, we can now go to the code that
   7152         handles numerical recursion. */
   7153 
   7154         if (is_recurse) goto HANDLE_RECURSION;
   7155 
   7156         /* In the second pass we must see if the name is duplicated. If so, we
   7157         generate a different opcode. */
   7158 
   7159         if (lengthptr == NULL && cd->dupnames)
   7160           {
   7161           int count = 1;
   7162           unsigned int index = i;
   7163           pcre_uchar *cslot = slot + cd->name_entry_size;
   7164 
   7165           for (i++; i < cd->names_found; i++)
   7166             {
   7167             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
   7168 
   7169 
   7170             count++;
   7171             cslot += cd->name_entry_size;
   7172             }
   7173 
   7174           if (count > 1)
   7175             {
   7176             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   7177             previous = code;
   7178             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
   7179             PUT2INC(code, 0, index);
   7180             PUT2INC(code, 0, count);
   7181 
   7182             /* Process each potentially referenced group. */
   7183 
   7184             for (; slot < cslot; slot += cd->name_entry_size)
   7185               {
   7186               open_capitem *oc;
   7187               recno = GET2(slot, 0);
   7188               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
   7189               if (recno > cd->top_backref) cd->top_backref = recno;
   7190 
   7191               /* Check to see if this back reference is recursive, that is, it
   7192               is inside the group that it references. A flag is set so that the
   7193               group can be made atomic. */
   7194 
   7195               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
   7196                 {
   7197                 if (oc->number == recno)
   7198                   {
   7199                   oc->flag = TRUE;
   7200                   break;
   7201                   }
   7202                 }
   7203               }
   7204 
   7205             continue;  /* End of back ref handling */
   7206             }
   7207           }
   7208 
   7209         /* First pass, or a non-duplicated name. */
   7210 
   7211         goto HANDLE_REFERENCE;
   7212 
   7213 
   7214         /* ------------------------------------------------------------ */
   7215         case CHAR_R:              /* Recursion */
   7216         ptr++;                    /* Same as (?0)      */
   7217         /* Fall through */
   7218 
   7219 
   7220         /* ------------------------------------------------------------ */
   7221         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
   7222         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
   7223         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
   7224           {
   7225           const pcre_uchar *called;
   7226           terminator = CHAR_RIGHT_PARENTHESIS;
   7227 
   7228           /* Come here from the \g<...> and \g'...' code (Oniguruma
   7229           compatibility). However, the syntax has been checked to ensure that
   7230           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
   7231           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
   7232           ever be taken. */
   7233 
   7234           HANDLE_NUMERICAL_RECURSION:
   7235 
   7236           if ((refsign = *ptr) == CHAR_PLUS)
   7237             {
   7238             ptr++;
   7239             if (!IS_DIGIT(*ptr))
   7240               {
   7241               *errorcodeptr = ERR63;
   7242               goto FAILED;
   7243               }
   7244             }
   7245           else if (refsign == CHAR_MINUS)
   7246             {
   7247             if (!IS_DIGIT(ptr[1]))
   7248               goto OTHER_CHAR_AFTER_QUERY;
   7249             ptr++;
   7250             }
   7251 
   7252           recno = 0;
   7253           while(IS_DIGIT(*ptr))
   7254             recno = recno * 10 + *ptr++ - CHAR_0;
   7255 
   7256           if (*ptr != (pcre_uchar)terminator)
   7257             {
   7258             *errorcodeptr = ERR29;
   7259             goto FAILED;
   7260             }
   7261 
   7262           if (refsign == CHAR_MINUS)
   7263             {
   7264             if (recno == 0)
   7265               {
   7266               *errorcodeptr = ERR58;
   7267               goto FAILED;
   7268               }
   7269             recno = cd->bracount - recno + 1;
   7270             if (recno <= 0)
   7271               {
   7272               *errorcodeptr = ERR15;
   7273               goto FAILED;
   7274               }
   7275             }
   7276           else if (refsign == CHAR_PLUS)
   7277             {
   7278             if (recno == 0)
   7279               {
   7280               *errorcodeptr = ERR58;
   7281               goto FAILED;
   7282               }
   7283             recno += cd->bracount;
   7284             }
   7285 
   7286           /* Come here from code above that handles a named recursion */
   7287 
   7288           HANDLE_RECURSION:
   7289 
   7290           previous = code;
   7291           called = cd->start_code;
   7292 
   7293           /* When we are actually compiling, find the bracket that is being
   7294           referenced. Temporarily end the regex in case it doesn't exist before
   7295           this point. If we end up with a forward reference, first check that
   7296           the bracket does occur later so we can give the error (and position)
   7297           now. Then remember this forward reference in the workspace so it can
   7298           be filled in at the end. */
   7299 
   7300           if (lengthptr == NULL)
   7301             {
   7302             *code = OP_END;
   7303             if (recno != 0)
   7304               called = PRIV(find_bracket)(cd->start_code, utf, recno);
   7305 
   7306             /* Forward reference */
   7307 
   7308             if (called == NULL)
   7309               {
   7310               if (recno > cd->final_bracount)
   7311                 {
   7312                 *errorcodeptr = ERR15;
   7313                 goto FAILED;
   7314                 }
   7315 
   7316               /* Fudge the value of "called" so that when it is inserted as an
   7317               offset below, what is actually inserted is the reference number
   7318               of the group. Then remember the forward reference. */
   7319 
   7320               called = cd->start_code + recno;
   7321               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
   7322                   WORK_SIZE_SAFETY_MARGIN)
   7323                 {
   7324                 *errorcodeptr = expand_workspace(cd);
   7325                 if (*errorcodeptr != 0) goto FAILED;
   7326                 }
   7327               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
   7328               }
   7329 
   7330             /* If not a forward reference, and the subpattern is still open,
   7331             this is a recursive call. We check to see if this is a left
   7332             recursion that could loop for ever, and diagnose that case. We
   7333             must not, however, do this check if we are in a conditional
   7334             subpattern because the condition might be testing for recursion in
   7335             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
   7336             Forever loops are also detected at runtime, so those that occur in
   7337             conditional subpatterns will be picked up then. */
   7338 
   7339             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
   7340                      could_be_empty(called, code, bcptr, utf, cd))
   7341               {
   7342               *errorcodeptr = ERR40;
   7343               goto FAILED;
   7344               }
   7345             }
   7346 
   7347           /* Insert the recursion/subroutine item. It does not have a set first
   7348           character (relevant if it is repeated, because it will then be
   7349           wrapped with ONCE brackets). */
   7350 
   7351           *code = OP_RECURSE;
   7352           PUT(code, 1, (int)(called - cd->start_code));
   7353           code += 1 + LINK_SIZE;
   7354           groupsetfirstchar = FALSE;
   7355           }
   7356 
   7357         /* Can't determine a first byte now */
   7358 
   7359         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   7360         continue;
   7361 
   7362 
   7363         /* ------------------------------------------------------------ */
   7364         default:              /* Other characters: check option setting */
   7365         OTHER_CHAR_AFTER_QUERY:
   7366         set = unset = 0;
   7367         optset = &set;
   7368 
   7369         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
   7370           {
   7371           switch (*ptr++)
   7372             {
   7373             case CHAR_MINUS: optset = &unset; break;
   7374 
   7375             case CHAR_J:    /* Record that it changed in the external options */
   7376             *optset |= PCRE_DUPNAMES;
   7377             cd->external_flags |= PCRE_JCHANGED;
   7378             break;
   7379 
   7380             case CHAR_i: *optset |= PCRE_CASELESS; break;
   7381             case CHAR_m: *optset |= PCRE_MULTILINE; break;
   7382             case CHAR_s: *optset |= PCRE_DOTALL; break;
   7383             case CHAR_x: *optset |= PCRE_EXTENDED; break;
   7384             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
   7385             case CHAR_X: *optset |= PCRE_EXTRA; break;
   7386 
   7387             default:  *errorcodeptr = ERR12;
   7388                       ptr--;    /* Correct the offset */
   7389                       goto FAILED;
   7390             }
   7391           }
   7392 
   7393         /* Set up the changed option bits, but don't change anything yet. */
   7394 
   7395         newoptions = (options | set) & (~unset);
   7396 
   7397         /* If the options ended with ')' this is not the start of a nested
   7398         group with option changes, so the options change at this level. If this
   7399         item is right at the start of the pattern, the options can be
   7400         abstracted and made external in the pre-compile phase, and ignored in
   7401         the compile phase. This can be helpful when matching -- for instance in
   7402         caseless checking of required bytes.
   7403 
   7404         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
   7405         definitely *not* at the start of the pattern because something has been
   7406         compiled. In the pre-compile phase, however, the code pointer can have
   7407         that value after the start, because it gets reset as code is discarded
   7408         during the pre-compile. However, this can happen only at top level - if
   7409         we are within parentheses, the starting BRA will still be present. At
   7410         any parenthesis level, the length value can be used to test if anything
   7411         has been compiled at that level. Thus, a test for both these conditions
   7412         is necessary to ensure we correctly detect the start of the pattern in
   7413         both phases.
   7414 
   7415         If we are not at the pattern start, reset the greedy defaults and the
   7416         case value for firstchar and reqchar. */
   7417 
   7418         if (*ptr == CHAR_RIGHT_PARENTHESIS)
   7419           {
   7420           if (code == cd->start_code + 1 + LINK_SIZE &&
   7421                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
   7422             {
   7423             cd->external_options = newoptions;
   7424             }
   7425           else
   7426             {
   7427             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
   7428             greedy_non_default = greedy_default ^ 1;
   7429             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
   7430             }
   7431 
   7432           /* Change options at this level, and pass them back for use
   7433           in subsequent branches. */
   7434 
   7435           *optionsptr = options = newoptions;
   7436           previous = NULL;       /* This item can't be repeated */
   7437           continue;              /* It is complete */
   7438           }
   7439 
   7440         /* If the options ended with ':' we are heading into a nested group
   7441         with possible change of options. Such groups are non-capturing and are
   7442         not assertions of any kind. All we need to do is skip over the ':';
   7443         the newoptions value is handled below. */
   7444 
   7445         bravalue = OP_BRA;
   7446         ptr++;
   7447         }     /* End of switch for character following (? */
   7448       }       /* End of (? handling */
   7449 
   7450     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
   7451     is set, all unadorned brackets become non-capturing and behave like (?:...)
   7452     brackets. */
   7453 
   7454     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
   7455       {
   7456       bravalue = OP_BRA;
   7457       }
   7458 
   7459     /* Else we have a capturing group. */
   7460 
   7461     else
   7462       {
   7463       NUMBERED_GROUP:
   7464       cd->bracount += 1;
   7465       PUT2(code, 1+LINK_SIZE, cd->bracount);
   7466       skipbytes = IMM2_SIZE;
   7467       }
   7468 
   7469     /* Process nested bracketed regex. First check for parentheses nested too
   7470     deeply. */
   7471 
   7472     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
   7473       {
   7474       *errorcodeptr = ERR82;
   7475       goto FAILED;
   7476       }
   7477 
   7478     /* Assertions used not to be repeatable, but this was changed for Perl
   7479     compatibility, so all kinds can now be repeated. We copy code into a
   7480     non-register variable (tempcode) in order to be able to pass its address
   7481     because some compilers complain otherwise. */
   7482 
   7483     previous = code;                      /* For handling repetition */
   7484     *code = bravalue;
   7485     tempcode = code;
   7486     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
   7487     tempbracount = cd->bracount;          /* Save value before bracket */
   7488     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
   7489 
   7490     if (!compile_regex(
   7491          newoptions,                      /* The complete new option state */
   7492          &tempcode,                       /* Where to put code (updated) */
   7493          &ptr,                            /* Input pointer (updated) */
   7494          errorcodeptr,                    /* Where to put an error message */
   7495          (bravalue == OP_ASSERTBACK ||
   7496           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
   7497          reset_bracount,                  /* True if (?| group */
   7498          skipbytes,                       /* Skip over bracket number */
   7499          cond_depth +
   7500            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
   7501          &subfirstchar,                   /* For possible first char */
   7502          &subfirstcharflags,
   7503          &subreqchar,                     /* For possible last char */
   7504          &subreqcharflags,
   7505          bcptr,                           /* Current branch chain */
   7506          cd,                              /* Tables block */
   7507          (lengthptr == NULL)? NULL :      /* Actual compile phase */
   7508            &length_prevgroup              /* Pre-compile phase */
   7509          ))
   7510       goto FAILED;
   7511 
   7512     cd->parens_depth -= 1;
   7513 
   7514     /* If this was an atomic group and there are no capturing groups within it,
   7515     generate OP_ONCE_NC instead of OP_ONCE. */
   7516 
   7517     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
   7518       *code = OP_ONCE_NC;
   7519 
   7520     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
   7521       cd->assert_depth -= 1;
   7522 
   7523     /* At the end of compiling, code is still pointing to the start of the
   7524     group, while tempcode has been updated to point past the end of the group.
   7525     The pattern pointer (ptr) is on the bracket.
   7526 
   7527     If this is a conditional bracket, check that there are no more than
   7528     two branches in the group, or just one if it's a DEFINE group. We do this
   7529     in the real compile phase, not in the pre-pass, where the whole group may
   7530     not be available. */
   7531 
   7532     if (bravalue == OP_COND && lengthptr == NULL)
   7533       {
   7534       pcre_uchar *tc = code;
   7535       int condcount = 0;
   7536 
   7537       do {
   7538          condcount++;
   7539          tc += GET(tc,1);
   7540          }
   7541       while (*tc != OP_KET);
   7542 
   7543       /* A DEFINE group is never obeyed inline (the "condition" is always
   7544       false). It must have only one branch. */
   7545 
   7546       if (code[LINK_SIZE+1] == OP_DEF)
   7547         {
   7548         if (condcount > 1)
   7549           {
   7550           *errorcodeptr = ERR54;
   7551           goto FAILED;
   7552           }
   7553         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
   7554         }
   7555 
   7556       /* A "normal" conditional group. If there is just one branch, we must not
   7557       make use of its firstchar or reqchar, because this is equivalent to an
   7558       empty second branch. */
   7559 
   7560       else
   7561         {
   7562         if (condcount > 2)
   7563           {
   7564           *errorcodeptr = ERR27;
   7565           goto FAILED;
   7566           }
   7567         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
   7568         }
   7569       }
   7570 
   7571     /* Error if hit end of pattern */
   7572 
   7573     if (*ptr != CHAR_RIGHT_PARENTHESIS)
   7574       {
   7575       *errorcodeptr = ERR14;
   7576       goto FAILED;
   7577       }
   7578 
   7579     /* In the pre-compile phase, update the length by the length of the group,
   7580     less the brackets at either end. Then reduce the compiled code to just a
   7581     set of non-capturing brackets so that it doesn't use much memory if it is
   7582     duplicated by a quantifier.*/
   7583 
   7584     if (lengthptr != NULL)
   7585       {
   7586       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
   7587         {
   7588         *errorcodeptr = ERR20;
   7589         goto FAILED;
   7590         }
   7591       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
   7592       code++;   /* This already contains bravalue */
   7593       PUTINC(code, 0, 1 + LINK_SIZE);
   7594       *code++ = OP_KET;
   7595       PUTINC(code, 0, 1 + LINK_SIZE);
   7596       break;    /* No need to waste time with special character handling */
   7597       }
   7598 
   7599     /* Otherwise update the main code pointer to the end of the group. */
   7600 
   7601     code = tempcode;
   7602 
   7603     /* For a DEFINE group, required and first character settings are not
   7604     relevant. */
   7605 
   7606     if (bravalue == OP_DEF) break;
   7607 
   7608     /* Handle updating of the required and first characters for other types of
   7609     group. Update for normal brackets of all kinds, and conditions with two
   7610     branches (see code above). If the bracket is followed by a quantifier with
   7611     zero repeat, we have to back off. Hence the definition of zeroreqchar and
   7612     zerofirstchar outside the main loop so that they can be accessed for the
   7613     back off. */
   7614 
   7615     zeroreqchar = reqchar;
   7616     zeroreqcharflags = reqcharflags;
   7617     zerofirstchar = firstchar;
   7618     zerofirstcharflags = firstcharflags;
   7619     groupsetfirstchar = FALSE;
   7620 
   7621     if (bravalue >= OP_ONCE)
   7622       {
   7623       /* If we have not yet set a firstchar in this branch, take it from the
   7624       subpattern, remembering that it was set here so that a repeat of more
   7625       than one can replicate it as reqchar if necessary. If the subpattern has
   7626       no firstchar, set "none" for the whole branch. In both cases, a zero
   7627       repeat forces firstchar to "none". */
   7628 
   7629       if (firstcharflags == REQ_UNSET)
   7630         {
   7631         if (subfirstcharflags >= 0)
   7632           {
   7633           firstchar = subfirstchar;
   7634           firstcharflags = subfirstcharflags;
   7635           groupsetfirstchar = TRUE;
   7636           }
   7637         else firstcharflags = REQ_NONE;
   7638         zerofirstcharflags = REQ_NONE;
   7639         }
   7640 
   7641       /* If firstchar was previously set, convert the subpattern's firstchar
   7642       into reqchar if there wasn't one, using the vary flag that was in
   7643       existence beforehand. */
   7644 
   7645       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
   7646         {
   7647         subreqchar = subfirstchar;
   7648         subreqcharflags = subfirstcharflags | tempreqvary;
   7649         }
   7650 
   7651       /* If the subpattern set a required byte (or set a first byte that isn't
   7652       really the first byte - see above), set it. */
   7653 
   7654       if (subreqcharflags >= 0)
   7655         {
   7656         reqchar = subreqchar;
   7657         reqcharflags = subreqcharflags;
   7658         }
   7659       }
   7660 
   7661     /* For a forward assertion, we take the reqchar, if set. This can be
   7662     helpful if the pattern that follows the assertion doesn't set a different
   7663     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
   7664     for an assertion, however because it leads to incorrect effect for patterns
   7665     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
   7666     of a firstchar. This is overcome by a scan at the end if there's no
   7667     firstchar, looking for an asserted first char. */
   7668 
   7669     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
   7670       {
   7671       reqchar = subreqchar;
   7672       reqcharflags = subreqcharflags;
   7673       }
   7674     break;     /* End of processing '(' */
   7675 
   7676 
   7677     /* ===================================================================*/
   7678     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
   7679     are arranged to be the negation of the corresponding OP_values in the
   7680     default case when PCRE_UCP is not set. For the back references, the values
   7681     are negative the reference number. Only back references and those types
   7682     that consume a character may be repeated. We can test for values between
   7683     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
   7684     ever created. */
   7685 
   7686     case CHAR_BACKSLASH:
   7687     tempptr = ptr;
   7688     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
   7689     if (*errorcodeptr != 0) goto FAILED;
   7690 
   7691     if (escape == 0)                  /* The escape coded a single character */
   7692       c = ec;
   7693     else
   7694       {
   7695       if (escape == ESC_Q)            /* Handle start of quoted string */
   7696         {
   7697         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   7698           ptr += 2;               /* avoid empty string */
   7699             else inescq = TRUE;
   7700         continue;
   7701         }
   7702 
   7703       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
   7704 
   7705       /* For metasequences that actually match a character, we disable the
   7706       setting of a first character if it hasn't already been set. */
   7707 
   7708       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
   7709         firstcharflags = REQ_NONE;
   7710 
   7711       /* Set values to reset to if this is followed by a zero repeat. */
   7712 
   7713       zerofirstchar = firstchar;
   7714       zerofirstcharflags = firstcharflags;
   7715       zeroreqchar = reqchar;
   7716       zeroreqcharflags = reqcharflags;
   7717 
   7718       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
   7719       is a subroutine call by number (Oniguruma syntax). In fact, the value
   7720       ESC_g is returned only for these cases. So we don't need to check for <
   7721       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
   7722       -n, and for the Perl syntax \g{name} the result is ESC_k (as
   7723       that is a synonym for a named back reference). */
   7724 
   7725       if (escape == ESC_g)
   7726         {
   7727         const pcre_uchar *p;
   7728         pcre_uint32 cf;
   7729 
   7730         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
   7731         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
   7732           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
   7733 
   7734         /* These two statements stop the compiler from warning about possibly
   7735         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
   7736         fact, because we do the check for a number below, the paths that
   7737         would actually be in error are never taken. */
   7738 
   7739         skipbytes = 0;
   7740         reset_bracount = FALSE;
   7741 
   7742         /* If it's not a signed or unsigned number, treat it as a name. */
   7743 
   7744         cf = ptr[1];
   7745         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
   7746           {
   7747           is_recurse = TRUE;
   7748           goto NAMED_REF_OR_RECURSE;
   7749           }
   7750 
   7751         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
   7752         or a digit. */
   7753 
   7754         p = ptr + 2;
   7755         while (IS_DIGIT(*p)) p++;
   7756         if (*p != (pcre_uchar)terminator)
   7757           {
   7758           *errorcodeptr = ERR57;
   7759           break;
   7760           }
   7761         ptr++;
   7762         goto HANDLE_NUMERICAL_RECURSION;
   7763         }
   7764 
   7765       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
   7766       We also support \k{name} (.NET syntax).  */
   7767 
   7768       if (escape == ESC_k)
   7769         {
   7770         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
   7771           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
   7772           {
   7773           *errorcodeptr = ERR69;
   7774           break;
   7775           }
   7776         is_recurse = FALSE;
   7777         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
   7778           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
   7779           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
   7780         goto NAMED_REF_OR_RECURSE;
   7781         }
   7782 
   7783       /* Back references are handled specially; must disable firstchar if
   7784       not set to cope with cases like (?=(\w+))\1: which would otherwise set
   7785       ':' later. */
   7786 
   7787       if (escape < 0)
   7788         {
   7789         open_capitem *oc;
   7790         recno = -escape;
   7791 
   7792         /* Come here from named backref handling when the reference is to a
   7793         single group (i.e. not to a duplicated name). */
   7794 
   7795         HANDLE_REFERENCE:
   7796         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   7797         previous = code;
   7798         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
   7799         PUT2INC(code, 0, recno);
   7800         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
   7801         if (recno > cd->top_backref) cd->top_backref = recno;
   7802 
   7803         /* Check to see if this back reference is recursive, that is, it
   7804         is inside the group that it references. A flag is set so that the
   7805         group can be made atomic. */
   7806 
   7807         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
   7808           {
   7809           if (oc->number == recno)
   7810             {
   7811             oc->flag = TRUE;
   7812             break;
   7813             }
   7814           }
   7815         }
   7816 
   7817       /* So are Unicode property matches, if supported. */
   7818 
   7819 #ifdef SUPPORT_UCP
   7820       else if (escape == ESC_P || escape == ESC_p)
   7821         {
   7822         BOOL negated;
   7823         unsigned int ptype = 0, pdata = 0;
   7824         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
   7825           goto FAILED;
   7826         previous = code;
   7827         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
   7828         *code++ = ptype;
   7829         *code++ = pdata;
   7830         }
   7831 #else
   7832 
   7833       /* If Unicode properties are not supported, \X, \P, and \p are not
   7834       allowed. */
   7835 
   7836       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
   7837         {
   7838         *errorcodeptr = ERR45;
   7839         goto FAILED;
   7840         }
   7841 #endif
   7842 
   7843       /* For the rest (including \X when Unicode properties are supported), we
   7844       can obtain the OP value by negating the escape value in the default
   7845       situation when PCRE_UCP is not set. When it *is* set, we substitute
   7846       Unicode property tests. Note that \b and \B do a one-character
   7847       lookbehind, and \A also behaves as if it does. */
   7848 
   7849       else
   7850         {
   7851         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
   7852              cd->max_lookbehind == 0)
   7853           cd->max_lookbehind = 1;
   7854 #ifdef SUPPORT_UCP
   7855         if (escape >= ESC_DU && escape <= ESC_wu)
   7856           {
   7857           nestptr = ptr + 1;                   /* Where to resume */
   7858           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
   7859           }
   7860         else
   7861 #endif
   7862         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
   7863         so that it works in DFA mode and in lookbehinds. */
   7864 
   7865           {
   7866           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
   7867           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
   7868           }
   7869         }
   7870       continue;
   7871       }
   7872 
   7873     /* We have a data character whose value is in c. In UTF-8 mode it may have
   7874     a value > 127. We set its representation in the length/buffer, and then
   7875     handle it as a data character. */
   7876 
   7877 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   7878     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
   7879       mclength = PRIV(ord2utf)(c, mcbuffer);
   7880     else
   7881 #endif
   7882 
   7883      {
   7884      mcbuffer[0] = c;
   7885      mclength = 1;
   7886      }
   7887     goto ONE_CHAR;
   7888 
   7889 
   7890     /* ===================================================================*/
   7891     /* Handle a literal character. It is guaranteed not to be whitespace or #
   7892     when the extended flag is set. If we are in a UTF mode, it may be a
   7893     multi-unit literal character. */
   7894 
   7895     default:
   7896     NORMAL_CHAR:
   7897     mclength = 1;
   7898     mcbuffer[0] = c;
   7899 
   7900 #ifdef SUPPORT_UTF
   7901     if (utf && HAS_EXTRALEN(c))
   7902       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
   7903 #endif
   7904 
   7905     /* At this point we have the character's bytes in mcbuffer, and the length
   7906     in mclength. When not in UTF-8 mode, the length is always 1. */
   7907 
   7908     ONE_CHAR:
   7909     previous = code;
   7910 
   7911     /* For caseless UTF-8 mode when UCP support is available, check whether
   7912     this character has more than one other case. If so, generate a special
   7913     OP_PROP item instead of OP_CHARI. */
   7914 
   7915 #ifdef SUPPORT_UCP
   7916     if (utf && (options & PCRE_CASELESS) != 0)
   7917       {
   7918       GETCHAR(c, mcbuffer);
   7919       if ((c = UCD_CASESET(c)) != 0)
   7920         {
   7921         *code++ = OP_PROP;
   7922         *code++ = PT_CLIST;
   7923         *code++ = c;
   7924         if (firstcharflags == REQ_UNSET)
   7925           firstcharflags = zerofirstcharflags = REQ_NONE;
   7926         break;
   7927         }
   7928       }
   7929 #endif
   7930 
   7931     /* Caseful matches, or not one of the multicase characters. */
   7932 
   7933     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
   7934     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
   7935 
   7936     /* Remember if \r or \n were seen */
   7937 
   7938     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
   7939       cd->external_flags |= PCRE_HASCRORLF;
   7940 
   7941     /* Set the first and required bytes appropriately. If no previous first
   7942     byte, set it from this character, but revert to none on a zero repeat.
   7943     Otherwise, leave the firstchar value alone, and don't change it on a zero
   7944     repeat. */
   7945 
   7946     if (firstcharflags == REQ_UNSET)
   7947       {
   7948       zerofirstcharflags = REQ_NONE;
   7949       zeroreqchar = reqchar;
   7950       zeroreqcharflags = reqcharflags;
   7951 
   7952       /* If the character is more than one byte long, we can set firstchar
   7953       only if it is not to be matched caselessly. */
   7954 
   7955       if (mclength == 1 || req_caseopt == 0)
   7956         {
   7957         firstchar = mcbuffer[0] | req_caseopt;
   7958         firstchar = mcbuffer[0];
   7959         firstcharflags = req_caseopt;
   7960 
   7961         if (mclength != 1)
   7962           {
   7963           reqchar = code[-1];
   7964           reqcharflags = cd->req_varyopt;
   7965           }
   7966         }
   7967       else firstcharflags = reqcharflags = REQ_NONE;
   7968       }
   7969 
   7970     /* firstchar was previously set; we can set reqchar only if the length is
   7971     1 or the matching is caseful. */
   7972 
   7973     else
   7974       {
   7975       zerofirstchar = firstchar;
   7976       zerofirstcharflags = firstcharflags;
   7977       zeroreqchar = reqchar;
   7978       zeroreqcharflags = reqcharflags;
   7979       if (mclength == 1 || req_caseopt == 0)
   7980         {
   7981         reqchar = code[-1];
   7982         reqcharflags = req_caseopt | cd->req_varyopt;
   7983         }
   7984       }
   7985 
   7986     break;            /* End of literal character handling */
   7987     }
   7988   }                   /* end of big loop */
   7989 
   7990 
   7991 /* Control never reaches here by falling through, only by a goto for all the
   7992 error states. Pass back the position in the pattern so that it can be displayed
   7993 to the user for diagnosing the error. */
   7994 
   7995 FAILED:
   7996 *ptrptr = ptr;
   7997 return FALSE;
   7998 }
   7999 
   8000 
   8001 
   8002 /*************************************************
   8003 *     Compile sequence of alternatives           *
   8004 *************************************************/
   8005 
   8006 /* On entry, ptr is pointing past the bracket character, but on return it
   8007 points to the closing bracket, or vertical bar, or end of string. The code
   8008 variable is pointing at the byte into which the BRA operator has been stored.
   8009 This function is used during the pre-compile phase when we are trying to find
   8010 out the amount of memory needed, as well as during the real compile phase. The
   8011 value of lengthptr distinguishes the two phases.
   8012 
   8013 Arguments:
   8014   options           option bits, including any changes for this subpattern
   8015   codeptr           -> the address of the current code pointer
   8016   ptrptr            -> the address of the current pattern pointer
   8017   errorcodeptr      -> pointer to error code variable
   8018   lookbehind        TRUE if this is a lookbehind assertion
   8019   reset_bracount    TRUE to reset the count for each branch
   8020   skipbytes         skip this many bytes at start (for brackets and OP_COND)
   8021   cond_depth        depth of nesting for conditional subpatterns
   8022   firstcharptr      place to put the first required character
   8023   firstcharflagsptr place to put the first character flags, or a negative number
   8024   reqcharptr        place to put the last required character
   8025   reqcharflagsptr   place to put the last required character flags, or a negative number
   8026   bcptr             pointer to the chain of currently open branches
   8027   cd                points to the data block with tables pointers etc.
   8028   lengthptr         NULL during the real compile phase
   8029                     points to length accumulator during pre-compile phase
   8030 
   8031 Returns:            TRUE on success
   8032 */
   8033 
   8034 static BOOL
   8035 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
   8036   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
   8037   int cond_depth,
   8038   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
   8039   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
   8040   branch_chain *bcptr, compile_data *cd, int *lengthptr)
   8041 {
   8042 const pcre_uchar *ptr = *ptrptr;
   8043 pcre_uchar *code = *codeptr;
   8044 pcre_uchar *last_branch = code;
   8045 pcre_uchar *start_bracket = code;
   8046 pcre_uchar *reverse_count = NULL;
   8047 open_capitem capitem;
   8048 int capnumber = 0;
   8049 pcre_uint32 firstchar, reqchar;
   8050 pcre_int32 firstcharflags, reqcharflags;
   8051 pcre_uint32 branchfirstchar, branchreqchar;
   8052 pcre_int32 branchfirstcharflags, branchreqcharflags;
   8053 int length;
   8054 unsigned int orig_bracount;
   8055 unsigned int max_bracount;
   8056 branch_chain bc;
   8057 
   8058 /* If set, call the external function that checks for stack availability. */
   8059 
   8060 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
   8061   {
   8062   *errorcodeptr= ERR85;
   8063   return FALSE;
   8064   }
   8065 
   8066 /* Miscellaneous initialization */
   8067 
   8068 bc.outer = bcptr;
   8069 bc.current_branch = code;
   8070 
   8071 firstchar = reqchar = 0;
   8072 firstcharflags = reqcharflags = REQ_UNSET;
   8073 
   8074 /* Accumulate the length for use in the pre-compile phase. Start with the
   8075 length of the BRA and KET and any extra bytes that are required at the
   8076 beginning. We accumulate in a local variable to save frequent testing of
   8077 lenthptr for NULL. We cannot do this by looking at the value of code at the
   8078 start and end of each alternative, because compiled items are discarded during
   8079 the pre-compile phase so that the work space is not exceeded. */
   8080 
   8081 length = 2 + 2*LINK_SIZE + skipbytes;
   8082 
   8083 /* WARNING: If the above line is changed for any reason, you must also change
   8084 the code that abstracts option settings at the start of the pattern and makes
   8085 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
   8086 pre-compile phase to find out whether anything has yet been compiled or not. */
   8087 
   8088 /* If this is a capturing subpattern, add to the chain of open capturing items
   8089 so that we can detect them if (*ACCEPT) is encountered. This is also used to
   8090 detect groups that contain recursive back references to themselves. Note that
   8091 only OP_CBRA need be tested here; changing this opcode to one of its variants,
   8092 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
   8093 
   8094 if (*code == OP_CBRA)
   8095   {
   8096   capnumber = GET2(code, 1 + LINK_SIZE);
   8097   capitem.number = capnumber;
   8098   capitem.next = cd->open_caps;
   8099   capitem.flag = FALSE;
   8100   cd->open_caps = &capitem;
   8101   }
   8102 
   8103 /* Offset is set zero to mark that this bracket is still open */
   8104 
   8105 PUT(code, 1, 0);
   8106 code += 1 + LINK_SIZE + skipbytes;
   8107 
   8108 /* Loop for each alternative branch */
   8109 
   8110 orig_bracount = max_bracount = cd->bracount;
   8111 for (;;)
   8112   {
   8113   /* For a (?| group, reset the capturing bracket count so that each branch
   8114   uses the same numbers. */
   8115 
   8116   if (reset_bracount) cd->bracount = orig_bracount;
   8117 
   8118   /* Set up dummy OP_REVERSE if lookbehind assertion */
   8119 
   8120   if (lookbehind)
   8121     {
   8122     *code++ = OP_REVERSE;
   8123     reverse_count = code;
   8124     PUTINC(code, 0, 0);
   8125     length += 1 + LINK_SIZE;
   8126     }
   8127 
   8128   /* Now compile the branch; in the pre-compile phase its length gets added
   8129   into the length. */
   8130 
   8131   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
   8132         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
   8133         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
   8134     {
   8135     *ptrptr = ptr;
   8136     return FALSE;
   8137     }
   8138 
   8139   /* Keep the highest bracket count in case (?| was used and some branch
   8140   has fewer than the rest. */
   8141 
   8142   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
   8143 
   8144   /* In the real compile phase, there is some post-processing to be done. */
   8145 
   8146   if (lengthptr == NULL)
   8147     {
   8148     /* If this is the first branch, the firstchar and reqchar values for the
   8149     branch become the values for the regex. */
   8150 
   8151     if (*last_branch != OP_ALT)
   8152       {
   8153       firstchar = branchfirstchar;
   8154       firstcharflags = branchfirstcharflags;
   8155       reqchar = branchreqchar;
   8156       reqcharflags = branchreqcharflags;
   8157       }
   8158 
   8159     /* If this is not the first branch, the first char and reqchar have to
   8160     match the values from all the previous branches, except that if the
   8161     previous value for reqchar didn't have REQ_VARY set, it can still match,
   8162     and we set REQ_VARY for the regex. */
   8163 
   8164     else
   8165       {
   8166       /* If we previously had a firstchar, but it doesn't match the new branch,
   8167       we have to abandon the firstchar for the regex, but if there was
   8168       previously no reqchar, it takes on the value of the old firstchar. */
   8169 
   8170       if (firstcharflags >= 0 &&
   8171           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
   8172         {
   8173         if (reqcharflags < 0)
   8174           {
   8175           reqchar = firstchar;
   8176           reqcharflags = firstcharflags;
   8177           }
   8178         firstcharflags = REQ_NONE;
   8179         }
   8180 
   8181       /* If we (now or from before) have no firstchar, a firstchar from the
   8182       branch becomes a reqchar if there isn't a branch reqchar. */
   8183 
   8184       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
   8185         {
   8186         branchreqchar = branchfirstchar;
   8187         branchreqcharflags = branchfirstcharflags;
   8188         }
   8189 
   8190       /* Now ensure that the reqchars match */
   8191 
   8192       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
   8193           reqchar != branchreqchar)
   8194         reqcharflags = REQ_NONE;
   8195       else
   8196         {
   8197         reqchar = branchreqchar;
   8198         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
   8199         }
   8200       }
   8201 
   8202     /* If lookbehind, check that this branch matches a fixed-length string, and
   8203     put the length into the OP_REVERSE item. Temporarily mark the end of the
   8204     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
   8205     because there may be forward references that we can't check here. Set a
   8206     flag to cause another lookbehind check at the end. Why not do it all at the
   8207     end? Because common, erroneous checks are picked up here and the offset of
   8208     the problem can be shown. */
   8209 
   8210     if (lookbehind)
   8211       {
   8212       int fixed_length;
   8213       *code = OP_END;
   8214       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
   8215         FALSE, cd);
   8216       DPRINTF(("fixed length = %d\n", fixed_length));
   8217       if (fixed_length == -3)
   8218         {
   8219         cd->check_lookbehind = TRUE;
   8220         }
   8221       else if (fixed_length < 0)
   8222         {
   8223         *errorcodeptr = (fixed_length == -2)? ERR36 :
   8224                         (fixed_length == -4)? ERR70: ERR25;
   8225         *ptrptr = ptr;
   8226         return FALSE;
   8227         }
   8228       else
   8229         {
   8230         if (fixed_length > cd->max_lookbehind)
   8231           cd->max_lookbehind = fixed_length;
   8232         PUT(reverse_count, 0, fixed_length);
   8233         }
   8234       }
   8235     }
   8236 
   8237   /* Reached end of expression, either ')' or end of pattern. In the real
   8238   compile phase, go back through the alternative branches and reverse the chain
   8239   of offsets, with the field in the BRA item now becoming an offset to the
   8240   first alternative. If there are no alternatives, it points to the end of the
   8241   group. The length in the terminating ket is always the length of the whole
   8242   bracketed item. Return leaving the pointer at the terminating char. */
   8243 
   8244   if (*ptr != CHAR_VERTICAL_LINE)
   8245     {
   8246     if (lengthptr == NULL)
   8247       {
   8248       int branch_length = (int)(code - last_branch);
   8249       do
   8250         {
   8251         int prev_length = GET(last_branch, 1);
   8252         PUT(last_branch, 1, branch_length);
   8253         branch_length = prev_length;
   8254         last_branch -= branch_length;
   8255         }
   8256       while (branch_length > 0);
   8257       }
   8258 
   8259     /* Fill in the ket */
   8260 
   8261     *code = OP_KET;
   8262     PUT(code, 1, (int)(code - start_bracket));
   8263     code += 1 + LINK_SIZE;
   8264 
   8265     /* If it was a capturing subpattern, check to see if it contained any
   8266     recursive back references. If so, we must wrap it in atomic brackets.
   8267     Because we are moving code along, we must ensure that any pending recursive
   8268     references are updated. In any event, remove the block from the chain. */
   8269 
   8270     if (capnumber > 0)
   8271       {
   8272       if (cd->open_caps->flag)
   8273         {
   8274         *code = OP_END;
   8275         adjust_recurse(start_bracket, 1 + LINK_SIZE,
   8276           (options & PCRE_UTF8) != 0, cd, cd->hwm);
   8277         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
   8278           IN_UCHARS(code - start_bracket));
   8279         *start_bracket = OP_ONCE;
   8280         code += 1 + LINK_SIZE;
   8281         PUT(start_bracket, 1, (int)(code - start_bracket));
   8282         *code = OP_KET;
   8283         PUT(code, 1, (int)(code - start_bracket));
   8284         code += 1 + LINK_SIZE;
   8285         length += 2 + 2*LINK_SIZE;
   8286         }
   8287       cd->open_caps = cd->open_caps->next;
   8288       }
   8289 
   8290     /* Retain the highest bracket number, in case resetting was used. */
   8291 
   8292     cd->bracount = max_bracount;
   8293 
   8294     /* Set values to pass back */
   8295 
   8296     *codeptr = code;
   8297     *ptrptr = ptr;
   8298     *firstcharptr = firstchar;
   8299     *firstcharflagsptr = firstcharflags;
   8300     *reqcharptr = reqchar;
   8301     *reqcharflagsptr = reqcharflags;
   8302     if (lengthptr != NULL)
   8303       {
   8304       if (OFLOW_MAX - *lengthptr < length)
   8305         {
   8306         *errorcodeptr = ERR20;
   8307         return FALSE;
   8308         }
   8309       *lengthptr += length;
   8310       }
   8311     return TRUE;
   8312     }
   8313 
   8314   /* Another branch follows. In the pre-compile phase, we can move the code
   8315   pointer back to where it was for the start of the first branch. (That is,
   8316   pretend that each branch is the only one.)
   8317 
   8318   In the real compile phase, insert an ALT node. Its length field points back
   8319   to the previous branch while the bracket remains open. At the end the chain
   8320   is reversed. It's done like this so that the start of the bracket has a
   8321   zero offset until it is closed, making it possible to detect recursion. */
   8322 
   8323   if (lengthptr != NULL)
   8324     {
   8325     code = *codeptr + 1 + LINK_SIZE + skipbytes;
   8326     length += 1 + LINK_SIZE;
   8327     }
   8328   else
   8329     {
   8330     *code = OP_ALT;
   8331     PUT(code, 1, (int)(code - last_branch));
   8332     bc.current_branch = last_branch = code;
   8333     code += 1 + LINK_SIZE;
   8334     }
   8335 
   8336   ptr++;
   8337   }
   8338 /* Control never reaches here */
   8339 }
   8340 
   8341 
   8342 
   8343 
   8344 /*************************************************
   8345 *          Check for anchored expression         *
   8346 *************************************************/
   8347 
   8348 /* Try to find out if this is an anchored regular expression. Consider each
   8349 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
   8350 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
   8351 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
   8352 be found, because ^ generates OP_CIRCM in that mode.
   8353 
   8354 We can also consider a regex to be anchored if OP_SOM starts all its branches.
   8355 This is the code for \G, which means "match at start of match position, taking
   8356 into account the match offset".
   8357 
   8358 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
   8359 because that will try the rest of the pattern at all possible matching points,
   8360 so there is no point trying again.... er ....
   8361 
   8362 .... except when the .* appears inside capturing parentheses, and there is a
   8363 subsequent back reference to those parentheses. We haven't enough information
   8364 to catch that case precisely.
   8365 
   8366 At first, the best we could do was to detect when .* was in capturing brackets
   8367 and the highest back reference was greater than or equal to that level.
   8368 However, by keeping a bitmap of the first 31 back references, we can catch some
   8369 of the more common cases more precisely.
   8370 
   8371 ... A second exception is when the .* appears inside an atomic group, because
   8372 this prevents the number of characters it matches from being adjusted.
   8373 
   8374 Arguments:
   8375   code           points to start of expression (the bracket)
   8376   bracket_map    a bitmap of which brackets we are inside while testing; this
   8377                   handles up to substring 31; after that we just have to take
   8378                   the less precise approach
   8379   cd             points to the compile data block
   8380   atomcount      atomic group level
   8381 
   8382 Returns:     TRUE or FALSE
   8383 */
   8384 
   8385 static BOOL
   8386 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
   8387   compile_data *cd, int atomcount)
   8388 {
   8389 do {
   8390    const pcre_uchar *scode = first_significant_code(
   8391      code + PRIV(OP_lengths)[*code], FALSE);
   8392    register int op = *scode;
   8393 
   8394    /* Non-capturing brackets */
   8395 
   8396    if (op == OP_BRA  || op == OP_BRAPOS ||
   8397        op == OP_SBRA || op == OP_SBRAPOS)
   8398      {
   8399      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
   8400      }
   8401 
   8402    /* Capturing brackets */
   8403 
   8404    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
   8405             op == OP_SCBRA || op == OP_SCBRAPOS)
   8406      {
   8407      int n = GET2(scode, 1+LINK_SIZE);
   8408      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
   8409      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
   8410      }
   8411 
   8412    /* Positive forward assertions and conditions */
   8413 
   8414    else if (op == OP_ASSERT || op == OP_COND)
   8415      {
   8416      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
   8417      }
   8418 
   8419    /* Atomic groups */
   8420 
   8421    else if (op == OP_ONCE || op == OP_ONCE_NC)
   8422      {
   8423      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
   8424        return FALSE;
   8425      }
   8426 
   8427    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
   8428    it isn't in brackets that are or may be referenced or inside an atomic
   8429    group. */
   8430 
   8431    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
   8432              op == OP_TYPEPOSSTAR))
   8433      {
   8434      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
   8435          atomcount > 0 || cd->had_pruneorskip)
   8436        return FALSE;
   8437      }
   8438 
   8439    /* Check for explicit anchoring */
   8440 
   8441    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
   8442 
   8443    code += GET(code, 1);
   8444    }
   8445 while (*code == OP_ALT);   /* Loop for each alternative */
   8446 return TRUE;
   8447 }
   8448 
   8449 
   8450 
   8451 /*************************************************
   8452 *         Check for starting with ^ or .*        *
   8453 *************************************************/
   8454 
   8455 /* This is called to find out if every branch starts with ^ or .* so that
   8456 "first char" processing can be done to speed things up in multiline
   8457 matching and for non-DOTALL patterns that start with .* (which must start at
   8458 the beginning or after \n). As in the case of is_anchored() (see above), we
   8459 have to take account of back references to capturing brackets that contain .*
   8460 because in that case we can't make the assumption. Also, the appearance of .*
   8461 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
   8462 count, because once again the assumption no longer holds.
   8463 
   8464 Arguments:
   8465   code           points to start of expression (the bracket)
   8466   bracket_map    a bitmap of which brackets we are inside while testing; this
   8467                   handles up to substring 31; after that we just have to take
   8468                   the less precise approach
   8469   cd             points to the compile data
   8470   atomcount      atomic group level
   8471 
   8472 Returns:         TRUE or FALSE
   8473 */
   8474 
   8475 static BOOL
   8476 is_startline(const pcre_uchar *code, unsigned int bracket_map,
   8477   compile_data *cd, int atomcount)
   8478 {
   8479 do {
   8480    const pcre_uchar *scode = first_significant_code(
   8481      code + PRIV(OP_lengths)[*code], FALSE);
   8482    register int op = *scode;
   8483 
   8484    /* If we are at the start of a conditional assertion group, *both* the
   8485    conditional assertion *and* what follows the condition must satisfy the test
   8486    for start of line. Other kinds of condition fail. Note that there may be an
   8487    auto-callout at the start of a condition. */
   8488 
   8489    if (op == OP_COND)
   8490      {
   8491      scode += 1 + LINK_SIZE;
   8492      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
   8493      switch (*scode)
   8494        {
   8495        case OP_CREF:
   8496        case OP_DNCREF:
   8497        case OP_RREF:
   8498        case OP_DNRREF:
   8499        case OP_DEF:
   8500        return FALSE;
   8501 
   8502        default:     /* Assertion */
   8503        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
   8504        do scode += GET(scode, 1); while (*scode == OP_ALT);
   8505        scode += 1 + LINK_SIZE;
   8506        break;
   8507        }
   8508      scode = first_significant_code(scode, FALSE);
   8509      op = *scode;
   8510      }
   8511 
   8512    /* Non-capturing brackets */
   8513 
   8514    if (op == OP_BRA  || op == OP_BRAPOS ||
   8515        op == OP_SBRA || op == OP_SBRAPOS)
   8516      {
   8517      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
   8518      }
   8519 
   8520    /* Capturing brackets */
   8521 
   8522    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
   8523             op == OP_SCBRA || op == OP_SCBRAPOS)
   8524      {
   8525      int n = GET2(scode, 1+LINK_SIZE);
   8526      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
   8527      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
   8528      }
   8529 
   8530    /* Positive forward assertions */
   8531 
   8532    else if (op == OP_ASSERT)
   8533      {
   8534      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
   8535      }
   8536 
   8537    /* Atomic brackets */
   8538 
   8539    else if (op == OP_ONCE || op == OP_ONCE_NC)
   8540      {
   8541      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
   8542      }
   8543 
   8544    /* .* means "start at start or after \n" if it isn't in atomic brackets or
   8545    brackets that may be referenced, as long as the pattern does not contain
   8546    *PRUNE or *SKIP, because these break the feature. Consider, for example,
   8547    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
   8548    start of a line. */
   8549 
   8550    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
   8551      {
   8552      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
   8553          atomcount > 0 || cd->had_pruneorskip)
   8554        return FALSE;
   8555      }
   8556 
   8557    /* Check for explicit circumflex; anything else gives a FALSE result. Note
   8558    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
   8559    because the number of characters matched by .* cannot be adjusted inside
   8560    them. */
   8561 
   8562    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
   8563 
   8564    /* Move on to the next alternative */
   8565 
   8566    code += GET(code, 1);
   8567    }
   8568 while (*code == OP_ALT);  /* Loop for each alternative */
   8569 return TRUE;
   8570 }
   8571 
   8572 
   8573 
   8574 /*************************************************
   8575 *       Check for asserted fixed first char      *
   8576 *************************************************/
   8577 
   8578 /* During compilation, the "first char" settings from forward assertions are
   8579 discarded, because they can cause conflicts with actual literals that follow.
   8580 However, if we end up without a first char setting for an unanchored pattern,
   8581 it is worth scanning the regex to see if there is an initial asserted first
   8582 char. If all branches start with the same asserted char, or with a
   8583 non-conditional bracket all of whose alternatives start with the same asserted
   8584 char (recurse ad lib), then we return that char, with the flags set to zero or
   8585 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
   8586 
   8587 Arguments:
   8588   code       points to start of expression (the bracket)
   8589   flags      points to the first char flags, or to REQ_NONE
   8590   inassert   TRUE if in an assertion
   8591 
   8592 Returns:     the fixed first char, or 0 with REQ_NONE in flags
   8593 */
   8594 
   8595 static pcre_uint32
   8596 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
   8597   BOOL inassert)
   8598 {
   8599 register pcre_uint32 c = 0;
   8600 int cflags = REQ_NONE;
   8601 
   8602 *flags = REQ_NONE;
   8603 do {
   8604    pcre_uint32 d;
   8605    int dflags;
   8606    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
   8607              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
   8608    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
   8609      TRUE);
   8610    register pcre_uchar op = *scode;
   8611 
   8612    switch(op)
   8613      {
   8614      default:
   8615      return 0;
   8616 
   8617      case OP_BRA:
   8618      case OP_BRAPOS:
   8619      case OP_CBRA:
   8620      case OP_SCBRA:
   8621      case OP_CBRAPOS:
   8622      case OP_SCBRAPOS:
   8623      case OP_ASSERT:
   8624      case OP_ONCE:
   8625      case OP_ONCE_NC:
   8626      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
   8627      if (dflags < 0)
   8628        return 0;
   8629      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
   8630      break;
   8631 
   8632      case OP_EXACT:
   8633      scode += IMM2_SIZE;
   8634      /* Fall through */
   8635 
   8636      case OP_CHAR:
   8637      case OP_PLUS:
   8638      case OP_MINPLUS:
   8639      case OP_POSPLUS:
   8640      if (!inassert) return 0;
   8641      if (cflags < 0) { c = scode[1]; cflags = 0; }
   8642        else if (c != scode[1]) return 0;
   8643      break;
   8644 
   8645      case OP_EXACTI:
   8646      scode += IMM2_SIZE;
   8647      /* Fall through */
   8648 
   8649      case OP_CHARI:
   8650      case OP_PLUSI:
   8651      case OP_MINPLUSI:
   8652      case OP_POSPLUSI:
   8653      if (!inassert) return 0;
   8654      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
   8655        else if (c != scode[1]) return 0;
   8656      break;
   8657      }
   8658 
   8659    code += GET(code, 1);
   8660    }
   8661 while (*code == OP_ALT);
   8662 
   8663 *flags = cflags;
   8664 return c;
   8665 }
   8666 
   8667 
   8668 
   8669 /*************************************************
   8670 *     Add an entry to the name/number table      *
   8671 *************************************************/
   8672 
   8673 /* This function is called between compiling passes to add an entry to the
   8674 name/number table, maintaining alphabetical order. Checking for permitted
   8675 and forbidden duplicates has already been done.
   8676 
   8677 Arguments:
   8678   cd           the compile data block
   8679   name         the name to add
   8680   length       the length of the name
   8681   groupno      the group number
   8682 
   8683 Returns:       nothing
   8684 */
   8685 
   8686 static void
   8687 add_name(compile_data *cd, const pcre_uchar *name, int length,
   8688   unsigned int groupno)
   8689 {
   8690 int i;
   8691 pcre_uchar *slot = cd->name_table;
   8692 
   8693 for (i = 0; i < cd->names_found; i++)
   8694   {
   8695   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
   8696   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
   8697     crc = -1; /* Current name is a substring */
   8698 
   8699   /* Make space in the table and break the loop for an earlier name. For a
   8700   duplicate or later name, carry on. We do this for duplicates so that in the
   8701   simple case (when ?(| is not used) they are in order of their numbers. In all
   8702   cases they are in the order in which they appear in the pattern. */
   8703 
   8704   if (crc < 0)
   8705     {
   8706     memmove(slot + cd->name_entry_size, slot,
   8707       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
   8708     break;
   8709     }
   8710 
   8711   /* Continue the loop for a later or duplicate name */
   8712 
   8713   slot += cd->name_entry_size;
   8714   }
   8715 
   8716 PUT2(slot, 0, groupno);
   8717 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
   8718 slot[IMM2_SIZE + length] = 0;
   8719 cd->names_found++;
   8720 }
   8721 
   8722 
   8723 
   8724 /*************************************************
   8725 *        Compile a Regular Expression            *
   8726 *************************************************/
   8727 
   8728 /* This function takes a string and returns a pointer to a block of store
   8729 holding a compiled version of the expression. The original API for this
   8730 function had no error code return variable; it is retained for backwards
   8731 compatibility. The new function is given a new name.
   8732 
   8733 Arguments:
   8734   pattern       the regular expression
   8735   options       various option bits
   8736   errorcodeptr  pointer to error code variable (pcre_compile2() only)
   8737                   can be NULL if you don't want a code value
   8738   errorptr      pointer to pointer to error text
   8739   erroroffset   ptr offset in pattern where error was detected
   8740   tables        pointer to character tables or NULL
   8741 
   8742 Returns:        pointer to compiled data block, or NULL on error,
   8743                 with errorptr and erroroffset set
   8744 */
   8745 
   8746 #if defined COMPILE_PCRE8
   8747 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
   8748 pcre_compile(const char *pattern, int options, const char **errorptr,
   8749   int *erroroffset, const unsigned char *tables)
   8750 #elif defined COMPILE_PCRE16
   8751 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
   8752 pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
   8753   int *erroroffset, const unsigned char *tables)
   8754 #elif defined COMPILE_PCRE32
   8755 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
   8756 pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
   8757   int *erroroffset, const unsigned char *tables)
   8758 #endif
   8759 {
   8760 #if defined COMPILE_PCRE8
   8761 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
   8762 #elif defined COMPILE_PCRE16
   8763 return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
   8764 #elif defined COMPILE_PCRE32
   8765 return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
   8766 #endif
   8767 }
   8768 
   8769 
   8770 #if defined COMPILE_PCRE8
   8771 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
   8772 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
   8773   const char **errorptr, int *erroroffset, const unsigned char *tables)
   8774 #elif defined COMPILE_PCRE16
   8775 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
   8776 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
   8777   const char **errorptr, int *erroroffset, const unsigned char *tables)
   8778 #elif defined COMPILE_PCRE32
   8779 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
   8780 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
   8781   const char **errorptr, int *erroroffset, const unsigned char *tables)
   8782 #endif
   8783 {
   8784 REAL_PCRE *re;
   8785 int length = 1;  /* For final END opcode */
   8786 pcre_int32 firstcharflags, reqcharflags;
   8787 pcre_uint32 firstchar, reqchar;
   8788 pcre_uint32 limit_match = PCRE_UINT32_MAX;
   8789 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
   8790 int newline;
   8791 int errorcode = 0;
   8792 int skipatstart = 0;
   8793 BOOL utf;
   8794 BOOL never_utf = FALSE;
   8795 size_t size;
   8796 pcre_uchar *code;
   8797 const pcre_uchar *codestart;
   8798 const pcre_uchar *ptr;
   8799 compile_data compile_block;
   8800 compile_data *cd = &compile_block;
   8801 
   8802 /* This space is used for "compiling" into during the first phase, when we are
   8803 computing the amount of memory that is needed. Compiled items are thrown away
   8804 as soon as possible, so that a fairly large buffer should be sufficient for
   8805 this purpose. The same space is used in the second phase for remembering where
   8806 to fill in forward references to subpatterns. That may overflow, in which case
   8807 new memory is obtained from malloc(). */
   8808 
   8809 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
   8810 
   8811 /* This vector is used for remembering name groups during the pre-compile. In a
   8812 similar way to cworkspace, it can be expanded using malloc() if necessary. */
   8813 
   8814 named_group named_groups[NAMED_GROUP_LIST_SIZE];
   8815 
   8816 /* Set this early so that early errors get offset 0. */
   8817 
   8818 ptr = (const pcre_uchar *)pattern;
   8819 
   8820 /* We can't pass back an error message if errorptr is NULL; I guess the best we
   8821 can do is just return NULL, but we can set a code value if there is a code
   8822 pointer. */
   8823 
   8824 if (errorptr == NULL)
   8825   {
   8826   if (errorcodeptr != NULL) *errorcodeptr = 99;
   8827   return NULL;
   8828   }
   8829 
   8830 *errorptr = NULL;
   8831 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
   8832 
   8833 /* However, we can give a message for this error */
   8834 
   8835 if (erroroffset == NULL)
   8836   {
   8837   errorcode = ERR16;
   8838   goto PCRE_EARLY_ERROR_RETURN2;
   8839   }
   8840 
   8841 *erroroffset = 0;
   8842 
   8843 /* Set up pointers to the individual character tables */
   8844 
   8845 if (tables == NULL) tables = PRIV(default_tables);
   8846 cd->lcc = tables + lcc_offset;
   8847 cd->fcc = tables + fcc_offset;
   8848 cd->cbits = tables + cbits_offset;
   8849 cd->ctypes = tables + ctypes_offset;
   8850 
   8851 /* Check that all undefined public option bits are zero */
   8852 
   8853 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
   8854   {
   8855   errorcode = ERR17;
   8856   goto PCRE_EARLY_ERROR_RETURN;
   8857   }
   8858 
   8859 /* If PCRE_NEVER_UTF is set, remember it. */
   8860 
   8861 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
   8862 
   8863 /* Check for global one-time settings at the start of the pattern, and remember
   8864 the offset for later. */
   8865 
   8866 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
   8867 
   8868 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
   8869        ptr[skipatstart+1] == CHAR_ASTERISK)
   8870   {
   8871   int newnl = 0;
   8872   int newbsr = 0;
   8873 
   8874 /* For completeness and backward compatibility, (*UTFn) is supported in the
   8875 relevant libraries, but (*UTF) is generic and always supported. Note that
   8876 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
   8877 
   8878 #ifdef COMPILE_PCRE8
   8879   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
   8880     { skipatstart += 7; options |= PCRE_UTF8; continue; }
   8881 #endif
   8882 #ifdef COMPILE_PCRE16
   8883   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
   8884     { skipatstart += 8; options |= PCRE_UTF16; continue; }
   8885 #endif
   8886 #ifdef COMPILE_PCRE32
   8887   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
   8888     { skipatstart += 8; options |= PCRE_UTF32; continue; }
   8889 #endif
   8890 
   8891   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
   8892     { skipatstart += 6; options |= PCRE_UTF8; continue; }
   8893   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
   8894     { skipatstart += 6; options |= PCRE_UCP; continue; }
   8895   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
   8896     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
   8897   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
   8898     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
   8899 
   8900   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
   8901     {
   8902     pcre_uint32 c = 0;
   8903     int p = skipatstart + 14;
   8904     while (isdigit(ptr[p]))
   8905       {
   8906       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
   8907       c = c*10 + ptr[p++] - CHAR_0;
   8908       }
   8909     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
   8910     if (c < limit_match)
   8911       {
   8912       limit_match = c;
   8913       cd->external_flags |= PCRE_MLSET;
   8914       }
   8915     skipatstart = p;
   8916     continue;
   8917     }
   8918 
   8919   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
   8920     {
   8921     pcre_uint32 c = 0;
   8922     int p = skipatstart + 18;
   8923     while (isdigit(ptr[p]))
   8924       {
   8925       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
   8926       c = c*10 + ptr[p++] - CHAR_0;
   8927       }
   8928     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
   8929     if (c < limit_recursion)
   8930       {
   8931       limit_recursion = c;
   8932       cd->external_flags |= PCRE_RLSET;
   8933       }
   8934     skipatstart = p;
   8935     continue;
   8936     }
   8937 
   8938   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
   8939     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
   8940   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
   8941     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
   8942   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
   8943     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
   8944   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
   8945     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
   8946   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
   8947     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
   8948 
   8949   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
   8950     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
   8951   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
   8952     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
   8953 
   8954   if (newnl != 0)
   8955     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
   8956   else if (newbsr != 0)
   8957     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
   8958   else break;
   8959   }
   8960 
   8961 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
   8962 utf = (options & PCRE_UTF8) != 0;
   8963 if (utf && never_utf)
   8964   {
   8965   errorcode = ERR78;
   8966   goto PCRE_EARLY_ERROR_RETURN2;
   8967   }
   8968 
   8969 /* Can't support UTF unless PCRE has been compiled to include the code. The
   8970 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
   8971 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
   8972 not used here. */
   8973 
   8974 #ifdef SUPPORT_UTF
   8975 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
   8976      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
   8977   {
   8978 #if defined COMPILE_PCRE8
   8979   errorcode = ERR44;
   8980 #elif defined COMPILE_PCRE16
   8981   errorcode = ERR74;
   8982 #elif defined COMPILE_PCRE32
   8983   errorcode = ERR77;
   8984 #endif
   8985   goto PCRE_EARLY_ERROR_RETURN2;
   8986   }
   8987 #else
   8988 if (utf)
   8989   {
   8990   errorcode = ERR32;
   8991   goto PCRE_EARLY_ERROR_RETURN;
   8992   }
   8993 #endif
   8994 
   8995 /* Can't support UCP unless PCRE has been compiled to include the code. */
   8996 
   8997 #ifndef SUPPORT_UCP
   8998 if ((options & PCRE_UCP) != 0)
   8999   {
   9000   errorcode = ERR67;
   9001   goto PCRE_EARLY_ERROR_RETURN;
   9002   }
   9003 #endif
   9004 
   9005 /* Check validity of \R options. */
   9006 
   9007 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
   9008      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
   9009   {
   9010   errorcode = ERR56;
   9011   goto PCRE_EARLY_ERROR_RETURN;
   9012   }
   9013 
   9014 /* Handle different types of newline. The three bits give seven cases. The
   9015 current code allows for fixed one- or two-byte sequences, plus "any" and
   9016 "anycrlf". */
   9017 
   9018 switch (options & PCRE_NEWLINE_BITS)
   9019   {
   9020   case 0: newline = NEWLINE; break;   /* Build-time default */
   9021   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
   9022   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
   9023   case PCRE_NEWLINE_CR+
   9024        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
   9025   case PCRE_NEWLINE_ANY: newline = -1; break;
   9026   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
   9027   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
   9028   }
   9029 
   9030 if (newline == -2)
   9031   {
   9032   cd->nltype = NLTYPE_ANYCRLF;
   9033   }
   9034 else if (newline < 0)
   9035   {
   9036   cd->nltype = NLTYPE_ANY;
   9037   }
   9038 else
   9039   {
   9040   cd->nltype = NLTYPE_FIXED;
   9041   if (newline > 255)
   9042     {
   9043     cd->nllen = 2;
   9044     cd->nl[0] = (newline >> 8) & 255;
   9045     cd->nl[1] = newline & 255;
   9046     }
   9047   else
   9048     {
   9049     cd->nllen = 1;
   9050     cd->nl[0] = newline;
   9051     }
   9052   }
   9053 
   9054 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
   9055 references to help in deciding whether (.*) can be treated as anchored or not.
   9056 */
   9057 
   9058 cd->top_backref = 0;
   9059 cd->backref_map = 0;
   9060 
   9061 /* Reflect pattern for debugging output */
   9062 
   9063 DPRINTF(("------------------------------------------------------------------\n"));
   9064 #ifdef PCRE_DEBUG
   9065 print_puchar(stdout, (PCRE_PUCHAR)pattern);
   9066 #endif
   9067 DPRINTF(("\n"));
   9068 
   9069 /* Pretend to compile the pattern while actually just accumulating the length
   9070 of memory required. This behaviour is triggered by passing a non-NULL final
   9071 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
   9072 to compile parts of the pattern into; the compiled code is discarded when it is
   9073 no longer needed, so hopefully this workspace will never overflow, though there
   9074 is a test for its doing so. */
   9075 
   9076 cd->bracount = cd->final_bracount = 0;
   9077 cd->names_found = 0;
   9078 cd->name_entry_size = 0;
   9079 cd->name_table = NULL;
   9080 cd->dupnames = FALSE;
   9081 cd->namedrefcount = 0;
   9082 cd->start_code = cworkspace;
   9083 cd->hwm = cworkspace;
   9084 cd->start_workspace = cworkspace;
   9085 cd->workspace_size = COMPILE_WORK_SIZE;
   9086 cd->named_groups = named_groups;
   9087 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
   9088 cd->start_pattern = (const pcre_uchar *)pattern;
   9089 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
   9090 cd->req_varyopt = 0;
   9091 cd->parens_depth = 0;
   9092 cd->assert_depth = 0;
   9093 cd->max_lookbehind = 0;
   9094 cd->external_options = options;
   9095 cd->open_caps = NULL;
   9096 
   9097 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
   9098 don't need to look at the result of the function here. The initial options have
   9099 been put into the cd block so that they can be changed if an option setting is
   9100 found within the regex right at the beginning. Bringing initial option settings
   9101 outside can help speed up starting point checks. */
   9102 
   9103 ptr += skipatstart;
   9104 code = cworkspace;
   9105 *code = OP_BRA;
   9106 
   9107 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
   9108   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
   9109   cd, &length);
   9110 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
   9111 
   9112 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
   9113   (int)(cd->hwm - cworkspace)));
   9114 
   9115 if (length > MAX_PATTERN_SIZE)
   9116   {
   9117   errorcode = ERR20;
   9118   goto PCRE_EARLY_ERROR_RETURN;
   9119   }
   9120 
   9121 /* If there are groups with duplicate names and there are also references by
   9122 name, we must allow for the possibility of named references to duplicated
   9123 groups. These require an extra data item each. */
   9124 
   9125 if (cd->dupnames && cd->namedrefcount > 0)
   9126   length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
   9127 
   9128 /* Compute the size of the data block for storing the compiled pattern. Integer
   9129 overflow should no longer be possible because nowadays we limit the maximum
   9130 value of cd->names_found and cd->name_entry_size. */
   9131 
   9132 size = sizeof(REAL_PCRE) +
   9133   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
   9134 
   9135 /* Get the memory. */
   9136 
   9137 re = (REAL_PCRE *)(PUBL(malloc))(size);
   9138 if (re == NULL)
   9139   {
   9140   errorcode = ERR21;
   9141   goto PCRE_EARLY_ERROR_RETURN;
   9142   }
   9143 
   9144 /* Put in the magic number, and save the sizes, initial options, internal
   9145 flags, and character table pointer. NULL is used for the default character
   9146 tables. The nullpad field is at the end; it's there to help in the case when a
   9147 regex compiled on a system with 4-byte pointers is run on another with 8-byte
   9148 pointers. */
   9149 
   9150 re->magic_number = MAGIC_NUMBER;
   9151 re->size = (int)size;
   9152 re->options = cd->external_options;
   9153 re->flags = cd->external_flags;
   9154 re->limit_match = limit_match;
   9155 re->limit_recursion = limit_recursion;
   9156 re->first_char = 0;
   9157 re->req_char = 0;
   9158 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
   9159 re->name_entry_size = cd->name_entry_size;
   9160 re->name_count = cd->names_found;
   9161 re->ref_count = 0;
   9162 re->tables = (tables == PRIV(default_tables))? NULL : tables;
   9163 re->nullpad = NULL;
   9164 #ifdef COMPILE_PCRE32
   9165 re->dummy = 0;
   9166 #else
   9167 re->dummy1 = re->dummy2 = re->dummy3 = 0;
   9168 #endif
   9169 
   9170 /* The starting points of the name/number translation table and of the code are
   9171 passed around in the compile data block. The start/end pattern and initial
   9172 options are already set from the pre-compile phase, as is the name_entry_size
   9173 field. Reset the bracket count and the names_found field. Also reset the hwm
   9174 field; this time it's used for remembering forward references to subpatterns.
   9175 */
   9176 
   9177 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
   9178 cd->parens_depth = 0;
   9179 cd->assert_depth = 0;
   9180 cd->bracount = 0;
   9181 cd->max_lookbehind = 0;
   9182 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
   9183 codestart = cd->name_table + re->name_entry_size * re->name_count;
   9184 cd->start_code = codestart;
   9185 cd->hwm = (pcre_uchar *)(cd->start_workspace);
   9186 cd->req_varyopt = 0;
   9187 cd->had_accept = FALSE;
   9188 cd->had_pruneorskip = FALSE;
   9189 cd->check_lookbehind = FALSE;
   9190 cd->open_caps = NULL;
   9191 
   9192 /* If any named groups were found, create the name/number table from the list
   9193 created in the first pass. */
   9194 
   9195 if (cd->names_found > 0)
   9196   {
   9197   int i = cd->names_found;
   9198   named_group *ng = cd->named_groups;
   9199   cd->names_found = 0;
   9200   for (; i > 0; i--, ng++)
   9201     add_name(cd, ng->name, ng->length, ng->number);
   9202   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
   9203     (PUBL(free))((void *)cd->named_groups);
   9204   }
   9205 
   9206 /* Set up a starting, non-extracting bracket, then compile the expression. On
   9207 error, errorcode will be set non-zero, so we don't need to look at the result
   9208 of the function here. */
   9209 
   9210 ptr = (const pcre_uchar *)pattern + skipatstart;
   9211 code = (pcre_uchar *)codestart;
   9212 *code = OP_BRA;
   9213 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
   9214   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
   9215 re->top_bracket = cd->bracount;
   9216 re->top_backref = cd->top_backref;
   9217 re->max_lookbehind = cd->max_lookbehind;
   9218 re->flags = cd->external_flags | PCRE_MODE;
   9219 
   9220 if (cd->had_accept)
   9221   {
   9222   reqchar = 0;              /* Must disable after (*ACCEPT) */
   9223   reqcharflags = REQ_NONE;
   9224   }
   9225 
   9226 /* If not reached end of pattern on success, there's an excess bracket. */
   9227 
   9228 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
   9229 
   9230 /* Fill in the terminating state and check for disastrous overflow, but
   9231 if debugging, leave the test till after things are printed out. */
   9232 
   9233 *code++ = OP_END;
   9234 
   9235 #ifndef PCRE_DEBUG
   9236 if (code - codestart > length) errorcode = ERR23;
   9237 #endif
   9238 
   9239 #ifdef SUPPORT_VALGRIND
   9240 /* If the estimated length exceeds the really used length, mark the extra
   9241 allocated memory as unaddressable, so that any out-of-bound reads can be
   9242 detected. */
   9243 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
   9244 #endif
   9245 
   9246 /* Fill in any forward references that are required. There may be repeated
   9247 references; optimize for them, as searching a large regex takes time. */
   9248 
   9249 if (cd->hwm > cd->start_workspace)
   9250   {
   9251   int prev_recno = -1;
   9252   const pcre_uchar *groupptr = NULL;
   9253   while (errorcode == 0 && cd->hwm > cd->start_workspace)
   9254     {
   9255     int offset, recno;
   9256     cd->hwm -= LINK_SIZE;
   9257     offset = GET(cd->hwm, 0);
   9258     recno = GET(codestart, offset);
   9259     if (recno != prev_recno)
   9260       {
   9261       groupptr = PRIV(find_bracket)(codestart, utf, recno);
   9262       prev_recno = recno;
   9263       }
   9264     if (groupptr == NULL) errorcode = ERR53;
   9265       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
   9266     }
   9267   }
   9268 
   9269 /* If the workspace had to be expanded, free the new memory. Set the pointer to
   9270 NULL to indicate that forward references have been filled in. */
   9271 
   9272 if (cd->workspace_size > COMPILE_WORK_SIZE)
   9273   (PUBL(free))((void *)cd->start_workspace);
   9274 cd->start_workspace = NULL;
   9275 
   9276 /* Give an error if there's back reference to a non-existent capturing
   9277 subpattern. */
   9278 
   9279 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
   9280 
   9281 /* Unless disabled, check whether any single character iterators can be
   9282 auto-possessified. The function overwrites the appropriate opcode values, so
   9283 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
   9284 used in this code because at least one compiler gives a warning about loss of
   9285 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
   9286 function call. */
   9287 
   9288 if ((options & PCRE_NO_AUTO_POSSESS) == 0)
   9289   {
   9290   pcre_uchar *temp = (pcre_uchar *)codestart;
   9291   auto_possessify(temp, utf, cd);
   9292   }
   9293 
   9294 /* If there were any lookbehind assertions that contained OP_RECURSE
   9295 (recursions or subroutine calls), a flag is set for them to be checked here,
   9296 because they may contain forward references. Actual recursions cannot be fixed
   9297 length, but subroutine calls can. It is done like this so that those without
   9298 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
   9299 exceptional ones forgo this. We scan the pattern to check that they are fixed
   9300 length, and set their lengths. */
   9301 
   9302 if (cd->check_lookbehind)
   9303   {
   9304   pcre_uchar *cc = (pcre_uchar *)codestart;
   9305 
   9306   /* Loop, searching for OP_REVERSE items, and process those that do not have
   9307   their length set. (Actually, it will also re-process any that have a length
   9308   of zero, but that is a pathological case, and it does no harm.) When we find
   9309   one, we temporarily terminate the branch it is in while we scan it. */
   9310 
   9311   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
   9312        cc != NULL;
   9313        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
   9314     {
   9315     if (GET(cc, 1) == 0)
   9316       {
   9317       int fixed_length;
   9318       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
   9319       int end_op = *be;
   9320       *be = OP_END;
   9321       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
   9322         cd);
   9323       *be = end_op;
   9324       DPRINTF(("fixed length = %d\n", fixed_length));
   9325       if (fixed_length < 0)
   9326         {
   9327         errorcode = (fixed_length == -2)? ERR36 :
   9328                     (fixed_length == -4)? ERR70 : ERR25;
   9329         break;
   9330         }
   9331       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
   9332       PUT(cc, 1, fixed_length);
   9333       }
   9334     cc += 1 + LINK_SIZE;
   9335     }
   9336   }
   9337 
   9338 /* Failed to compile, or error while post-processing */
   9339 
   9340 if (errorcode != 0)
   9341   {
   9342   (PUBL(free))(re);
   9343   PCRE_EARLY_ERROR_RETURN:
   9344   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
   9345   PCRE_EARLY_ERROR_RETURN2:
   9346   *errorptr = find_error_text(errorcode);
   9347   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
   9348   return NULL;
   9349   }
   9350 
   9351 /* If the anchored option was not passed, set the flag if we can determine that
   9352 the pattern is anchored by virtue of ^ characters or \A or anything else, such
   9353 as starting with non-atomic .* when DOTALL is set and there are no occurrences
   9354 of *PRUNE or *SKIP.
   9355 
   9356 Otherwise, if we know what the first byte has to be, save it, because that
   9357 speeds up unanchored matches no end. If not, see if we can set the
   9358 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
   9359 start with ^. and also when all branches start with non-atomic .* for
   9360 non-DOTALL matches when *PRUNE and SKIP are not present. */
   9361 
   9362 if ((re->options & PCRE_ANCHORED) == 0)
   9363   {
   9364   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
   9365   else
   9366     {
   9367     if (firstcharflags < 0)
   9368       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
   9369     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
   9370       {
   9371 #if defined COMPILE_PCRE8
   9372       re->first_char = firstchar & 0xff;
   9373 #elif defined COMPILE_PCRE16
   9374       re->first_char = firstchar & 0xffff;
   9375 #elif defined COMPILE_PCRE32
   9376       re->first_char = firstchar;
   9377 #endif
   9378       if ((firstcharflags & REQ_CASELESS) != 0)
   9379         {
   9380 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
   9381         /* We ignore non-ASCII first chars in 8 bit mode. */
   9382         if (utf)
   9383           {
   9384           if (re->first_char < 128)
   9385             {
   9386             if (cd->fcc[re->first_char] != re->first_char)
   9387               re->flags |= PCRE_FCH_CASELESS;
   9388             }
   9389           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
   9390             re->flags |= PCRE_FCH_CASELESS;
   9391           }
   9392         else
   9393 #endif
   9394         if (MAX_255(re->first_char)
   9395             && cd->fcc[re->first_char] != re->first_char)
   9396           re->flags |= PCRE_FCH_CASELESS;
   9397         }
   9398 
   9399       re->flags |= PCRE_FIRSTSET;
   9400       }
   9401 
   9402     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
   9403     }
   9404   }
   9405 
   9406 /* For an anchored pattern, we use the "required byte" only if it follows a
   9407 variable length item in the regex. Remove the caseless flag for non-caseable
   9408 bytes. */
   9409 
   9410 if (reqcharflags >= 0 &&
   9411      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
   9412   {
   9413 #if defined COMPILE_PCRE8
   9414   re->req_char = reqchar & 0xff;
   9415 #elif defined COMPILE_PCRE16
   9416   re->req_char = reqchar & 0xffff;
   9417 #elif defined COMPILE_PCRE32
   9418   re->req_char = reqchar;
   9419 #endif
   9420   if ((reqcharflags & REQ_CASELESS) != 0)
   9421     {
   9422 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
   9423     /* We ignore non-ASCII required chars in 8 bit mode. */
   9424     if (utf)
   9425       {
   9426       if (re->req_char < 128)
   9427         {
   9428         if (cd->fcc[re->req_char] != re->req_char)
   9429           re->flags |= PCRE_RCH_CASELESS;
   9430         }
   9431       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
   9432         re->flags |= PCRE_RCH_CASELESS;
   9433       }
   9434     else
   9435 #endif
   9436     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
   9437       re->flags |= PCRE_RCH_CASELESS;
   9438     }
   9439 
   9440   re->flags |= PCRE_REQCHSET;
   9441   }
   9442 
   9443 /* Print out the compiled data if debugging is enabled. This is never the
   9444 case when building a production library. */
   9445 
   9446 #ifdef PCRE_DEBUG
   9447 printf("Length = %d top_bracket = %d top_backref = %d\n",
   9448   length, re->top_bracket, re->top_backref);
   9449 
   9450 printf("Options=%08x\n", re->options);
   9451 
   9452 if ((re->flags & PCRE_FIRSTSET) != 0)
   9453   {
   9454   pcre_uchar ch = re->first_char;
   9455   const char *caseless =
   9456     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
   9457   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
   9458     else printf("First char = \\x%02x%s\n", ch, caseless);
   9459   }
   9460 
   9461 if ((re->flags & PCRE_REQCHSET) != 0)
   9462   {
   9463   pcre_uchar ch = re->req_char;
   9464   const char *caseless =
   9465     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
   9466   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
   9467     else printf("Req char = \\x%02x%s\n", ch, caseless);
   9468   }
   9469 
   9470 #if defined COMPILE_PCRE8
   9471 pcre_printint((pcre *)re, stdout, TRUE);
   9472 #elif defined COMPILE_PCRE16
   9473 pcre16_printint((pcre *)re, stdout, TRUE);
   9474 #elif defined COMPILE_PCRE32
   9475 pcre32_printint((pcre *)re, stdout, TRUE);
   9476 #endif
   9477 
   9478 /* This check is done here in the debugging case so that the code that
   9479 was compiled can be seen. */
   9480 
   9481 if (code - codestart > length)
   9482   {
   9483   (PUBL(free))(re);
   9484   *errorptr = find_error_text(ERR23);
   9485   *erroroffset = ptr - (pcre_uchar *)pattern;
   9486   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
   9487   return NULL;
   9488   }
   9489 #endif   /* PCRE_DEBUG */
   9490 
   9491 /* Check for a pattern that can match an empty string, so that this information
   9492 can be provided to applications. */
   9493 
   9494 do
   9495   {
   9496   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
   9497     {
   9498     re->flags |= PCRE_MATCH_EMPTY;
   9499     break;
   9500     }
   9501   codestart += GET(codestart, 1);
   9502   }
   9503 while (*codestart == OP_ALT);
   9504 
   9505 #if defined COMPILE_PCRE8
   9506 return (pcre *)re;
   9507 #elif defined COMPILE_PCRE16
   9508 return (pcre16 *)re;
   9509 #elif defined COMPILE_PCRE32
   9510 return (pcre32 *)re;
   9511 #endif
   9512 }
   9513 
   9514 /* End of pcre_compile.c */
   9515 
   9516