Home | History | Annotate | Download | only in dist
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9            Copyright (c) 1997-2014 University of Cambridge
     10 
     11 -----------------------------------------------------------------------------
     12 Redistribution and use in source and binary forms, with or without
     13 modification, are permitted provided that the following conditions are met:
     14 
     15     * Redistributions of source code must retain the above copyright notice,
     16       this list of conditions and the following disclaimer.
     17 
     18     * Redistributions in binary form must reproduce the above copyright
     19       notice, this list of conditions and the following disclaimer in the
     20       documentation and/or other materials provided with the distribution.
     21 
     22     * Neither the name of the University of Cambridge nor the names of its
     23       contributors may be used to endorse or promote products derived from
     24       this software without specific prior written permission.
     25 
     26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36 POSSIBILITY OF SUCH DAMAGE.
     37 -----------------------------------------------------------------------------
     38 */
     39 
     40 
     41 /* This module contains the external function pcre_compile(), along with
     42 supporting internal functions that are not used by other modules. */
     43 
     44 
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 
     49 #define NLBLOCK cd             /* Block containing newline information */
     50 #define PSSTART start_pattern  /* Field containing pattern start */
     51 #define PSEND   end_pattern    /* Field containing pattern end */
     52 
     53 #include "pcre_internal.h"
     54 
     55 
     56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
     57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
     58 library. We do not need to select pcre16_printint.c specially, because the
     59 COMPILE_PCREx macro will already be appropriately set. */
     60 
     61 #ifdef PCRE_DEBUG
     62 /* pcre_printint.c should not include any headers */
     63 #define PCRE_INCLUDED
     64 #include "pcre_printint.c"
     65 #undef PCRE_INCLUDED
     66 #endif
     67 
     68 
     69 /* Macro for setting individual bits in class bitmaps. */
     70 
     71 #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
     72 
     73 /* Maximum length value to check against when making sure that the integer that
     74 holds the compiled pattern length does not overflow. We make it a bit less than
     75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
     76 to check them every time. */
     77 
     78 #define OFLOW_MAX (INT_MAX - 20)
     79 
     80 /* Definitions to allow mutual recursion */
     81 
     82 static int
     83   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
     84     const pcre_uint32 *, unsigned int);
     85 
     86 static BOOL
     87   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
     88     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
     89     compile_data *, int *);
     90 
     91 
     92 
     93 /*************************************************
     94 *      Code parameters and static tables         *
     95 *************************************************/
     96 
     97 /* This value specifies the size of stack workspace that is used during the
     98 first pre-compile phase that determines how much memory is required. The regex
     99 is partly compiled into this space, but the compiled parts are discarded as
    100 soon as they can be, so that hopefully there will never be an overrun. The code
    101 does, however, check for an overrun. The largest amount I've seen used is 218,
    102 so this number is very generous.
    103 
    104 The same workspace is used during the second, actual compile phase for
    105 remembering forward references to groups so that they can be filled in at the
    106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
    107 is 4 there is plenty of room for most patterns. However, the memory can get
    108 filled up by repetitions of forward references, for example patterns like
    109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
    110 that the workspace is expanded using malloc() in this situation. The value
    111 below is therefore a minimum, and we put a maximum on it for safety. The
    112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
    113 kicks in at the same number of forward references in all cases. */
    114 
    115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
    116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
    117 
    118 /* This value determines the size of the initial vector that is used for
    119 remembering named groups during the pre-compile. It is allocated on the stack,
    120 but if it is too small, it is expanded using malloc(), in a similar way to the
    121 workspace. The value is the number of slots in the list. */
    122 
    123 #define NAMED_GROUP_LIST_SIZE  20
    124 
    125 /* The overrun tests check for a slightly smaller size so that they detect the
    126 overrun before it actually does run off the end of the data block. */
    127 
    128 #define WORK_SIZE_SAFETY_MARGIN (100)
    129 
    130 /* Private flags added to firstchar and reqchar. */
    131 
    132 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
    133 #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
    134 /* Negative values for the firstchar and reqchar flags */
    135 #define REQ_UNSET       (-2)
    136 #define REQ_NONE        (-1)
    137 
    138 /* Repeated character flags. */
    139 
    140 #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
    141 
    142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
    143 are simple data values; negative values are for special things like \d and so
    144 on. Zero means further processing is needed (for things like \x), or the escape
    145 is invalid. */
    146 
    147 #ifndef EBCDIC
    148 
    149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
    150 in UTF-8 mode. */
    151 
    152 static const short int escapes[] = {
    153      0,                       0,
    154      0,                       0,
    155      0,                       0,
    156      0,                       0,
    157      0,                       0,
    158      CHAR_COLON,              CHAR_SEMICOLON,
    159      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
    160      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
    161      CHAR_COMMERCIAL_AT,      -ESC_A,
    162      -ESC_B,                  -ESC_C,
    163      -ESC_D,                  -ESC_E,
    164      0,                       -ESC_G,
    165      -ESC_H,                  0,
    166      0,                       -ESC_K,
    167      0,                       0,
    168      -ESC_N,                  0,
    169      -ESC_P,                  -ESC_Q,
    170      -ESC_R,                  -ESC_S,
    171      0,                       0,
    172      -ESC_V,                  -ESC_W,
    173      -ESC_X,                  0,
    174      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
    175      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
    176      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
    177      CHAR_GRAVE_ACCENT,       ESC_a,
    178      -ESC_b,                  0,
    179      -ESC_d,                  ESC_e,
    180      ESC_f,                   0,
    181      -ESC_h,                  0,
    182      0,                       -ESC_k,
    183      0,                       0,
    184      ESC_n,                   0,
    185      -ESC_p,                  0,
    186      ESC_r,                   -ESC_s,
    187      ESC_tee,                 0,
    188      -ESC_v,                  -ESC_w,
    189      0,                       0,
    190      -ESC_z
    191 };
    192 
    193 #else
    194 
    195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
    196 
    197 static const short int escapes[] = {
    198 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
    199 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
    200 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
    201 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
    202 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
    203 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
    204 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
    205 /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
    206 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
    207 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
    208 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
    209 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
    210 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
    211 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
    212 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
    213 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
    214 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
    215 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
    216 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
    217 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
    218 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
    219 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
    220 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
    221 };
    222 
    223 /* We also need a table of characters that may follow \c in an EBCDIC
    224 environment for characters 0-31. */
    225 
    226 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
    227 
    228 #endif
    229 
    230 
    231 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
    232 searched linearly. Put all the names into a single string, in order to reduce
    233 the number of relocations when a shared library is dynamically linked. The
    234 string is built from string macros so that it works in UTF-8 mode on EBCDIC
    235 platforms. */
    236 
    237 typedef struct verbitem {
    238   int   len;                 /* Length of verb name */
    239   int   op;                  /* Op when no arg, or -1 if arg mandatory */
    240   int   op_arg;              /* Op when arg present, or -1 if not allowed */
    241 } verbitem;
    242 
    243 static const char verbnames[] =
    244   "\0"                       /* Empty name is a shorthand for MARK */
    245   STRING_MARK0
    246   STRING_ACCEPT0
    247   STRING_COMMIT0
    248   STRING_F0
    249   STRING_FAIL0
    250   STRING_PRUNE0
    251   STRING_SKIP0
    252   STRING_THEN;
    253 
    254 static const verbitem verbs[] = {
    255   { 0, -1,        OP_MARK },
    256   { 4, -1,        OP_MARK },
    257   { 6, OP_ACCEPT, -1 },
    258   { 6, OP_COMMIT, -1 },
    259   { 1, OP_FAIL,   -1 },
    260   { 4, OP_FAIL,   -1 },
    261   { 5, OP_PRUNE,  OP_PRUNE_ARG },
    262   { 4, OP_SKIP,   OP_SKIP_ARG  },
    263   { 4, OP_THEN,   OP_THEN_ARG  }
    264 };
    265 
    266 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
    267 
    268 
    269 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
    270 another regex library. */
    271 
    272 static const pcre_uchar sub_start_of_word[] = {
    273   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
    274   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
    275 
    276 static const pcre_uchar sub_end_of_word[] = {
    277   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
    278   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
    279   CHAR_RIGHT_PARENTHESIS, '\0' };
    280 
    281 
    282 /* Tables of names of POSIX character classes and their lengths. The names are
    283 now all in a single string, to reduce the number of relocations when a shared
    284 library is dynamically loaded. The list of lengths is terminated by a zero
    285 length entry. The first three must be alpha, lower, upper, as this is assumed
    286 for handling case independence. The indices for graph, print, and punct are
    287 needed, so identify them. */
    288 
    289 static const char posix_names[] =
    290   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
    291   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
    292   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
    293   STRING_word0  STRING_xdigit;
    294 
    295 static const pcre_uint8 posix_name_lengths[] = {
    296   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
    297 
    298 #define PC_GRAPH  8
    299 #define PC_PRINT  9
    300 #define PC_PUNCT 10
    301 
    302 
    303 /* Table of class bit maps for each POSIX class. Each class is formed from a
    304 base map, with an optional addition or removal of another map. Then, for some
    305 classes, there is some additional tweaking: for [:blank:] the vertical space
    306 characters are removed, and for [:alpha:] and [:alnum:] the underscore
    307 character is removed. The triples in the table consist of the base map offset,
    308 second map offset or -1 if no second map, and a non-negative value for map
    309 addition or a negative value for map subtraction (if there are two maps). The
    310 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
    311 remove vertical space characters, 2 => remove underscore. */
    312 
    313 static const int posix_class_maps[] = {
    314   cbit_word,  cbit_digit, -2,             /* alpha */
    315   cbit_lower, -1,          0,             /* lower */
    316   cbit_upper, -1,          0,             /* upper */
    317   cbit_word,  -1,          2,             /* alnum - word without underscore */
    318   cbit_print, cbit_cntrl,  0,             /* ascii */
    319   cbit_space, -1,          1,             /* blank - a GNU extension */
    320   cbit_cntrl, -1,          0,             /* cntrl */
    321   cbit_digit, -1,          0,             /* digit */
    322   cbit_graph, -1,          0,             /* graph */
    323   cbit_print, -1,          0,             /* print */
    324   cbit_punct, -1,          0,             /* punct */
    325   cbit_space, -1,          0,             /* space */
    326   cbit_word,  -1,          0,             /* word - a Perl extension */
    327   cbit_xdigit,-1,          0              /* xdigit */
    328 };
    329 
    330 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
    331 Unicode property escapes. */
    332 
    333 #ifdef SUPPORT_UCP
    334 static const pcre_uchar string_PNd[]  = {
    335   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    336   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    337 static const pcre_uchar string_pNd[]  = {
    338   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    339   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    340 static const pcre_uchar string_PXsp[] = {
    341   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    342   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    343 static const pcre_uchar string_pXsp[] = {
    344   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    345   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    346 static const pcre_uchar string_PXwd[] = {
    347   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    348   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    349 static const pcre_uchar string_pXwd[] = {
    350   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    351   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    352 
    353 static const pcre_uchar *substitutes[] = {
    354   string_PNd,           /* \D */
    355   string_pNd,           /* \d */
    356   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
    357   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
    358   string_PXwd,          /* \W */
    359   string_pXwd           /* \w */
    360 };
    361 
    362 /* The POSIX class substitutes must be in the order of the POSIX class names,
    363 defined above, and there are both positive and negative cases. NULL means no
    364 general substitute of a Unicode property escape (\p or \P). However, for some
    365 POSIX classes (e.g. graph, print, punct) a special property code is compiled
    366 directly. */
    367 
    368 static const pcre_uchar string_pL[] =   {
    369   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    370   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    371 static const pcre_uchar string_pLl[] =  {
    372   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    373   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    374 static const pcre_uchar string_pLu[] =  {
    375   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    376   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    377 static const pcre_uchar string_pXan[] = {
    378   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    379   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    380 static const pcre_uchar string_h[] =    {
    381   CHAR_BACKSLASH, CHAR_h, '\0' };
    382 static const pcre_uchar string_pXps[] = {
    383   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
    384   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    385 static const pcre_uchar string_PL[] =   {
    386   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    387   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    388 static const pcre_uchar string_PLl[] =  {
    389   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    390   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    391 static const pcre_uchar string_PLu[] =  {
    392   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    393   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    394 static const pcre_uchar string_PXan[] = {
    395   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    396   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    397 static const pcre_uchar string_H[] =    {
    398   CHAR_BACKSLASH, CHAR_H, '\0' };
    399 static const pcre_uchar string_PXps[] = {
    400   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
    401   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
    402 
    403 static const pcre_uchar *posix_substitutes[] = {
    404   string_pL,            /* alpha */
    405   string_pLl,           /* lower */
    406   string_pLu,           /* upper */
    407   string_pXan,          /* alnum */
    408   NULL,                 /* ascii */
    409   string_h,             /* blank */
    410   NULL,                 /* cntrl */
    411   string_pNd,           /* digit */
    412   NULL,                 /* graph */
    413   NULL,                 /* print */
    414   NULL,                 /* punct */
    415   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
    416   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
    417   NULL,                 /* xdigit */
    418   /* Negated cases */
    419   string_PL,            /* ^alpha */
    420   string_PLl,           /* ^lower */
    421   string_PLu,           /* ^upper */
    422   string_PXan,          /* ^alnum */
    423   NULL,                 /* ^ascii */
    424   string_H,             /* ^blank */
    425   NULL,                 /* ^cntrl */
    426   string_PNd,           /* ^digit */
    427   NULL,                 /* ^graph */
    428   NULL,                 /* ^print */
    429   NULL,                 /* ^punct */
    430   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
    431   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
    432   NULL                  /* ^xdigit */
    433 };
    434 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
    435 #endif
    436 
    437 #define STRING(a)  # a
    438 #define XSTRING(s) STRING(s)
    439 
    440 /* The texts of compile-time error messages. These are "char *" because they
    441 are passed to the outside world. Do not ever re-use any error number, because
    442 they are documented. Always add a new error instead. Messages marked DEAD below
    443 are no longer used. This used to be a table of strings, but in order to reduce
    444 the number of relocations needed when a shared library is loaded dynamically,
    445 it is now one long string. We cannot use a table of offsets, because the
    446 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
    447 simply count through to the one we want - this isn't a performance issue
    448 because these strings are used only when there is a compilation error.
    449 
    450 Each substring ends with \0 to insert a null character. This includes the final
    451 substring, so that the whole string ends with \0\0, which can be detected when
    452 counting through. */
    453 
    454 static const char error_texts[] =
    455   "no error\0"
    456   "\\ at end of pattern\0"
    457   "\\c at end of pattern\0"
    458   "unrecognized character follows \\\0"
    459   "numbers out of order in {} quantifier\0"
    460   /* 5 */
    461   "number too big in {} quantifier\0"
    462   "missing terminating ] for character class\0"
    463   "invalid escape sequence in character class\0"
    464   "range out of order in character class\0"
    465   "nothing to repeat\0"
    466   /* 10 */
    467   "internal error: invalid forward reference offset\0"
    468   "internal error: unexpected repeat\0"
    469   "unrecognized character after (? or (?-\0"
    470   "POSIX named classes are supported only within a class\0"
    471   "missing )\0"
    472   /* 15 */
    473   "reference to non-existent subpattern\0"
    474   "erroffset passed as NULL\0"
    475   "unknown option bit(s) set\0"
    476   "missing ) after comment\0"
    477   "parentheses nested too deeply\0"  /** DEAD **/
    478   /* 20 */
    479   "regular expression is too large\0"
    480   "failed to get memory\0"
    481   "unmatched parentheses\0"
    482   "internal error: code overflow\0"
    483   "unrecognized character after (?<\0"
    484   /* 25 */
    485   "lookbehind assertion is not fixed length\0"
    486   "malformed number or name after (?(\0"
    487   "conditional group contains more than two branches\0"
    488   "assertion expected after (?(\0"
    489   "(?R or (?[+-]digits must be followed by )\0"
    490   /* 30 */
    491   "unknown POSIX class name\0"
    492   "POSIX collating elements are not supported\0"
    493   "this version of PCRE is compiled without UTF support\0"
    494   "spare error\0"  /** DEAD **/
    495   "character value in \\x{} or \\o{} is too large\0"
    496   /* 35 */
    497   "invalid condition (?(0)\0"
    498   "\\C not allowed in lookbehind assertion\0"
    499   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
    500   "number after (?C is > 255\0"
    501   "closing ) for (?C expected\0"
    502   /* 40 */
    503   "recursive call could loop indefinitely\0"
    504   "unrecognized character after (?P\0"
    505   "syntax error in subpattern name (missing terminator)\0"
    506   "two named subpatterns have the same name\0"
    507   "invalid UTF-8 string\0"
    508   /* 45 */
    509   "support for \\P, \\p, and \\X has not been compiled\0"
    510   "malformed \\P or \\p sequence\0"
    511   "unknown property name after \\P or \\p\0"
    512   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
    513   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
    514   /* 50 */
    515   "repeated subpattern is too long\0"    /** DEAD **/
    516   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
    517   "internal error: overran compiling workspace\0"
    518   "internal error: previously-checked referenced subpattern not found\0"
    519   "DEFINE group contains more than one branch\0"
    520   /* 55 */
    521   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
    522   "inconsistent NEWLINE options\0"
    523   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
    524   "a numbered reference must not be zero\0"
    525   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
    526   /* 60 */
    527   "(*VERB) not recognized or malformed\0"
    528   "number is too big\0"
    529   "subpattern name expected\0"
    530   "digit expected after (?+\0"
    531   "] is an invalid data character in JavaScript compatibility mode\0"
    532   /* 65 */
    533   "different names for subpatterns of the same number are not allowed\0"
    534   "(*MARK) must have an argument\0"
    535   "this version of PCRE is not compiled with Unicode property support\0"
    536 #ifndef EBCDIC
    537   "\\c must be followed by an ASCII character\0"
    538 #else
    539   "\\c must be followed by a letter or one of [\\]^_?\0"
    540 #endif
    541   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
    542   /* 70 */
    543   "internal error: unknown opcode in find_fixedlength()\0"
    544   "\\N is not supported in a class\0"
    545   "too many forward references\0"
    546   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
    547   "invalid UTF-16 string\0"
    548   /* 75 */
    549   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
    550   "character value in \\u.... sequence is too large\0"
    551   "invalid UTF-32 string\0"
    552   "setting UTF is disabled by the application\0"
    553   "non-hex character in \\x{} (closing brace missing?)\0"
    554   /* 80 */
    555   "non-octal character in \\o{} (closing brace missing?)\0"
    556   "missing opening brace after \\o\0"
    557   "parentheses are too deeply nested\0"
    558   "invalid range in character class\0"
    559   "group name must start with a non-digit\0"
    560   /* 85 */
    561   "parentheses are too deeply nested (stack check)\0"
    562   "digits missing in \\x{} or \\o{}\0"
    563   ;
    564 
    565 /* Table to identify digits and hex digits. This is used when compiling
    566 patterns. Note that the tables in chartables are dependent on the locale, and
    567 may mark arbitrary characters as digits - but the PCRE compiling code expects
     568 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
    569 a private table here. It costs 256 bytes, but it is a lot faster than doing
    570 character value tests (at least in some simple cases I timed), and in some
    571 applications one wants PCRE to compile efficiently as well as match
    572 efficiently.
    573 
    574 For convenience, we use the same bit definitions as in chartables:
    575 
    576   0x04   decimal digit
    577   0x08   hexadecimal digit
    578 
    579 Then we can use ctype_digit and ctype_xdigit in the code. */
    580 
    581 /* Using a simple comparison for decimal numbers rather than a memory read
    582 is much faster, and the resulting code is simpler (the compiler turns it
    583 into a subtraction and unsigned comparison). */
    584 
    585 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
    586 
    587 #ifndef EBCDIC
    588 
    589 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
    590 UTF-8 mode. */
    591 
    592 static const pcre_uint8 digitab[] =
    593   {
    594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
    595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
    596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
    597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
    598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
    599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
    600   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
    601   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
    602   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
    603   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
    604   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
    605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
    606   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
    607   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
    608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
    609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
    610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
    611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
    612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
    613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
    614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
    615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
    616   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
    617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
    618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
    619   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
    620   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
    621   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
    622   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
    623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
    624   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
    625   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
    626 
    627 #else
    628 
    629 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
    630 
    631 static const pcre_uint8 digitab[] =
    632   {
    633   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
    634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
    635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
    636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
    637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
    638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
    639   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
    640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
    641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
    642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
    643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
    644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
    645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
    646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
    647   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
    648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
    649   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
    650   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
    651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
    652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
    653   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
    654   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
    655   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
    656   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
    657   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
    658   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
    659   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
    660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
    661   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
    662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
    663   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
    664   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
    665 
    666 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
    667   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
    668   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
    669   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
    670   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
    671   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
    672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
    673   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
    674   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
    675   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
    676   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
    677   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
    678   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
    679   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
    680   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
    681   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
    682   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
    683   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
    684   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
    685   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
    686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
    687   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
    688   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
    689   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
    690   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
    691   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
    692   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
    693   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
    694   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
    695   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
    696   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
    697   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
    698   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
    699 #endif
    700 
    701 
    702 /* This table is used to check whether auto-possessification is possible
    703 between adjacent character-type opcodes. The left-hand (repeated) opcode is
    704 used to select the row, and the right-hand opcode is use to select the column.
    705 A value of 1 means that auto-possessification is OK. For example, the second
    706 value in the first row means that \D+\d can be turned into \D++\d.
    707 
    708 The Unicode property types (\P and \p) have to be present to fill out the table
    709 because of what their opcode values are, but the table values should always be
    710 zero because property types are handled separately in the code. The last four
    711 columns apply to items that cannot be repeated, so there is no need to have
    712 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
    713 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
    714 
    715 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
    716 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
    717 
    718 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
    719 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
    720   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
    721   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
    722   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
    723   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
    724   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
    725   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
    726   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
    727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
    728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
    729   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
    730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
    731   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
    732   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
    733   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
    734   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
    735   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
    736   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
    737 };
    738 
    739 
    740 /* This table is used to check whether auto-possessification is possible
    741 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
    742 left-hand (repeated) opcode is used to select the row, and the right-hand
    743 opcode is used to select the column. The values are as follows:
    744 
    745   0   Always return FALSE (never auto-possessify)
    746   1   Character groups are distinct (possessify if both are OP_PROP)
    747   2   Check character categories in the same group (general or particular)
    748   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
    749 
    750   4   Check left general category vs right particular category
    751   5   Check right general category vs left particular category
    752 
    753   6   Left alphanum vs right general category
    754   7   Left space vs right general category
    755   8   Left word vs right general category
    756 
    757   9   Right alphanum vs left general category
    758  10   Right space vs left general category
    759  11   Right word vs left general category
    760 
    761  12   Left alphanum vs right particular category
    762  13   Left space vs right particular category
    763  14   Left word vs right particular category
    764 
    765  15   Right alphanum vs left particular category
    766  16   Right space vs left particular category
    767  17   Right word vs left particular category
    768 */
    769 
    770 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
    771 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
    772   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
    773   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
    774   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
    775   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
    776   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
    777   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
    778   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
    779   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
    780   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
    781   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
    782   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
    783 };
    784 
    785 /* This table is used to check whether auto-possessification is possible
    786 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
    787 specifies a general category and the other specifies a particular category. The
    788 row is selected by the general category and the column by the particular
    789 category. The value is 1 if the particular category is not part of the general
    790 category. */
    791 
    792 static const pcre_uint8 catposstab[7][30] = {
    793 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
    794   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
    795   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
    796   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
    797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
    798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
    799   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
    800   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
    801 };
    802 
    803 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
    804 a general or particular category. The properties in each row are those
    805 that apply to the character set in question. Duplication means that a little
    806 unnecessary work is done when checking, but this keeps things much simpler
    807 because they can all use the same code. For more details see the comment where
    808 this table is used.
    809 
    810 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
    811 "space", but from Perl 5.18 it's included, so both categories are treated the
    812 same here. */
    813 
    814 static const pcre_uint8 posspropstab[3][4] = {
    815   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
    816   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
    817   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
    818 };
    819 
    820 /* This table is used when converting repeating opcodes into possessified
    821 versions as a result of an explicit possessive quantifier such as ++. A zero
    822 value means there is no possessified version - in those cases the item in
    823 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
    824 because all relevant opcodes are less than that. */
    825 
    826 static const pcre_uint8 opcode_possessify[] = {
    827   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
    828   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
    829 
    830   0,                       /* NOTI */
    831   OP_POSSTAR, 0,           /* STAR, MINSTAR */
    832   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
    833   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
    834   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
    835   0,                       /* EXACT */
    836   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
    837 
    838   OP_POSSTARI, 0,          /* STARI, MINSTARI */
    839   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
    840   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
    841   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
    842   0,                       /* EXACTI */
    843   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
    844 
    845   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
    846   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
    847   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
    848   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
    849   0,                       /* NOTEXACT */
    850   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
    851 
    852   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
    853   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
    854   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
    855   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
    856   0,                       /* NOTEXACTI */
    857   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
    858 
    859   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
    860   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
    861   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
    862   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
    863   0,                       /* TYPEEXACT */
    864   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
    865 
    866   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
    867   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
    868   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
    869   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
    870   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
    871 
    872   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
    873   0, 0,                    /* REF, REFI */
    874   0, 0,                    /* DNREF, DNREFI */
    875   0, 0                     /* RECURSE, CALLOUT */
    876 };
    877 
    878 
    879 
    880 /*************************************************
    881 *            Find an error text                  *
    882 *************************************************/
    883 
    884 /* The error texts are now all in one long string, to save on relocations. As
    885 some of the text is of unknown length, we can't use a table of offsets.
    886 Instead, just count through the strings. This is not a performance issue
    887 because it happens only when there has been a compilation error.
    888 
    889 Argument:   the error number
    890 Returns:    pointer to the error string
    891 */
    892 
    893 static const char *
    894 find_error_text(int n)
    895 {
    896 const char *s = error_texts;
    897 for (; n > 0; n--)
    898   {
    899   while (*s++ != CHAR_NULL) {};
    900   if (*s == CHAR_NULL) return "Error text not found (please report)";
    901   }
    902 return s;
    903 }
    904 
    905 
    906 
    907 /*************************************************
    908 *           Expand the workspace                 *
    909 *************************************************/
    910 
    911 /* This function is called during the second compiling phase, if the number of
    912 forward references fills the existing workspace, which is originally a block on
    913 the stack. A larger block is obtained from malloc() unless the ultimate limit
    914 has been reached or the increase will be rather small.
    915 
    916 Argument: pointer to the compile data block
    917 Returns:  0 if all went well, else an error number
    918 */
    919 
    920 static int
    921 expand_workspace(compile_data *cd)
    922 {
    923 pcre_uchar *newspace;
    924 int newsize = cd->workspace_size * 2;
    925 
    926 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
    927 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
    928     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
    929  return ERR72;
    930 
    931 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
    932 if (newspace == NULL) return ERR21;
    933 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
    934 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
    935 if (cd->workspace_size > COMPILE_WORK_SIZE)
    936   (PUBL(free))((void *)cd->start_workspace);
    937 cd->start_workspace = newspace;
    938 cd->workspace_size = newsize;
    939 return 0;
    940 }
    941 
    942 
    943 
    944 /*************************************************
    945 *            Check for counted repeat            *
    946 *************************************************/
    947 
    948 /* This function is called when a '{' is encountered in a place where it might
    949 start a quantifier. It looks ahead to see if it really is a quantifier or not.
    950 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
    951 where the ddds are digits.
    952 
    953 Arguments:
    954   p         pointer to the first char after '{'
    955 
    956 Returns:    TRUE or FALSE
    957 */
    958 
    959 static BOOL
    960 is_counted_repeat(const pcre_uchar *p)
    961 {
    962 if (!IS_DIGIT(*p)) return FALSE;
    963 p++;
    964 while (IS_DIGIT(*p)) p++;
    965 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
    966 
    967 if (*p++ != CHAR_COMMA) return FALSE;
    968 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
    969 
    970 if (!IS_DIGIT(*p)) return FALSE;
    971 p++;
    972 while (IS_DIGIT(*p)) p++;
    973 
    974 return (*p == CHAR_RIGHT_CURLY_BRACKET);
    975 }
    976 
    977 
    978 
    979 /*************************************************
    980 *            Handle escapes                      *
    981 *************************************************/
    982 
    983 /* This function is called when a \ has been encountered. It either returns a
    984 positive value for a simple escape such as \n, or 0 for a data character which
    985 will be placed in chptr. A backreference to group n is returned as negative n.
    986 When UTF-8 is enabled, a positive value greater than 255 may be returned in
    987 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
    988 character of the escape sequence.
    989 
    990 Arguments:
    991   ptrptr         points to the pattern position pointer
    992   chptr          points to a returned data character
    993   errorcodeptr   points to the errorcode variable
    994   bracount       number of previous extracting brackets
    995   options        the options bits
    996   isclass        TRUE if inside a character class
    997 
    998 Returns:         zero => a data character
    999                  positive => a special escape sequence
   1000                  negative => a back reference
   1001                  on error, errorcodeptr is set
   1002 */
   1003 
   1004 static int
   1005 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
   1006   int bracount, int options, BOOL isclass)
   1007 {
   1008 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
   1009 BOOL utf = (options & PCRE_UTF8) != 0;
   1010 const pcre_uchar *ptr = *ptrptr + 1;
   1011 pcre_uint32 c;
   1012 int escape = 0;
   1013 int i;
   1014 
   1015 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
   1016 ptr--;                            /* Set pointer back to the last byte */
   1017 
   1018 /* If backslash is at the end of the pattern, it's an error. */
   1019 
   1020 if (c == CHAR_NULL) *errorcodeptr = ERR1;
   1021 
   1022 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
   1023 in a table. A non-zero result is something that can be returned immediately.
   1024 Otherwise further processing may be required. */
   1025 
   1026 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1027 /* Not alphanumeric */
   1028 else if (c < CHAR_0 || c > CHAR_z) {}
   1029 else if ((i = escapes[c - CHAR_0]) != 0)
   1030   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
   1031 
   1032 #else           /* EBCDIC coding */
   1033 /* Not alphanumeric */
   1034 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
   1035 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
   1036 #endif
   1037 
   1038 /* Escapes that need further processing, or are illegal. */
   1039 
   1040 else
   1041   {
   1042   const pcre_uchar *oldptr;
   1043   BOOL braced, negated, overflow;
   1044   int s;
   1045 
   1046   switch (c)
   1047     {
   1048     /* A number of Perl escapes are not handled by PCRE. We give an explicit
   1049     error. */
   1050 
   1051     case CHAR_l:
   1052     case CHAR_L:
   1053     *errorcodeptr = ERR37;
   1054     break;
   1055 
   1056     case CHAR_u:
   1057     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
   1058       {
   1059       /* In JavaScript, \u must be followed by four hexadecimal numbers.
   1060       Otherwise it is a lowercase u letter. */
   1061       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
   1062         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
   1063         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
   1064         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
   1065         {
   1066         c = 0;
   1067         for (i = 0; i < 4; ++i)
   1068           {
   1069           register pcre_uint32 cc = *(++ptr);
   1070 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1071           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
   1072           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1073 #else           /* EBCDIC coding */
   1074           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
   1075           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1076 #endif
   1077           }
   1078 
   1079 #if defined COMPILE_PCRE8
   1080         if (c > (utf ? 0x10ffffU : 0xffU))
   1081 #elif defined COMPILE_PCRE16
   1082         if (c > (utf ? 0x10ffffU : 0xffffU))
   1083 #elif defined COMPILE_PCRE32
   1084         if (utf && c > 0x10ffffU)
   1085 #endif
   1086           {
   1087           *errorcodeptr = ERR76;
   1088           }
   1089         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   1090         }
   1091       }
   1092     else
   1093       *errorcodeptr = ERR37;
   1094     break;
   1095 
   1096     case CHAR_U:
   1097     /* In JavaScript, \U is an uppercase U letter. */
   1098     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
   1099     break;
   1100 
   1101     /* In a character class, \g is just a literal "g". Outside a character
   1102     class, \g must be followed by one of a number of specific things:
   1103 
   1104     (1) A number, either plain or braced. If positive, it is an absolute
   1105     backreference. If negative, it is a relative backreference. This is a Perl
   1106     5.10 feature.
   1107 
   1108     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
   1109     is part of Perl's movement towards a unified syntax for back references. As
   1110     this is synonymous with \k{name}, we fudge it up by pretending it really
   1111     was \k.
   1112 
   1113     (3) For Oniguruma compatibility we also support \g followed by a name or a
   1114     number either in angle brackets or in single quotes. However, these are
   1115     (possibly recursive) subroutine calls, _not_ backreferences. Just return
   1116     the ESC_g code (cf \k). */
   1117 
   1118     case CHAR_g:
   1119     if (isclass) break;
   1120     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
   1121       {
   1122       escape = ESC_g;
   1123       break;
   1124       }
   1125 
   1126     /* Handle the Perl-compatible cases */
   1127 
   1128     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
   1129       {
   1130       const pcre_uchar *p;
   1131       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
   1132         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
   1133       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
   1134         {
   1135         escape = ESC_k;
   1136         break;
   1137         }
   1138       braced = TRUE;
   1139       ptr++;
   1140       }
   1141     else braced = FALSE;
   1142 
   1143     if (ptr[1] == CHAR_MINUS)
   1144       {
   1145       negated = TRUE;
   1146       ptr++;
   1147       }
   1148     else negated = FALSE;
   1149 
   1150     /* The integer range is limited by the machine's int representation. */
   1151     s = 0;
   1152     overflow = FALSE;
   1153     while (IS_DIGIT(ptr[1]))
   1154       {
   1155       if (s > INT_MAX / 10 - 1) /* Integer overflow */
   1156         {
   1157         overflow = TRUE;
   1158         break;
   1159         }
   1160       s = s * 10 + (int)(*(++ptr) - CHAR_0);
   1161       }
   1162     if (overflow) /* Integer overflow */
   1163       {
   1164       while (IS_DIGIT(ptr[1]))
   1165         ptr++;
   1166       *errorcodeptr = ERR61;
   1167       break;
   1168       }
   1169 
   1170     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
   1171       {
   1172       *errorcodeptr = ERR57;
   1173       break;
   1174       }
   1175 
   1176     if (s == 0)
   1177       {
   1178       *errorcodeptr = ERR58;
   1179       break;
   1180       }
   1181 
   1182     if (negated)
   1183       {
   1184       if (s > bracount)
   1185         {
   1186         *errorcodeptr = ERR15;
   1187         break;
   1188         }
   1189       s = bracount - (s - 1);
   1190       }
   1191 
   1192     escape = -s;
   1193     break;
   1194 
   1195     /* The handling of escape sequences consisting of a string of digits
   1196     starting with one that is not zero is not straightforward. Perl has changed
   1197     over the years. Nowadays \g{} for backreferences and \o{} for octal are
   1198     recommended to avoid the ambiguities in the old syntax.
   1199 
   1200     Outside a character class, the digits are read as a decimal number. If the
   1201     number is less than 8 (used to be 10), or if there are that many previous
   1202     extracting left brackets, then it is a back reference. Otherwise, up to
   1203     three octal digits are read to form an escaped byte. Thus \123 is likely to
   1204     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
   1205     the octal value is greater than 377, the least significant 8 bits are
   1206     taken. \8 and \9 are treated as the literal characters 8 and 9.
   1207 
   1208     Inside a character class, \ followed by a digit is always either a literal
   1209     8 or 9 or an octal number. */
   1210 
   1211     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
   1212     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
   1213 
   1214     if (!isclass)
   1215       {
   1216       oldptr = ptr;
   1217       /* The integer range is limited by the machine's int representation. */
   1218       s = (int)(c -CHAR_0);
   1219       overflow = FALSE;
   1220       while (IS_DIGIT(ptr[1]))
   1221         {
   1222         if (s > INT_MAX / 10 - 1) /* Integer overflow */
   1223           {
   1224           overflow = TRUE;
   1225           break;
   1226           }
   1227         s = s * 10 + (int)(*(++ptr) - CHAR_0);
   1228         }
   1229       if (overflow) /* Integer overflow */
   1230         {
   1231         while (IS_DIGIT(ptr[1]))
   1232           ptr++;
   1233         *errorcodeptr = ERR61;
   1234         break;
   1235         }
   1236       if (s < 8 || s <= bracount)  /* Check for back reference */
   1237         {
   1238         escape = -s;
   1239         break;
   1240         }
   1241       ptr = oldptr;      /* Put the pointer back and fall through */
   1242       }
   1243 
   1244     /* Handle a digit following \ when the number is not a back reference. If
   1245     the first digit is 8 or 9, Perl used to generate a binary zero byte and
   1246     then treat the digit as a following literal. At least by Perl 5.18 this
   1247     changed so as not to insert the binary zero. */
   1248 
   1249     if ((c = *ptr) >= CHAR_8) break;
   1250 
   1251     /* Fall through with a digit less than 8 */
   1252 
   1253     /* \0 always starts an octal number, but we may drop through to here with a
   1254     larger first octal digit. The original code used just to take the least
   1255     significant 8 bits of octal numbers (I think this is what early Perls used
   1256     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
   1257     but no more than 3 octal digits. */
   1258 
   1259     case CHAR_0:
   1260     c -= CHAR_0;
   1261     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
   1262         c = c * 8 + *(++ptr) - CHAR_0;
   1263 #ifdef COMPILE_PCRE8
   1264     if (!utf && c > 0xff) *errorcodeptr = ERR51;
   1265 #endif
   1266     break;
   1267 
   1268     /* \o is a relatively new Perl feature, supporting a more general way of
   1269     specifying character codes in octal. The only supported form is \o{ddd}. */
   1270 
   1271     case CHAR_o:
   1272     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
   1273     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
   1274       {
   1275       ptr += 2;
   1276       c = 0;
   1277       overflow = FALSE;
   1278       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
   1279         {
   1280         register pcre_uint32 cc = *ptr++;
   1281         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
   1282 #ifdef COMPILE_PCRE32
   1283         if (c >= 0x20000000l) { overflow = TRUE; break; }
   1284 #endif
   1285         c = (c << 3) + cc - CHAR_0 ;
   1286 #if defined COMPILE_PCRE8
   1287         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
   1288 #elif defined COMPILE_PCRE16
   1289         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
   1290 #elif defined COMPILE_PCRE32
   1291         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
   1292 #endif
   1293         }
   1294       if (overflow)
   1295         {
   1296         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
   1297         *errorcodeptr = ERR34;
   1298         }
   1299       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   1300         {
   1301         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   1302         }
   1303       else *errorcodeptr = ERR80;
   1304       }
   1305     break;
   1306 
   1307     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
   1308     numbers. Otherwise it is a lowercase x letter. */
   1309 
   1310     case CHAR_x:
   1311     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
   1312       {
   1313       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
   1314         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
   1315         {
   1316         c = 0;
   1317         for (i = 0; i < 2; ++i)
   1318           {
   1319           register pcre_uint32 cc = *(++ptr);
   1320 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1321           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
   1322           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1323 #else           /* EBCDIC coding */
   1324           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
   1325           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1326 #endif
   1327           }
   1328         }
   1329       }    /* End JavaScript handling */
   1330 
   1331     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
   1332     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
   1333     digits. If not, { used to be treated as a data character. However, Perl
   1334     seems to read hex digits up to the first non-such, and ignore the rest, so
   1335     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
   1336     now gives an error. */
   1337 
   1338     else
   1339       {
   1340       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
   1341         {
   1342         ptr += 2;
   1343         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   1344           {
   1345           *errorcodeptr = ERR86;
   1346           break;
   1347           }
   1348         c = 0;
   1349         overflow = FALSE;
   1350         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
   1351           {
   1352           register pcre_uint32 cc = *ptr++;
   1353           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
   1354 
   1355 #ifdef COMPILE_PCRE32
   1356           if (c >= 0x10000000l) { overflow = TRUE; break; }
   1357 #endif
   1358 
   1359 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1360           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
   1361           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1362 #else           /* EBCDIC coding */
   1363           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
   1364           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1365 #endif
   1366 
   1367 #if defined COMPILE_PCRE8
   1368           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
   1369 #elif defined COMPILE_PCRE16
   1370           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
   1371 #elif defined COMPILE_PCRE32
   1372           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
   1373 #endif
   1374           }
   1375 
   1376         if (overflow)
   1377           {
   1378           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
   1379           *errorcodeptr = ERR34;
   1380           }
   1381 
   1382         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
   1383           {
   1384           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
   1385           }
   1386 
   1387         /* If the sequence of hex digits does not end with '}', give an error.
   1388         We used just to recognize this construct and fall through to the normal
   1389         \x handling, but nowadays Perl gives an error, which seems much more
   1390         sensible, so we do too. */
   1391 
   1392         else *errorcodeptr = ERR79;
   1393         }   /* End of \x{} processing */
   1394 
   1395       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
   1396 
   1397       else
   1398         {
   1399         c = 0;
   1400         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
   1401           {
   1402           pcre_uint32 cc;                          /* Some compilers don't like */
   1403           cc = *(++ptr);                           /* ++ in initializers */
   1404 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
   1405           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
   1406           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
   1407 #else           /* EBCDIC coding */
   1408           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
   1409           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
   1410 #endif
   1411           }
   1412         }     /* End of \xdd handling */
   1413       }       /* End of Perl-style \x handling */
   1414     break;
   1415 
   1416     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
   1417     An error is given if the byte following \c is not an ASCII character. This
   1418     coding is ASCII-specific, but then the whole concept of \cx is
   1419     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
   1420 
   1421     case CHAR_c:
   1422     c = *(++ptr);
   1423     if (c == CHAR_NULL)
   1424       {
   1425       *errorcodeptr = ERR2;
   1426       break;
   1427       }
   1428 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
   1429     if (c > 127)  /* Excludes all non-ASCII in either mode */
   1430       {
   1431       *errorcodeptr = ERR68;
   1432       break;
   1433       }
   1434     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
   1435     c ^= 0x40;
   1436 #else             /* EBCDIC coding */
   1437     if (c >= CHAR_a && c <= CHAR_z) c += 64;
   1438     if (c == CHAR_QUESTION_MARK)
   1439       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
   1440     else
   1441       {
   1442       for (i = 0; i < 32; i++)
   1443         {
   1444         if (c == ebcdic_escape_c[i]) break;
   1445         }
   1446       if (i < 32) c = i; else *errorcodeptr = ERR68;
   1447       }
   1448 #endif
   1449     break;
   1450 
   1451     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
   1452     other alphanumeric following \ is an error if PCRE_EXTRA was set;
   1453     otherwise, for Perl compatibility, it is a literal. This code looks a bit
   1454     odd, but there used to be some cases other than the default, and there may
   1455     be again in future, so I haven't "optimized" it. */
   1456 
   1457     default:
   1458     if ((options & PCRE_EXTRA) != 0) switch(c)
   1459       {
   1460       default:
   1461       *errorcodeptr = ERR3;
   1462       break;
   1463       }
   1464     break;
   1465     }
   1466   }
   1467 
   1468 /* Perl supports \N{name} for character names, as well as plain \N for "not
   1469 newline". PCRE does not support \N{name}. However, it does support
   1470 quantification such as \N{2,3}. */
   1471 
   1472 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
   1473      !is_counted_repeat(ptr+2))
   1474   *errorcodeptr = ERR37;
   1475 
   1476 /* If PCRE_UCP is set, we change the values for \d etc. */
   1477 
   1478 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
   1479   escape += (ESC_DU - ESC_D);
   1480 
   1481 /* Set the pointer to the final character before returning. */
   1482 
   1483 *ptrptr = ptr;
   1484 *chptr = c;
   1485 return escape;
   1486 }
   1487 
   1488 
   1489 
   1490 #ifdef SUPPORT_UCP
   1491 /*************************************************
   1492 *               Handle \P and \p                 *
   1493 *************************************************/
   1494 
   1495 /* This function is called after \P or \p has been encountered, provided that
   1496 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
   1497 pointing at the P or p. On exit, it is pointing at the final character of the
   1498 escape sequence.
   1499 
   1500 Argument:
   1501   ptrptr         points to the pattern position pointer
   1502   negptr         points to a boolean that is set TRUE for negation else FALSE
   1503   ptypeptr       points to an unsigned int that is set to the type value
   1504   pdataptr       points to an unsigned int that is set to the detailed property value
   1505   errorcodeptr   points to the error code variable
   1506 
   1507 Returns:         TRUE if the type value was found, or FALSE for an invalid type
   1508 */
   1509 
   1510 static BOOL
   1511 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
   1512   unsigned int *pdataptr, int *errorcodeptr)
   1513 {
   1514 pcre_uchar c;
   1515 int i, bot, top;
   1516 const pcre_uchar *ptr = *ptrptr;
   1517 pcre_uchar name[32];
   1518 
   1519 c = *(++ptr);
   1520 if (c == CHAR_NULL) goto ERROR_RETURN;
   1521 
   1522 *negptr = FALSE;
   1523 
   1524 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
   1525 negation. */
   1526 
   1527 if (c == CHAR_LEFT_CURLY_BRACKET)
   1528   {
   1529   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
   1530     {
   1531     *negptr = TRUE;
   1532     ptr++;
   1533     }
   1534   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
   1535     {
   1536     c = *(++ptr);
   1537     if (c == CHAR_NULL) goto ERROR_RETURN;
   1538     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
   1539     name[i] = c;
   1540     }
   1541   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
   1542   name[i] = 0;
   1543   }
   1544 
   1545 /* Otherwise there is just one following character */
   1546 
   1547 else
   1548   {
   1549   name[0] = c;
   1550   name[1] = 0;
   1551   }
   1552 
   1553 *ptrptr = ptr;
   1554 
   1555 /* Search for a recognized property name using binary chop */
   1556 
   1557 bot = 0;
   1558 top = PRIV(utt_size);
   1559 
   1560 while (bot < top)
   1561   {
   1562   int r;
   1563   i = (bot + top) >> 1;
   1564   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
   1565   if (r == 0)
   1566     {
   1567     *ptypeptr = PRIV(utt)[i].type;
   1568     *pdataptr = PRIV(utt)[i].value;
   1569     return TRUE;
   1570     }
   1571   if (r > 0) bot = i + 1; else top = i;
   1572   }
   1573 
   1574 *errorcodeptr = ERR47;
   1575 *ptrptr = ptr;
   1576 return FALSE;
   1577 
   1578 ERROR_RETURN:
   1579 *errorcodeptr = ERR46;
   1580 *ptrptr = ptr;
   1581 return FALSE;
   1582 }
   1583 #endif
   1584 
   1585 
   1586 
   1587 /*************************************************
   1588 *         Read repeat counts                     *
   1589 *************************************************/
   1590 
   1591 /* Read an item of the form {n,m} and return the values. This is called only
   1592 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
   1593 so the syntax is guaranteed to be correct, but we need to check the values.
   1594 
   1595 Arguments:
   1596   p              pointer to first char after '{'
   1597   minp           pointer to int for min
   1598   maxp           pointer to int for max
   1599                  returned as -1 if no max
   1600   errorcodeptr   points to error code variable
   1601 
   1602 Returns:         pointer to '}' on success;
   1603                  current ptr on error, with errorcodeptr set non-zero
   1604 */
   1605 
   1606 static const pcre_uchar *
   1607 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
   1608 {
   1609 int min = 0;
   1610 int max = -1;
   1611 
   1612 while (IS_DIGIT(*p))
   1613   {
   1614   min = min * 10 + (int)(*p++ - CHAR_0);
   1615   if (min > 65535)
   1616     {
   1617     *errorcodeptr = ERR5;
   1618     return p;
   1619     }
   1620   }
   1621 
   1622 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
   1623   {
   1624   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
   1625     {
   1626     max = 0;
   1627     while(IS_DIGIT(*p))
   1628       {
   1629       max = max * 10 + (int)(*p++ - CHAR_0);
   1630       if (max > 65535)
   1631         {
   1632         *errorcodeptr = ERR5;
   1633         return p;
   1634         }
   1635       }
   1636     if (max < min)
   1637       {
   1638       *errorcodeptr = ERR4;
   1639       return p;
   1640       }
   1641     }
   1642   }
   1643 
   1644 *minp = min;
   1645 *maxp = max;
   1646 return p;
   1647 }
   1648 
   1649 
   1650 
   1651 /*************************************************
   1652 *      Find first significant op code            *
   1653 *************************************************/
   1654 
   1655 /* This is called by several functions that scan a compiled expression looking
   1656 for a fixed first character, or an anchoring op code etc. It skips over things
   1657 that do not influence this. For some calls, it makes sense to skip negative
   1658 forward and all backward assertions, and also the \b assertion; for others it
   1659 does not.
   1660 
   1661 Arguments:
   1662   code         pointer to the start of the group
   1663   skipassert   TRUE if certain assertions are to be skipped
   1664 
   1665 Returns:       pointer to the first significant opcode
   1666 */
   1667 
   1668 static const pcre_uchar*
   1669 first_significant_code(const pcre_uchar *code, BOOL skipassert)
   1670 {
   1671 for (;;)
   1672   {
   1673   switch ((int)*code)
   1674     {
   1675     case OP_ASSERT_NOT:
   1676     case OP_ASSERTBACK:
   1677     case OP_ASSERTBACK_NOT:
   1678     if (!skipassert) return code;
   1679     do code += GET(code, 1); while (*code == OP_ALT);
   1680     code += PRIV(OP_lengths)[*code];
   1681     break;
   1682 
   1683     case OP_WORD_BOUNDARY:
   1684     case OP_NOT_WORD_BOUNDARY:
   1685     if (!skipassert) return code;
   1686     /* Fall through */
   1687 
   1688     case OP_CALLOUT:
   1689     case OP_CREF:
   1690     case OP_DNCREF:
   1691     case OP_RREF:
   1692     case OP_DNRREF:
   1693     case OP_DEF:
   1694     code += PRIV(OP_lengths)[*code];
   1695     break;
   1696 
   1697     default:
   1698     return code;
   1699     }
   1700   }
   1701 /* Control never reaches here */
   1702 }
   1703 
   1704 
   1705 
   1706 /*************************************************
   1707 *        Find the fixed length of a branch       *
   1708 *************************************************/
   1709 
   1710 /* Scan a branch and compute the fixed length of subject that will match it,
   1711 if the length is fixed. This is needed for dealing with backward assertions.
   1712 In UTF8 mode, the result is in characters rather than bytes. The branch is
   1713 temporarily terminated with OP_END when this function is called.
   1714 
   1715 This function is called when a backward assertion is encountered, so that if it
   1716 fails, the error message can point to the correct place in the pattern.
   1717 However, we cannot do this when the assertion contains subroutine calls,
   1718 because they can be forward references. We solve this by remembering this case
   1719 and doing the check at the end; a flag specifies which mode we are running in.
   1720 
   1721 Arguments:
   1722   code     points to the start of the pattern (the bracket)
   1723   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
   1724   atend    TRUE if called when the pattern is complete
   1725   cd       the "compile data" structure
   1726   recurses    chain of recurse_check to catch mutual recursion
   1727 
   1728 Returns:   the fixed length,
   1729              or -1 if there is no fixed length,
   1730              or -2 if \C was encountered (in UTF-8 mode only)
   1731              or -3 if an OP_RECURSE item was encountered and atend is FALSE
   1732              or -4 if an unknown opcode was encountered (internal error)
   1733 */
   1734 
   1735 static int
   1736 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
   1737   recurse_check *recurses)
   1738 {
   1739 int length = -1;
   1740 recurse_check this_recurse;
   1741 register int branchlength = 0;
   1742 register pcre_uchar *cc = code + 1 + LINK_SIZE;
   1743 
   1744 /* Scan along the opcodes for this branch. If we get to the end of the
   1745 branch, check the length against that of the other branches. */
   1746 
   1747 for (;;)
   1748   {
   1749   int d;
   1750   pcre_uchar *ce, *cs;
   1751   register pcre_uchar op = *cc;
   1752 
   1753   switch (op)
   1754     {
   1755     /* We only need to continue for OP_CBRA (normal capturing bracket) and
   1756     OP_BRA (normal non-capturing bracket) because the other variants of these
   1757     opcodes are all concerned with unlimited repeated groups, which of course
   1758     are not of fixed length. */
   1759 
   1760     case OP_CBRA:
   1761     case OP_BRA:
   1762     case OP_ONCE:
   1763     case OP_ONCE_NC:
   1764     case OP_COND:
   1765     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
   1766       recurses);
   1767     if (d < 0) return d;
   1768     branchlength += d;
   1769     do cc += GET(cc, 1); while (*cc == OP_ALT);
   1770     cc += 1 + LINK_SIZE;
   1771     break;
   1772 
   1773     /* Reached end of a branch; if it's a ket it is the end of a nested call.
   1774     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
   1775     an ALT. If it is END it's the end of the outer call. All can be handled by
   1776     the same code. Note that we must not include the OP_KETRxxx opcodes here,
   1777     because they all imply an unlimited repeat. */
   1778 
   1779     case OP_ALT:
   1780     case OP_KET:
   1781     case OP_END:
   1782     case OP_ACCEPT:
   1783     case OP_ASSERT_ACCEPT:
   1784     if (length < 0) length = branchlength;
   1785       else if (length != branchlength) return -1;
   1786     if (*cc != OP_ALT) return length;
   1787     cc += 1 + LINK_SIZE;
   1788     branchlength = 0;
   1789     break;
   1790 
   1791     /* A true recursion implies not fixed length, but a subroutine call may
   1792     be OK. If the subroutine is a forward reference, we can't deal with
   1793     it until the end of the pattern, so return -3. */
   1794 
   1795     case OP_RECURSE:
   1796     if (!atend) return -3;
   1797     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
   1798     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
   1799     if (cc > cs && cc < ce) return -1;                    /* Recursion */
   1800     else   /* Check for mutual recursion */
   1801       {
   1802       recurse_check *r = recurses;
   1803       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
   1804       if (r != NULL) return -1;   /* Mutual recursion */
   1805       }
   1806     this_recurse.prev = recurses;
   1807     this_recurse.group = cs;
   1808     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
   1809     if (d < 0) return d;
   1810     branchlength += d;
   1811     cc += 1 + LINK_SIZE;
   1812     break;
   1813 
   1814     /* Skip over assertive subpatterns */
   1815 
   1816     case OP_ASSERT:
   1817     case OP_ASSERT_NOT:
   1818     case OP_ASSERTBACK:
   1819     case OP_ASSERTBACK_NOT:
   1820     do cc += GET(cc, 1); while (*cc == OP_ALT);
   1821     cc += 1 + LINK_SIZE;
   1822     break;
   1823 
   1824     /* Skip over things that don't match chars */
   1825 
   1826     case OP_MARK:
   1827     case OP_PRUNE_ARG:
   1828     case OP_SKIP_ARG:
   1829     case OP_THEN_ARG:
   1830     cc += cc[1] + PRIV(OP_lengths)[*cc];
   1831     break;
   1832 
   1833     case OP_CALLOUT:
   1834     case OP_CIRC:
   1835     case OP_CIRCM:
   1836     case OP_CLOSE:
   1837     case OP_COMMIT:
   1838     case OP_CREF:
   1839     case OP_DEF:
   1840     case OP_DNCREF:
   1841     case OP_DNRREF:
   1842     case OP_DOLL:
   1843     case OP_DOLLM:
   1844     case OP_EOD:
   1845     case OP_EODN:
   1846     case OP_FAIL:
   1847     case OP_NOT_WORD_BOUNDARY:
   1848     case OP_PRUNE:
   1849     case OP_REVERSE:
   1850     case OP_RREF:
   1851     case OP_SET_SOM:
   1852     case OP_SKIP:
   1853     case OP_SOD:
   1854     case OP_SOM:
   1855     case OP_THEN:
   1856     case OP_WORD_BOUNDARY:
   1857     cc += PRIV(OP_lengths)[*cc];
   1858     break;
   1859 
   1860     /* Handle literal characters */
   1861 
   1862     case OP_CHAR:
   1863     case OP_CHARI:
   1864     case OP_NOT:
   1865     case OP_NOTI:
   1866     branchlength++;
   1867     cc += 2;
   1868 #ifdef SUPPORT_UTF
   1869     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
   1870 #endif
   1871     break;
   1872 
   1873     /* Handle exact repetitions. The count is already in characters, but we
   1874     need to skip over a multibyte character in UTF8 mode.  */
   1875 
   1876     case OP_EXACT:
   1877     case OP_EXACTI:
   1878     case OP_NOTEXACT:
   1879     case OP_NOTEXACTI:
   1880     branchlength += (int)GET2(cc,1);
   1881     cc += 2 + IMM2_SIZE;
   1882 #ifdef SUPPORT_UTF
   1883     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
   1884 #endif
   1885     break;
   1886 
   1887     case OP_TYPEEXACT:
   1888     branchlength += GET2(cc,1);
   1889     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
   1890       cc += 2;
   1891     cc += 1 + IMM2_SIZE + 1;
   1892     break;
   1893 
   1894     /* Handle single-char matchers */
   1895 
   1896     case OP_PROP:
   1897     case OP_NOTPROP:
   1898     cc += 2;
   1899     /* Fall through */
   1900 
   1901     case OP_HSPACE:
   1902     case OP_VSPACE:
   1903     case OP_NOT_HSPACE:
   1904     case OP_NOT_VSPACE:
   1905     case OP_NOT_DIGIT:
   1906     case OP_DIGIT:
   1907     case OP_NOT_WHITESPACE:
   1908     case OP_WHITESPACE:
   1909     case OP_NOT_WORDCHAR:
   1910     case OP_WORDCHAR:
   1911     case OP_ANY:
   1912     case OP_ALLANY:
   1913     branchlength++;
   1914     cc++;
   1915     break;
   1916 
   1917     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
   1918     otherwise \C is coded as OP_ALLANY. */
   1919 
   1920     case OP_ANYBYTE:
   1921     return -2;
   1922 
   1923     /* Check a class for variable quantification */
   1924 
   1925     case OP_CLASS:
   1926     case OP_NCLASS:
   1927 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1928     case OP_XCLASS:
   1929     /* The original code caused an unsigned overflow in 64 bit systems,
   1930     so now we use a conditional statement. */
   1931     if (op == OP_XCLASS)
   1932       cc += GET(cc, 1);
   1933     else
   1934       cc += PRIV(OP_lengths)[OP_CLASS];
   1935 #else
   1936     cc += PRIV(OP_lengths)[OP_CLASS];
   1937 #endif
   1938 
   1939     switch (*cc)
   1940       {
   1941       case OP_CRSTAR:
   1942       case OP_CRMINSTAR:
   1943       case OP_CRPLUS:
   1944       case OP_CRMINPLUS:
   1945       case OP_CRQUERY:
   1946       case OP_CRMINQUERY:
   1947       case OP_CRPOSSTAR:
   1948       case OP_CRPOSPLUS:
   1949       case OP_CRPOSQUERY:
   1950       return -1;
   1951 
   1952       case OP_CRRANGE:
   1953       case OP_CRMINRANGE:
   1954       case OP_CRPOSRANGE:
   1955       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
   1956       branchlength += (int)GET2(cc,1);
   1957       cc += 1 + 2 * IMM2_SIZE;
   1958       break;
   1959 
   1960       default:
   1961       branchlength++;
   1962       }
   1963     break;
   1964 
   1965     /* Anything else is variable length */
   1966 
   1967     case OP_ANYNL:
   1968     case OP_BRAMINZERO:
   1969     case OP_BRAPOS:
   1970     case OP_BRAPOSZERO:
   1971     case OP_BRAZERO:
   1972     case OP_CBRAPOS:
   1973     case OP_EXTUNI:
   1974     case OP_KETRMAX:
   1975     case OP_KETRMIN:
   1976     case OP_KETRPOS:
   1977     case OP_MINPLUS:
   1978     case OP_MINPLUSI:
   1979     case OP_MINQUERY:
   1980     case OP_MINQUERYI:
   1981     case OP_MINSTAR:
   1982     case OP_MINSTARI:
   1983     case OP_MINUPTO:
   1984     case OP_MINUPTOI:
   1985     case OP_NOTMINPLUS:
   1986     case OP_NOTMINPLUSI:
   1987     case OP_NOTMINQUERY:
   1988     case OP_NOTMINQUERYI:
   1989     case OP_NOTMINSTAR:
   1990     case OP_NOTMINSTARI:
   1991     case OP_NOTMINUPTO:
   1992     case OP_NOTMINUPTOI:
   1993     case OP_NOTPLUS:
   1994     case OP_NOTPLUSI:
   1995     case OP_NOTPOSPLUS:
   1996     case OP_NOTPOSPLUSI:
   1997     case OP_NOTPOSQUERY:
   1998     case OP_NOTPOSQUERYI:
   1999     case OP_NOTPOSSTAR:
   2000     case OP_NOTPOSSTARI:
   2001     case OP_NOTPOSUPTO:
   2002     case OP_NOTPOSUPTOI:
   2003     case OP_NOTQUERY:
   2004     case OP_NOTQUERYI:
   2005     case OP_NOTSTAR:
   2006     case OP_NOTSTARI:
   2007     case OP_NOTUPTO:
   2008     case OP_NOTUPTOI:
   2009     case OP_PLUS:
   2010     case OP_PLUSI:
   2011     case OP_POSPLUS:
   2012     case OP_POSPLUSI:
   2013     case OP_POSQUERY:
   2014     case OP_POSQUERYI:
   2015     case OP_POSSTAR:
   2016     case OP_POSSTARI:
   2017     case OP_POSUPTO:
   2018     case OP_POSUPTOI:
   2019     case OP_QUERY:
   2020     case OP_QUERYI:
   2021     case OP_REF:
   2022     case OP_REFI:
   2023     case OP_DNREF:
   2024     case OP_DNREFI:
   2025     case OP_SBRA:
   2026     case OP_SBRAPOS:
   2027     case OP_SCBRA:
   2028     case OP_SCBRAPOS:
   2029     case OP_SCOND:
   2030     case OP_SKIPZERO:
   2031     case OP_STAR:
   2032     case OP_STARI:
   2033     case OP_TYPEMINPLUS:
   2034     case OP_TYPEMINQUERY:
   2035     case OP_TYPEMINSTAR:
   2036     case OP_TYPEMINUPTO:
   2037     case OP_TYPEPLUS:
   2038     case OP_TYPEPOSPLUS:
   2039     case OP_TYPEPOSQUERY:
   2040     case OP_TYPEPOSSTAR:
   2041     case OP_TYPEPOSUPTO:
   2042     case OP_TYPEQUERY:
   2043     case OP_TYPESTAR:
   2044     case OP_TYPEUPTO:
   2045     case OP_UPTO:
   2046     case OP_UPTOI:
   2047     return -1;
   2048 
   2049     /* Catch unrecognized opcodes so that when new ones are added they
   2050     are not forgotten, as has happened in the past. */
   2051 
   2052     default:
   2053     return -4;
   2054     }
   2055   }
   2056 /* Control never gets here */
   2057 }
   2058 
   2059 
   2060 
   2061 /*************************************************
   2062 *    Scan compiled regex for specific bracket    *
   2063 *************************************************/
   2064 
   2065 /* This little function scans through a compiled pattern until it finds a
   2066 capturing bracket with the given number, or, if the number is negative, an
   2067 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
   2068 so that it can be called from pcre_study() when finding the minimum matching
   2069 length.
   2070 
   2071 Arguments:
   2072   code        points to start of expression
   2073   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   2074   number      the required bracket number or negative to find a lookbehind
   2075 
   2076 Returns:      pointer to the opcode for the bracket, or NULL if not found
   2077 */
   2078 
   2079 const pcre_uchar *
   2080 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
   2081 {
   2082 for (;;)
   2083   {
   2084   register pcre_uchar c = *code;
   2085 
   2086   if (c == OP_END) return NULL;
   2087 
   2088   /* XCLASS is used for classes that cannot be represented just by a bit
   2089   map. This includes negated single high-valued characters. The length in
   2090   the table is zero; the actual length is stored in the compiled code. */
   2091 
   2092   if (c == OP_XCLASS) code += GET(code, 1);
   2093 
   2094   /* Handle recursion */
   2095 
   2096   else if (c == OP_REVERSE)
   2097     {
   2098     if (number < 0) return (pcre_uchar *)code;
   2099     code += PRIV(OP_lengths)[c];
   2100     }
   2101 
   2102   /* Handle capturing bracket */
   2103 
   2104   else if (c == OP_CBRA || c == OP_SCBRA ||
   2105            c == OP_CBRAPOS || c == OP_SCBRAPOS)
   2106     {
   2107     int n = (int)GET2(code, 1+LINK_SIZE);
   2108     if (n == number) return (pcre_uchar *)code;
   2109     code += PRIV(OP_lengths)[c];
   2110     }
   2111 
   2112   /* Otherwise, we can get the item's length from the table, except that for
   2113   repeated character types, we have to test for \p and \P, which have an extra
   2114   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
   2115   must add in its length. */
   2116 
   2117   else
   2118     {
   2119     switch(c)
   2120       {
   2121       case OP_TYPESTAR:
   2122       case OP_TYPEMINSTAR:
   2123       case OP_TYPEPLUS:
   2124       case OP_TYPEMINPLUS:
   2125       case OP_TYPEQUERY:
   2126       case OP_TYPEMINQUERY:
   2127       case OP_TYPEPOSSTAR:
   2128       case OP_TYPEPOSPLUS:
   2129       case OP_TYPEPOSQUERY:
   2130       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2131       break;
   2132 
   2133       case OP_TYPEUPTO:
   2134       case OP_TYPEMINUPTO:
   2135       case OP_TYPEEXACT:
   2136       case OP_TYPEPOSUPTO:
   2137       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   2138         code += 2;
   2139       break;
   2140 
   2141       case OP_MARK:
   2142       case OP_PRUNE_ARG:
   2143       case OP_SKIP_ARG:
   2144       case OP_THEN_ARG:
   2145       code += code[1];
   2146       break;
   2147       }
   2148 
   2149     /* Add in the fixed length from the table */
   2150 
   2151     code += PRIV(OP_lengths)[c];
   2152 
   2153   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
   2154   a multi-byte character. The length in the table is a minimum, so we have to
   2155   arrange to skip the extra bytes. */
   2156 
   2157 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   2158     if (utf) switch(c)
   2159       {
   2160       case OP_CHAR:
   2161       case OP_CHARI:
   2162       case OP_NOT:
   2163       case OP_NOTI:
   2164       case OP_EXACT:
   2165       case OP_EXACTI:
   2166       case OP_NOTEXACT:
   2167       case OP_NOTEXACTI:
   2168       case OP_UPTO:
   2169       case OP_UPTOI:
   2170       case OP_NOTUPTO:
   2171       case OP_NOTUPTOI:
   2172       case OP_MINUPTO:
   2173       case OP_MINUPTOI:
   2174       case OP_NOTMINUPTO:
   2175       case OP_NOTMINUPTOI:
   2176       case OP_POSUPTO:
   2177       case OP_POSUPTOI:
   2178       case OP_NOTPOSUPTO:
   2179       case OP_NOTPOSUPTOI:
   2180       case OP_STAR:
   2181       case OP_STARI:
   2182       case OP_NOTSTAR:
   2183       case OP_NOTSTARI:
   2184       case OP_MINSTAR:
   2185       case OP_MINSTARI:
   2186       case OP_NOTMINSTAR:
   2187       case OP_NOTMINSTARI:
   2188       case OP_POSSTAR:
   2189       case OP_POSSTARI:
   2190       case OP_NOTPOSSTAR:
   2191       case OP_NOTPOSSTARI:
   2192       case OP_PLUS:
   2193       case OP_PLUSI:
   2194       case OP_NOTPLUS:
   2195       case OP_NOTPLUSI:
   2196       case OP_MINPLUS:
   2197       case OP_MINPLUSI:
   2198       case OP_NOTMINPLUS:
   2199       case OP_NOTMINPLUSI:
   2200       case OP_POSPLUS:
   2201       case OP_POSPLUSI:
   2202       case OP_NOTPOSPLUS:
   2203       case OP_NOTPOSPLUSI:
   2204       case OP_QUERY:
   2205       case OP_QUERYI:
   2206       case OP_NOTQUERY:
   2207       case OP_NOTQUERYI:
   2208       case OP_MINQUERY:
   2209       case OP_MINQUERYI:
   2210       case OP_NOTMINQUERY:
   2211       case OP_NOTMINQUERYI:
   2212       case OP_POSQUERY:
   2213       case OP_POSQUERYI:
   2214       case OP_NOTPOSQUERY:
   2215       case OP_NOTPOSQUERYI:
   2216       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   2217       break;
   2218       }
   2219 #else
   2220     (void)(utf);  /* Keep compiler happy by referencing function argument */
   2221 #endif
   2222     }
   2223   }
   2224 }
   2225 
   2226 
   2227 
   2228 /*************************************************
   2229 *   Scan compiled regex for recursion reference  *
   2230 *************************************************/
   2231 
   2232 /* This little function scans through a compiled pattern until it finds an
   2233 instance of OP_RECURSE.
   2234 
   2235 Arguments:
   2236   code        points to start of expression
   2237   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   2238 
   2239 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
   2240 */
   2241 
   2242 static const pcre_uchar *
   2243 find_recurse(const pcre_uchar *code, BOOL utf)
   2244 {
        /* Step through the compiled items one at a time. Each pass of the loop
        advances code by the full length of the item it points at, and the loop
        only exits by returning (on OP_END or OP_RECURSE). */
   2245 for (;;)
   2246   {
   2247   register pcre_uchar c = *code;
   2248   if (c == OP_END) return NULL;
   2249   if (c == OP_RECURSE) return code;
   2250 
   2251   /* XCLASS is used for classes that cannot be represented just by a bit
   2252   map. This includes negated single high-valued characters. The length in
   2253   the table is zero; the actual length is stored in the compiled code. */
   2254 
   2255   if (c == OP_XCLASS) code += GET(code, 1);
   2256 
   2257   /* Otherwise, we can get the item's length from the table, except that for
   2258   repeated character types, we have to test for \p and \P, which have an extra
   2259   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
   2260   must add in its length. */
   2261 
   2262   else
   2263     {
        /* This switch only adds the variable part of the length; the fixed part
        is added from the table afterwards. Opcodes not listed add nothing. */
   2264     switch(c)
   2265       {
   2266       case OP_TYPESTAR:
   2267       case OP_TYPEMINSTAR:
   2268       case OP_TYPEPLUS:
   2269       case OP_TYPEMINPLUS:
   2270       case OP_TYPEQUERY:
   2271       case OP_TYPEMINQUERY:
   2272       case OP_TYPEPOSSTAR:
   2273       case OP_TYPEPOSPLUS:
   2274       case OP_TYPEPOSQUERY:
        /* The repeated type code is the unit right after the opcode. */
   2275       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2276       break;
   2277 
   2278       case OP_TYPEPOSUPTO:
   2279       case OP_TYPEUPTO:
   2280       case OP_TYPEMINUPTO:
   2281       case OP_TYPEEXACT:
        /* Here the type code sits after an IMM2_SIZE-wide field (presumably the
        repeat count - confirm against the opcode layout). */
   2282       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   2283         code += 2;
   2284       break;
   2285 
   2286       case OP_MARK:
   2287       case OP_PRUNE_ARG:
   2288       case OP_SKIP_ARG:
   2289       case OP_THEN_ARG:
        /* The unit after the opcode is added as the argument's length. */
   2290       code += code[1];
   2291       break;
   2292       }
   2293 
   2294     /* Add in the fixed length from the table */
   2295 
   2296     code += PRIV(OP_lengths)[c];
   2297 
   2298     /* In UTF-8 mode, opcodes that are followed by a character may be followed
   2299     by a multi-byte character. The length in the table is a minimum, so we have
   2300     to arrange to skip the extra bytes. */
   2301 
   2302 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   2303     if (utf) switch(c)
   2304       {
   2305       case OP_CHAR:
   2306       case OP_CHARI:
   2307       case OP_NOT:
   2308       case OP_NOTI:
   2309       case OP_EXACT:
   2310       case OP_EXACTI:
   2311       case OP_NOTEXACT:
   2312       case OP_NOTEXACTI:
   2313       case OP_UPTO:
   2314       case OP_UPTOI:
   2315       case OP_NOTUPTO:
   2316       case OP_NOTUPTOI:
   2317       case OP_MINUPTO:
   2318       case OP_MINUPTOI:
   2319       case OP_NOTMINUPTO:
   2320       case OP_NOTMINUPTOI:
   2321       case OP_POSUPTO:
   2322       case OP_POSUPTOI:
   2323       case OP_NOTPOSUPTO:
   2324       case OP_NOTPOSUPTOI:
   2325       case OP_STAR:
   2326       case OP_STARI:
   2327       case OP_NOTSTAR:
   2328       case OP_NOTSTARI:
   2329       case OP_MINSTAR:
   2330       case OP_MINSTARI:
   2331       case OP_NOTMINSTAR:
   2332       case OP_NOTMINSTARI:
   2333       case OP_POSSTAR:
   2334       case OP_POSSTARI:
   2335       case OP_NOTPOSSTAR:
   2336       case OP_NOTPOSSTARI:
   2337       case OP_PLUS:
   2338       case OP_PLUSI:
   2339       case OP_NOTPLUS:
   2340       case OP_NOTPLUSI:
   2341       case OP_MINPLUS:
   2342       case OP_MINPLUSI:
   2343       case OP_NOTMINPLUS:
   2344       case OP_NOTMINPLUSI:
   2345       case OP_POSPLUS:
   2346       case OP_POSPLUSI:
   2347       case OP_NOTPOSPLUS:
   2348       case OP_NOTPOSPLUSI:
   2349       case OP_QUERY:
   2350       case OP_QUERYI:
   2351       case OP_NOTQUERY:
   2352       case OP_NOTQUERYI:
   2353       case OP_MINQUERY:
   2354       case OP_MINQUERYI:
   2355       case OP_NOTMINQUERY:
   2356       case OP_NOTMINQUERYI:
   2357       case OP_POSQUERY:
   2358       case OP_POSQUERYI:
   2359       case OP_NOTPOSQUERY:
   2360       case OP_NOTPOSQUERYI:
        /* code has already been advanced by the table length, so code[-1] is the
        last unit that length covered; it is the unit tested for extra length. */
   2361       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   2362       break;
   2363       }
   2364 #else
   2365     (void)(utf);  /* Keep compiler happy by referencing function argument */
   2366 #endif
   2367     }
   2368   }
   2369 }
   2370 
   2371 
   2372 
   2373 /*************************************************
   2374 *    Scan compiled branch for non-emptiness      *
   2375 *************************************************/
   2376 
   2377 /* This function scans through a branch of a compiled pattern to see whether it
   2378 can match the empty string or not. It is called from could_be_empty()
   2379 below and from compile_branch() when checking for an unlimited repeat of a
   2380 group that can match nothing. Note that first_significant_code() skips over
   2381 backward and negative forward assertions when its final argument is TRUE. If we
   2382 hit an unclosed bracket, we return "empty" - this means we've struck an inner
   2383 bracket whose current branch will already have been scanned.
   2384 
   2385 Arguments:
   2386   code        points to start of search
   2387   endcode     points to where to stop
   2388   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
   2389   cd          contains pointers to tables etc.
   2390   recurses    chain of recurse_check to catch mutual recursion
   2391 
   2392 Returns:      TRUE if what is matched could be empty
   2393 */
   2394 
   2395 static BOOL
   2396 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
   2397   BOOL utf, compile_data *cd, recurse_check *recurses)
   2398 {
   2399 register pcre_uchar c;
   2400 recurse_check this_recurse;
   2401 
        /* The initializer steps over the opcode that code points at on entry. The
        increment expression uses c, which is assigned inside the loop body: every
        path that ends or continues an iteration must leave c holding the opcode
        that code then points at, so that the right table length is added. */
   2402 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
   2403      code < endcode;
   2404      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
   2405   {
   2406   const pcre_uchar *ccode;
   2407 
   2408   c = *code;
   2409 
   2410   /* Skip over forward assertions; the other assertions are skipped by
   2411   first_significant_code() with a TRUE final argument. */
   2412 
   2413   if (c == OP_ASSERT)
   2414     {
        /* Follow the branch links to the item that ends the assertion group. */
   2415     do code += GET(code, 1); while (*code == OP_ALT);
   2416     c = *code;
   2417     continue;
   2418     }
   2419 
   2420   /* For a recursion/subroutine call, if its end has been reached, which
   2421   implies a backward reference subroutine call, we can scan it. If it's a
   2422   forward reference subroutine call, we can't. To detect forward reference
   2423   we have to scan up the list that is kept in the workspace. This function is
   2424   called only when doing the real compile, not during the pre-compile that
   2425   measures the size of the compiled pattern. */
   2426 
   2427   if (c == OP_RECURSE)
   2428     {
        /* The RECURSE item holds an offset from cd->start_code to the group that
        is called. */
   2429     const pcre_uchar *scode = cd->start_code + GET(code, 1);
   2430     const pcre_uchar *endgroup = scode;
   2431     BOOL empty_branch;
   2432 
   2433     /* Test for forward reference or uncompleted reference. This is disabled
   2434     when called to scan a completed pattern by setting cd->start_workspace to
   2435     NULL. */
   2436 
   2437     if (cd->start_workspace != NULL)
   2438       {
   2439       const pcre_uchar *tcode;
        /* A workspace entry equal to the offset of this item's link field is
        treated as a forward reference: report "could be empty". */
   2440       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
   2441         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
   2442       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
   2443       }
   2444 
   2445     /* If the reference is to a completed group, we need to detect whether this
   2446     is a recursive call, as otherwise there will be an infinite loop. If it is
   2447     a recursion, just skip over it. Simple recursions are easily detected. For
   2448     mutual recursions we keep a chain on the stack. */
   2449 
        /* In the "continue" paths below c is still OP_RECURSE, so the loop
        increment steps over this RECURSE item. */
   2450     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
   2451     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
   2452     else
   2453       {
   2454       recurse_check *r = recurses;
   2455       for (r = recurses; r != NULL; r = r->prev)
   2456         if (r->group == scode) break;
   2457       if (r != NULL) continue;   /* Mutual recursion */
   2458       }
   2459 
   2460     /* Completed reference; scan the referenced group, remembering it on the
   2461     stack chain to detect mutual recursions. */
   2462 
        /* this_recurse is a local of this call; it is linked in front of the
        caller's chain and handed to the nested scans. */
   2463     empty_branch = FALSE;
   2464     this_recurse.prev = recurses;
   2465     this_recurse.group = scode;
   2466 
   2467     do
   2468       {
   2469       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
   2470         {
   2471         empty_branch = TRUE;
   2472         break;
   2473         }
   2474       scode += GET(scode, 1);
   2475       }
   2476     while (*scode == OP_ALT);
   2477 
   2478     if (!empty_branch) return FALSE;  /* All branches are non-empty */
   2479     continue;
   2480     }
   2481 
   2482   /* Groups with zero repeats can of course be empty; skip them. */
   2483 
   2484   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
   2485       c == OP_BRAPOSZERO)
   2486     {
        /* Step over the zero-repeat opcode itself, then over the whole group
        that follows it. */
   2487     code += PRIV(OP_lengths)[c];
   2488     do code += GET(code, 1); while (*code == OP_ALT);
   2489     c = *code;
   2490     continue;
   2491     }
   2492 
   2493   /* A nested group that is already marked as "could be empty" can just be
   2494   skipped. */
   2495 
   2496   if (c == OP_SBRA  || c == OP_SBRAPOS ||
   2497       c == OP_SCBRA || c == OP_SCBRAPOS)
   2498     {
   2499     do code += GET(code, 1); while (*code == OP_ALT);
   2500     c = *code;
   2501     continue;
   2502     }
   2503 
   2504   /* For other groups, scan the branches. */
   2505 
   2506   if (c == OP_BRA  || c == OP_BRAPOS ||
   2507       c == OP_CBRA || c == OP_CBRAPOS ||
   2508       c == OP_ONCE || c == OP_ONCE_NC ||
   2509       c == OP_COND || c == OP_SCOND)
   2510     {
   2511     BOOL empty_branch;
   2512     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
   2513 
   2514     /* If a conditional group has only one branch, there is a second, implied,
   2515     empty branch, so just skip over the conditional, because it could be empty.
   2516     Otherwise, scan the individual branches of the group. */
   2517 
   2518     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
   2519       code += GET(code, 1);
   2520     else
   2521       {
        /* Once one branch has been found that can be empty, the remaining
        branches are only stepped over, not scanned again. */
   2522       empty_branch = FALSE;
   2523       do
   2524         {
   2525         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
   2526           recurses)) empty_branch = TRUE;
   2527         code += GET(code, 1);
   2528         }
   2529       while (*code == OP_ALT);
   2530       if (!empty_branch) return FALSE;   /* All branches are non-empty */
   2531       }
   2532 
   2533     c = *code;
   2534     continue;
   2535     }
   2536 
   2537   /* Handle the other opcodes */
   2538 
   2539   switch (c)
   2540     {
   2541     /* Check for quantifiers after a class. XCLASS is used for classes that
   2542     cannot be represented just by a bit map. This includes negated single
   2543     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
   2544     actual length is stored in the compiled code, so we must update "code"
   2545     here. */
   2546 
   2547 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   2548     case OP_XCLASS:
        /* Both code and ccode end up just past the XCLASS item, which is where
        a repeat opcode would be. */
   2549     ccode = code += GET(code, 1);
   2550     goto CHECK_CLASS_REPEAT;
   2551 #endif
   2552 
   2553     case OP_CLASS:
   2554     case OP_NCLASS:
        /* Only ccode is moved here; code is advanced by the loop increment. */
   2555     ccode = code + PRIV(OP_lengths)[OP_CLASS];
   2556 
   2557 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   2558     CHECK_CLASS_REPEAT:
   2559 #endif
   2560 
   2561     switch (*ccode)
   2562       {
   2563       case OP_CRSTAR:            /* These could be empty; continue */
   2564       case OP_CRMINSTAR:
   2565       case OP_CRQUERY:
   2566       case OP_CRMINQUERY:
   2567       case OP_CRPOSSTAR:
   2568       case OP_CRPOSQUERY:
   2569       break;
   2570 
   2571       default:                   /* Non-repeat => class must match */
   2572       case OP_CRPLUS:            /* These repeats aren't empty */
   2573       case OP_CRMINPLUS:
   2574       case OP_CRPOSPLUS:
   2575       return FALSE;
   2576 
   2577       case OP_CRRANGE:
   2578       case OP_CRMINRANGE:
   2579       case OP_CRPOSRANGE:
   2580       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
   2581       break;
   2582       }
   2583     break;
   2584 
   2585     /* Opcodes that must match a character */
   2586 
   2587     case OP_ANY:
   2588     case OP_ALLANY:
   2589     case OP_ANYBYTE:
   2590 
   2591     case OP_PROP:
   2592     case OP_NOTPROP:
   2593     case OP_ANYNL:
   2594 
   2595     case OP_NOT_HSPACE:
   2596     case OP_HSPACE:
   2597     case OP_NOT_VSPACE:
   2598     case OP_VSPACE:
   2599     case OP_EXTUNI:
   2600 
   2601     case OP_NOT_DIGIT:
   2602     case OP_DIGIT:
   2603     case OP_NOT_WHITESPACE:
   2604     case OP_WHITESPACE:
   2605     case OP_NOT_WORDCHAR:
   2606     case OP_WORDCHAR:
   2607 
   2608     case OP_CHAR:
   2609     case OP_CHARI:
   2610     case OP_NOT:
   2611     case OP_NOTI:
   2612 
   2613     case OP_PLUS:
   2614     case OP_PLUSI:
   2615     case OP_MINPLUS:
   2616     case OP_MINPLUSI:
   2617 
   2618     case OP_NOTPLUS:
   2619     case OP_NOTPLUSI:
   2620     case OP_NOTMINPLUS:
   2621     case OP_NOTMINPLUSI:
   2622 
   2623     case OP_POSPLUS:
   2624     case OP_POSPLUSI:
   2625     case OP_NOTPOSPLUS:
   2626     case OP_NOTPOSPLUSI:
   2627 
   2628     case OP_EXACT:
   2629     case OP_EXACTI:
   2630     case OP_NOTEXACT:
   2631     case OP_NOTEXACTI:
   2632 
   2633     case OP_TYPEPLUS:
   2634     case OP_TYPEMINPLUS:
   2635     case OP_TYPEPOSPLUS:
   2636     case OP_TYPEEXACT:
   2637 
   2638     return FALSE;
   2639 
   2640     /* These are going to continue, as they may be empty, but we have to
   2641     fudge the length for the \p and \P cases. */
   2642 
   2643     case OP_TYPESTAR:
   2644     case OP_TYPEMINSTAR:
   2645     case OP_TYPEPOSSTAR:
   2646     case OP_TYPEQUERY:
   2647     case OP_TYPEMINQUERY:
   2648     case OP_TYPEPOSQUERY:
   2649     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2650     break;
   2651 
   2652     /* Same for these */
   2653 
   2654     case OP_TYPEUPTO:
   2655     case OP_TYPEMINUPTO:
   2656     case OP_TYPEPOSUPTO:
   2657     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   2658       code += 2;
   2659     break;
   2660 
   2661     /* End of branch */
   2662 
   2663     case OP_KET:
   2664     case OP_KETRMAX:
   2665     case OP_KETRMIN:
   2666     case OP_KETRPOS:
   2667     case OP_ALT:
   2668     return TRUE;
   2669 
   2670     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
   2671     MINUPTO, and POSUPTO and their caseless and negative versions may be
   2672     followed by a multibyte character. */
   2673 
   2674 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   2675     case OP_STAR:
   2676     case OP_STARI:
   2677     case OP_NOTSTAR:
   2678     case OP_NOTSTARI:
   2679 
   2680     case OP_MINSTAR:
   2681     case OP_MINSTARI:
   2682     case OP_NOTMINSTAR:
   2683     case OP_NOTMINSTARI:
   2684 
   2685     case OP_POSSTAR:
   2686     case OP_POSSTARI:
   2687     case OP_NOTPOSSTAR:
   2688     case OP_NOTPOSSTARI:
   2689 
   2690     case OP_QUERY:
   2691     case OP_QUERYI:
   2692     case OP_NOTQUERY:
   2693     case OP_NOTQUERYI:
   2694 
   2695     case OP_MINQUERY:
   2696     case OP_MINQUERYI:
   2697     case OP_NOTMINQUERY:
   2698     case OP_NOTMINQUERYI:
   2699 
   2700     case OP_POSQUERY:
   2701     case OP_POSQUERYI:
   2702     case OP_NOTPOSQUERY:
   2703     case OP_NOTPOSQUERYI:
   2704 
        /* Only the extra units are skipped here; the loop increment adds the
        table length for the opcode afterwards. */
   2705     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
   2706     break;
   2707 
   2708     case OP_UPTO:
   2709     case OP_UPTOI:
   2710     case OP_NOTUPTO:
   2711     case OP_NOTUPTOI:
   2712 
   2713     case OP_MINUPTO:
   2714     case OP_MINUPTOI:
   2715     case OP_NOTMINUPTO:
   2716     case OP_NOTMINUPTOI:
   2717 
   2718     case OP_POSUPTO:
   2719     case OP_POSUPTOI:
   2720     case OP_NOTPOSUPTO:
   2721     case OP_NOTPOSUPTOI:
   2722 
        /* As above, but the character starts after an IMM2_SIZE-wide field. */
   2723     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
   2724     break;
   2725 #endif
   2726 
   2727     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
   2728     string. */
   2729 
   2730     case OP_MARK:
   2731     case OP_PRUNE_ARG:
   2732     case OP_SKIP_ARG:
   2733     case OP_THEN_ARG:
   2734     code += code[1];
   2735     break;
   2736 
   2737     /* None of the remaining opcodes are required to match a character. */
   2738 
   2739     default:
   2740     break;
   2741     }
   2742   }
   2743 
        /* Reached endcode without meeting anything that must match a character. */
   2744 return TRUE;
   2745 }
   2746 
   2747 
   2748 
   2749 /*************************************************
   2750 *    Scan compiled regex for non-emptiness       *
   2751 *************************************************/
   2752 
   2753 /* This function is called to check for left recursive calls. We want to check
   2754 the current branch of the current pattern to see if it could match the empty
   2755 string. If it could, we must look outwards for branches at other levels,
   2756 stopping when we pass beyond the bracket which is the subject of the recursion.
   2757 This function is called only during the real compile, not during the
   2758 pre-compile.
   2759 
   2760 Arguments:
   2761   code        points to start of the recursion
   2762   endcode     points to where to stop (current RECURSE item)
   2763   bcptr       points to the chain of current (unclosed) branch starts
   2764   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
   2765   cd          pointers to tables etc
   2766 
   2767 Returns:      TRUE if what is matched could be empty
   2768 */
   2769 
   2770 static BOOL
   2771 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
   2772   branch_chain *bcptr, BOOL utf, compile_data *cd)
   2773 {
        /* Walk outwards along the chain of unclosed branches for as long as the
        branch starts at or after code. The result is FALSE as soon as one of those
        branches cannot match the empty string, and TRUE if none of them objects. */
   2774 branch_chain *bc;
   2775 for (bc = bcptr; bc != NULL && bc->current_branch >= code; bc = bc->outer)
   2776   {
   2777   const pcre_uchar *branch = bc->current_branch;
   2778   if (!could_be_empty_branch(branch, endcode, utf, cd, NULL)) return FALSE;
   2779   }
   2780 return TRUE;
   2781 }
   2782 
   2783 
   2784 
   2785 /*************************************************
   2786 *        Base opcode of repeated opcodes         *
   2787 *************************************************/
   2788 
   2789 /* Returns the base opcode for repeated single character type opcodes. If the
   2790 opcode is not a repeated character type, it returns with the original value.
   2791 
   2792 Arguments:  c opcode
   2793 Returns:    base opcode for the type
   2794 */
   2795 
   2796 static pcre_uchar
   2797 get_repeat_base(pcre_uchar c)
   2798 {
        /* Test the thresholds from the highest downwards; the first one that c
        reaches names its family. Anything above OP_TYPEPOSUPTO is returned as is. */
   2799 if (c > OP_TYPEPOSUPTO) return c;
   2800 if (c >= OP_TYPESTAR)   return OP_TYPESTAR;
   2801 if (c >= OP_NOTSTARI)   return OP_NOTSTARI;
   2802 if (c >= OP_NOTSTAR)    return OP_NOTSTAR;
   2803 if (c >= OP_STARI)      return OP_STARI;
   2804 return OP_STAR;
   2805 }
   2806 
   2807 
   2808 
   2809 #ifdef SUPPORT_UCP
   2810 /*************************************************
   2811 *        Check a character and a property        *
   2812 *************************************************/
   2813 
   2814 /* This function is called by check_auto_possessive() when a property item
   2815 is adjacent to a fixed character.
   2816 
   2817 Arguments:
   2818   c            the character
   2819   ptype        the property type
   2820   pdata        the data for the type
   2821   negated      TRUE if it's a negated property (\P or \p{^)
   2822 
   2823 Returns:       TRUE if auto-possessifying is OK
   2824 */
   2825 
   2826 static BOOL
   2827 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
   2828   BOOL negated)
   2829 {
   2830 const ucd_record *rec = GET_UCD(c);
   2831 const pcre_uint32 *setp;
   2832 BOOL hit;
   2833 
        /* Each case works out whether c satisfies the test for that property type
        ("hit") and returns the comparison of that outcome with "negated". */
   2834 switch(ptype)
   2835   {
   2836   case PT_LAMP:
   2837   hit = rec->chartype == ucp_Lu || rec->chartype == ucp_Ll ||
   2838         rec->chartype == ucp_Lt;
   2839   return hit == negated;
   2840 
   2841   case PT_GC:
   2842   hit = pdata == PRIV(ucp_gentype)[rec->chartype];
   2843   return hit == negated;
   2844 
   2845   case PT_PC:
   2846   hit = pdata == rec->chartype;
   2847   return hit == negated;
   2848 
   2849   case PT_SC:
   2850   hit = pdata == rec->script;
   2851   return hit == negated;
   2852 
   2853   /* The remaining property types are specials. */
   2854   case PT_ALNUM:
   2855   hit = PRIV(ucp_gentype)[rec->chartype] == ucp_L ||
   2856         PRIV(ucp_gentype)[rec->chartype] == ucp_N;
   2857   return hit == negated;
   2858 
   2859   /* Since PCRE 8.34 Perl space includes VT, so it equals POSIX space. */
   2860   case PT_SPACE:    /* Perl space */
   2861   case PT_PXSPACE:  /* POSIX space */
   2862   switch(c)
   2863     {
   2864     HSPACE_CASES:
   2865     VSPACE_CASES:
   2866     return negated;
   2867     default:
   2868     hit = PRIV(ucp_gentype)[rec->chartype] == ucp_Z;
   2869     return hit == negated;
   2870     }
   2871   break;  /* Not reached: every arm of the inner switch returns */
   2872 
   2873   case PT_WORD:
   2874   hit = PRIV(ucp_gentype)[rec->chartype] == ucp_L ||
   2875         PRIV(ucp_gentype)[rec->chartype] == ucp_N ||
   2876         c == CHAR_UNDERSCORE;
   2877   return hit == negated;
   2878 
   2879   case PT_CLIST:
        /* Scan the caseless set; the scan stops at the first entry that is equal
        to c or greater than it, so the loop always ends by returning. */
   2880   for (setp = PRIV(ucd_caseless_sets) + rec->caseset; ; setp++)
   2881     {
   2882     if (c < *setp) return !negated;
   2883     if (c == *setp) return negated;
   2884     }
   2885   }
   2886 
   2887 return FALSE;
   2888 }
   2889 #endif  /* SUPPORT_UCP */
   2890 
   2891 
   2892 
   2893 /*************************************************
   2894 *        Fill the character property list        *
   2895 *************************************************/
   2896 
   2897 /* Checks whether the code points to an opcode that can take part in auto-
   2898 possessification, and if so, fills a list with its properties.
   2899 
   2900 Arguments:
   2901   code        points to start of expression
   2902   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
   2903   fcc         points to case-flipping table
   2904   list        points to output list
   2905               list[0] will be filled with the opcode
   2906               list[1] will be non-zero if this opcode
   2907                 can match an empty character string
   2908               list[2..7] depends on the opcode
   2909 
   2910 Returns:      points to the start of the next opcode if *code is accepted
   2911               NULL if *code is not accepted
   2912 */
   2913 
   2914 static const pcre_uchar *
   2915 get_chr_property_list(const pcre_uchar *code, BOOL utf,
   2916   const pcre_uint8 *fcc, pcre_uint32 *list)
   2917 {
   2918 pcre_uchar c = *code;
   2919 pcre_uchar base;
   2920 const pcre_uchar *end;
   2921 pcre_uint32 chr;
   2922 
   2923 #ifdef SUPPORT_UCP
   2924 pcre_uint32 *clist_dest;
   2925 const pcre_uint32 *clist_src;
   2926 #else
   2927 (void)(utf);  /* Suppress "unused parameter" compiler warning */
   2928 #endif
   2929 
        /* Start from "opcode as found, cannot match empty"; both entries may be
        overwritten below. code is moved past the opcode itself. */
   2930 list[0] = c;
   2931 list[1] = FALSE;
   2932 code++;
   2933 
        /* Repeated single characters and character types: strip the repeat so that
        list[0] names the underlying item, and record in list[1] whether the repeat
        allows zero occurrences. */
   2934 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
   2935   {
        /* Rebase c onto the OP_STAR family so that one set of comparisons serves
        the caseless, negated and type variants as well. */
   2936   base = get_repeat_base(c);
   2937   c -= (base - OP_STAR);
   2938 
        /* These forms carry an IMM2_SIZE-wide field after the opcode. */
   2939   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
   2940     code += IMM2_SIZE;
   2941 
   2942   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
   2943 
   2944   switch(base)
   2945     {
   2946     case OP_STAR:
   2947     list[0] = OP_CHAR;
   2948     break;
   2949 
   2950     case OP_STARI:
   2951     list[0] = OP_CHARI;
   2952     break;
   2953 
   2954     case OP_NOTSTAR:
   2955     list[0] = OP_NOT;
   2956     break;
   2957 
   2958     case OP_NOTSTARI:
   2959     list[0] = OP_NOTI;
   2960     break;
   2961 
        /* For a repeated type the type code is the next unit of the item. */
   2962     case OP_TYPESTAR:
   2963     list[0] = *code;
   2964     code++;
   2965     break;
   2966     }
   2967   c = list[0];
   2968   }
   2969 
        /* From here on c is the underlying (non-repeated) opcode. */
   2970 switch(c)
   2971   {
        /* Items that need no data beyond list[0] and list[1]. */
   2972   case OP_NOT_DIGIT:
   2973   case OP_DIGIT:
   2974   case OP_NOT_WHITESPACE:
   2975   case OP_WHITESPACE:
   2976   case OP_NOT_WORDCHAR:
   2977   case OP_WORDCHAR:
   2978   case OP_ANY:
   2979   case OP_ALLANY:
   2980   case OP_ANYNL:
   2981   case OP_NOT_HSPACE:
   2982   case OP_HSPACE:
   2983   case OP_NOT_VSPACE:
   2984   case OP_VSPACE:
   2985   case OP_EXTUNI:
   2986   case OP_EODN:
   2987   case OP_EOD:
   2988   case OP_DOLL:
   2989   case OP_DOLLM:
   2990   return code;
   2991 
        /* A single character: list[2] is the character, list[3] ends the list. */
   2992   case OP_CHAR:
   2993   case OP_NOT:
   2994   GETCHARINCTEST(chr, code);
   2995   list[2] = chr;
   2996   list[3] = NOTACHAR;
   2997   return code;
   2998 
        /* A caseless character is reported as OP_CHAR/OP_NOT with the character
        and, when it differs, its other case listed explicitly. */
   2999   case OP_CHARI:
   3000   case OP_NOTI:
   3001   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
   3002   GETCHARINCTEST(chr, code);
   3003   list[2] = chr;
   3004 
   3005 #ifdef SUPPORT_UCP
   3006   if (chr < 128 || (chr < 256 && !utf))
   3007     list[3] = fcc[chr];
   3008   else
   3009     list[3] = UCD_OTHERCASE(chr);
   3010 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3011   list[3] = (chr < 256) ? fcc[chr] : chr;
   3012 #else
   3013   list[3] = fcc[chr];
   3014 #endif
   3015 
   3016   /* The othercase might be the same value. */
   3017 
   3018   if (chr == list[3])
   3019     list[3] = NOTACHAR;
   3020   else
   3021     list[4] = NOTACHAR;
   3022   return code;
   3023 
   3024 #ifdef SUPPORT_UCP
   3025   case OP_PROP:
   3026   case OP_NOTPROP:
        /* Ordinary property: list[2] is the property type, list[3] its data. */
   3027   if (code[0] != PT_CLIST)
   3028     {
   3029     list[2] = code[0];
   3030     list[3] = code[1];
   3031     return code + 2;
   3032     }
   3033 
   3034   /* Convert only if we have enough space. */
   3035 
   3036   clist_src = PRIV(ucd_caseless_sets) + code[1];
   3037   clist_dest = list + 2;
   3038   code += 2;
   3039 
   3040   do {
   3041      if (clist_dest >= list + 8)
   3042        {
   3043        /* Early return if there is not enough space. This should never
   3044        happen, since all clists are shorter than 5 characters now. The item
        is then reported as an ordinary property, exactly as in the non-CLIST
        path above. code has already been moved past the two parameter units,
        so they are read back at code[-2] (type) and code[-1] (data); reading
        code[0] and code[1] here would pick up units of the following item. */
   3045        list[2] = code[-2];
   3046        list[3] = code[-1];
   3047        return code;
   3048        }
   3049      *clist_dest++ = *clist_src;
   3050      }
   3051   while(*clist_src++ != NOTACHAR);
   3052 
   3053   /* All characters are stored. The terminating NOTACHAR
   3054   is copied from the clist itself. */
   3055 
   3056   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
   3057   return code;
   3058 #endif
   3059 
        /* Character classes: find the end of the class data, then look at what
        follows it to see whether the class is repeated. */
   3060   case OP_NCLASS:
   3061   case OP_CLASS:
   3062 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3063   case OP_XCLASS:
        /* code is already one past the opcode, hence GET(code, 0) and the -1. */
   3064   if (c == OP_XCLASS)
   3065     end = code + GET(code, 0) - 1;
   3066   else
   3067 #endif
   3068     end = code + 32 / sizeof(pcre_uchar);
   3069 
   3070   switch(*end)
   3071     {
   3072     case OP_CRSTAR:
   3073     case OP_CRMINSTAR:
   3074     case OP_CRQUERY:
   3075     case OP_CRMINQUERY:
   3076     case OP_CRPOSSTAR:
   3077     case OP_CRPOSQUERY:
   3078     list[1] = TRUE;
   3079     end++;
   3080     break;
   3081 
   3082     case OP_CRPLUS:
   3083     case OP_CRMINPLUS:
   3084     case OP_CRPOSPLUS:
   3085     end++;
   3086     break;
   3087 
   3088     case OP_CRRANGE:
   3089     case OP_CRMINRANGE:
   3090     case OP_CRPOSRANGE:
        /* The first value after the opcode being zero means the class may match
        nothing; the opcode is followed by two IMM2_SIZE-wide values. */
   3091     list[1] = (GET2(end, 1) == 0);
   3092     end += 1 + 2 * IMM2_SIZE;
   3093     break;
   3094     }
        /* list[2] is the distance from the class data to the next item. */
   3095   list[2] = (pcre_uint32)(end - code);
   3096   return end;
   3097   }
   3098 return NULL;    /* Opcode not accepted */
   3099 }
   3100 
   3101 
   3102 
   3103 /*************************************************
   3104 *    Scan further character sets for match       *
   3105 *************************************************/
   3106 
   3107 /* Checks whether the base and the current opcode have a common character, in
   3108 which case the base cannot be possessified.
   3109 
   3110 Arguments:
   3111   code        points to the byte code
   3112   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   3113   cd          static compile data
   3114   base_list   the data list of the base opcode
   3115 
   3116 Returns:      TRUE if the auto-possessification is possible
   3117 */
   3118 
   3119 static BOOL
   3120 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
   3121   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
   3122 {
   3123 pcre_uchar c;
   3124 pcre_uint32 list[8];
   3125 const pcre_uint32 *chr_ptr;
   3126 const pcre_uint32 *ochr_ptr;
   3127 const pcre_uint32 *list_ptr;
   3128 const pcre_uchar *next_code;
   3129 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3130 const pcre_uchar *xclass_flags;
   3131 #endif
   3132 const pcre_uint8 *class_bitset;
   3133 const pcre_uint8 *set1, *set2, *set_end;
   3134 pcre_uint32 chr;
   3135 BOOL accepted, invert_bits;
   3136 BOOL entered_a_group = FALSE;
   3137 
   3138 if (*rec_limit == 0) return FALSE;
   3139 --(*rec_limit);
   3140 
   3141 /* Note: base_list[1] records whether the current opcode has a greedy
   3142 quantifier (represented by a non-zero value). This is different from the
   3143 other character type lists, which store here whether the character iterator
   3144 can match an empty string (also represented by a non-zero value). */
   3145 
   3146 for(;;)
   3147   {
   3148   /* All operations move the code pointer forward.
   3149   Therefore infinite recursions are not possible. */
   3150 
   3151   c = *code;
   3152 
   3153   /* Skip over callouts */
   3154 
   3155   if (c == OP_CALLOUT)
   3156     {
   3157     code += PRIV(OP_lengths)[c];
   3158     continue;
   3159     }
   3160 
   3161   if (c == OP_ALT)
   3162     {
   3163     do code += GET(code, 1); while (*code == OP_ALT);
   3164     c = *code;
   3165     }
   3166 
   3167   switch(c)
   3168     {
   3169     case OP_END:
   3170     case OP_KETRPOS:
   3171     /* TRUE only in greedy case. The non-greedy case could be replaced by
   3172     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
   3173     uses more memory, which we cannot get at this stage.) */
   3174 
   3175     return base_list[1] != 0;
   3176 
   3177     case OP_KET:
   3178     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
   3179     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
   3180     cannot be converted to a possessive form. */
   3181 
   3182     if (base_list[1] == 0) return FALSE;
   3183 
   3184     switch(*(code - GET(code, 1)))
   3185       {
   3186       case OP_ASSERT:
   3187       case OP_ASSERT_NOT:
   3188       case OP_ASSERTBACK:
   3189       case OP_ASSERTBACK_NOT:
   3190       case OP_ONCE:
   3191       case OP_ONCE_NC:
   3192       /* Atomic sub-patterns and assertions can always auto-possessify their
   3193       last iterator. However, if the group was entered as a result of checking
   3194       a previous iterator, this is not possible. */
   3195 
   3196       return !entered_a_group;
   3197       }
   3198 
   3199     code += PRIV(OP_lengths)[c];
   3200     continue;
   3201 
   3202     case OP_ONCE:
   3203     case OP_ONCE_NC:
   3204     case OP_BRA:
   3205     case OP_CBRA:
   3206     next_code = code + GET(code, 1);
   3207     code += PRIV(OP_lengths)[c];
   3208 
   3209     while (*next_code == OP_ALT)
   3210       {
   3211       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
   3212         return FALSE;
   3213       code = next_code + 1 + LINK_SIZE;
   3214       next_code += GET(next_code, 1);
   3215       }
   3216 
   3217     entered_a_group = TRUE;
   3218     continue;
   3219 
   3220     case OP_BRAZERO:
   3221     case OP_BRAMINZERO:
   3222 
   3223     next_code = code + 1;
   3224     if (*next_code != OP_BRA && *next_code != OP_CBRA
   3225         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
   3226 
   3227     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
   3228 
   3229     /* The bracket content will be checked by the
   3230     OP_BRA/OP_CBRA case above. */
   3231     next_code += 1 + LINK_SIZE;
   3232     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
   3233       return FALSE;
   3234 
   3235     code += PRIV(OP_lengths)[c];
   3236     continue;
   3237 
   3238     default:
   3239     break;
   3240     }
   3241 
   3242   /* Check for a supported opcode, and load its properties. */
   3243 
   3244   code = get_chr_property_list(code, utf, cd->fcc, list);
   3245   if (code == NULL) return FALSE;    /* Unsupported */
   3246 
   3247   /* If either opcode is a small character list, set pointers for comparing
   3248   characters from that list with another list, or with a property. */
   3249 
   3250   if (base_list[0] == OP_CHAR)
   3251     {
   3252     chr_ptr = base_list + 2;
   3253     list_ptr = list;
   3254     }
   3255   else if (list[0] == OP_CHAR)
   3256     {
   3257     chr_ptr = list + 2;
   3258     list_ptr = base_list;
   3259     }
   3260 
   3261   /* Character bitsets can also be compared to certain opcodes. */
   3262 
   3263   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
   3264 #ifdef COMPILE_PCRE8
   3265       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
   3266       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
   3267 #endif
   3268       )
   3269     {
   3270 #ifdef COMPILE_PCRE8
   3271     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
   3272 #else
   3273     if (base_list[0] == OP_CLASS)
   3274 #endif
   3275       {
   3276       set1 = (pcre_uint8 *)(base_end - base_list[2]);
   3277       list_ptr = list;
   3278       }
   3279     else
   3280       {
   3281       set1 = (pcre_uint8 *)(code - list[2]);
   3282       list_ptr = base_list;
   3283       }
   3284 
   3285     invert_bits = FALSE;
   3286     switch(list_ptr[0])
   3287       {
   3288       case OP_CLASS:
   3289       case OP_NCLASS:
   3290       set2 = (pcre_uint8 *)
   3291         ((list_ptr == list ? code : base_end) - list_ptr[2]);
   3292       break;
   3293 
   3294 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3295       case OP_XCLASS:
   3296       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
   3297       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
   3298       if ((*xclass_flags & XCL_MAP) == 0)
   3299         {
   3300         /* No bits are set for characters < 256. */
   3301         if (list[1] == 0) return TRUE;
   3302         /* Might be an empty repeat. */
   3303         continue;
   3304         }
   3305       set2 = (pcre_uint8 *)(xclass_flags + 1);
   3306       break;
   3307 #endif
   3308 
   3309       case OP_NOT_DIGIT:
   3310       invert_bits = TRUE;
   3311       /* Fall through */
   3312       case OP_DIGIT:
   3313       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
   3314       break;
   3315 
   3316       case OP_NOT_WHITESPACE:
   3317       invert_bits = TRUE;
   3318       /* Fall through */
   3319       case OP_WHITESPACE:
   3320       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
   3321       break;
   3322 
   3323       case OP_NOT_WORDCHAR:
   3324       invert_bits = TRUE;
   3325       /* Fall through */
   3326       case OP_WORDCHAR:
   3327       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
   3328       break;
   3329 
   3330       default:
   3331       return FALSE;
   3332       }
   3333 
   3334     /* Because the sets are unaligned, we need
   3335     to perform byte comparison here. */
   3336     set_end = set1 + 32;
   3337     if (invert_bits)
   3338       {
   3339       do
   3340         {
   3341         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
   3342         }
   3343       while (set1 < set_end);
   3344       }
   3345     else
   3346       {
   3347       do
   3348         {
   3349         if ((*set1++ & *set2++) != 0) return FALSE;
   3350         }
   3351       while (set1 < set_end);
   3352       }
   3353 
   3354     if (list[1] == 0) return TRUE;
   3355     /* Might be an empty repeat. */
   3356     continue;
   3357     }
   3358 
   3359   /* Some property combinations also acceptable. Unicode property opcodes are
   3360   processed specially; the rest can be handled with a lookup table. */
   3361 
   3362   else
   3363     {
   3364     pcre_uint32 leftop, rightop;
   3365 
   3366     leftop = base_list[0];
   3367     rightop = list[0];
   3368 
   3369 #ifdef SUPPORT_UCP
   3370     accepted = FALSE; /* Always set in non-unicode case. */
   3371     if (leftop == OP_PROP || leftop == OP_NOTPROP)
   3372       {
   3373       if (rightop == OP_EOD)
   3374         accepted = TRUE;
   3375       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
   3376         {
   3377         int n;
   3378         const pcre_uint8 *p;
   3379         BOOL same = leftop == rightop;
   3380         BOOL lisprop = leftop == OP_PROP;
   3381         BOOL risprop = rightop == OP_PROP;
   3382         BOOL bothprop = lisprop && risprop;
   3383 
   3384         /* There's a table that specifies how each combination is to be
   3385         processed:
   3386           0   Always return FALSE (never auto-possessify)
   3387           1   Character groups are distinct (possessify if both are OP_PROP)
   3388           2   Check character categories in the same group (general or particular)
   3389           3   Return TRUE if the two opcodes are not the same
   3390           ... see comments below
   3391         */
   3392 
   3393         n = propposstab[base_list[2]][list[2]];
   3394         switch(n)
   3395           {
   3396           case 0: break;
   3397           case 1: accepted = bothprop; break;
   3398           case 2: accepted = (base_list[3] == list[3]) != same; break;
   3399           case 3: accepted = !same; break;
   3400 
   3401           case 4:  /* Left general category, right particular category */
   3402           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
   3403           break;
   3404 
   3405           case 5:  /* Right general category, left particular category */
   3406           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
   3407           break;
   3408 
   3409           /* This code is logically tricky. Think hard before fiddling with it.
   3410           The posspropstab table has four entries per row. Each row relates to
   3411           one of PCRE's special properties such as ALNUM or SPACE or WORD.
   3412           Only WORD actually needs all four entries, but using repeats for the
   3413           others means they can all use the same code below.
   3414 
   3415           The first two entries in each row are Unicode general categories, and
   3416           apply always, because all the characters they include are part of the
   3417           PCRE character set. The third and fourth entries are a general and a
   3418           particular category, respectively, that include one or more relevant
   3419           characters. One or the other is used, depending on whether the check
   3420           is for a general or a particular category. However, in both cases the
   3421           category contains more characters than the specials that are defined
   3422           for the property being tested against. Therefore, it cannot be used
   3423           in a NOTPROP case.
   3424 
   3425           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
   3426           Underscore is covered by ucp_P or ucp_Po. */
   3427 
   3428           case 6:  /* Left alphanum vs right general category */
   3429           case 7:  /* Left space vs right general category */
   3430           case 8:  /* Left word vs right general category */
   3431           p = posspropstab[n-6];
   3432           accepted = risprop && lisprop ==
   3433             (list[3] != p[0] &&
   3434              list[3] != p[1] &&
   3435             (list[3] != p[2] || !lisprop));
   3436           break;
   3437 
   3438           case 9:   /* Right alphanum vs left general category */
   3439           case 10:  /* Right space vs left general category */
   3440           case 11:  /* Right word vs left general category */
   3441           p = posspropstab[n-9];
   3442           accepted = lisprop && risprop ==
   3443             (base_list[3] != p[0] &&
   3444              base_list[3] != p[1] &&
   3445             (base_list[3] != p[2] || !risprop));
   3446           break;
   3447 
   3448           case 12:  /* Left alphanum vs right particular category */
   3449           case 13:  /* Left space vs right particular category */
   3450           case 14:  /* Left word vs right particular category */
   3451           p = posspropstab[n-12];
   3452           accepted = risprop && lisprop ==
   3453             (catposstab[p[0]][list[3]] &&
   3454              catposstab[p[1]][list[3]] &&
   3455             (list[3] != p[3] || !lisprop));
   3456           break;
   3457 
   3458           case 15:  /* Right alphanum vs left particular category */
   3459           case 16:  /* Right space vs left particular category */
   3460           case 17:  /* Right word vs left particular category */
   3461           p = posspropstab[n-15];
   3462           accepted = lisprop && risprop ==
   3463             (catposstab[p[0]][base_list[3]] &&
   3464              catposstab[p[1]][base_list[3]] &&
   3465             (base_list[3] != p[3] || !risprop));
   3466           break;
   3467           }
   3468         }
   3469       }
   3470 
   3471     else
   3472 #endif  /* SUPPORT_UCP */
   3473 
   3474     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
   3475            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
   3476            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
   3477 
   3478     if (!accepted) return FALSE;
   3479 
   3480     if (list[1] == 0) return TRUE;
   3481     /* Might be an empty repeat. */
   3482     continue;
   3483     }
   3484 
   3485   /* Control reaches here only if one of the items is a small character list.
   3486   All characters are checked against the other side. */
   3487 
   3488   do
   3489     {
   3490     chr = *chr_ptr;
   3491 
   3492     switch(list_ptr[0])
   3493       {
   3494       case OP_CHAR:
   3495       ochr_ptr = list_ptr + 2;
   3496       do
   3497         {
   3498         if (chr == *ochr_ptr) return FALSE;
   3499         ochr_ptr++;
   3500         }
   3501       while(*ochr_ptr != NOTACHAR);
   3502       break;
   3503 
   3504       case OP_NOT:
   3505       ochr_ptr = list_ptr + 2;
   3506       do
   3507         {
   3508         if (chr == *ochr_ptr)
   3509           break;
   3510         ochr_ptr++;
   3511         }
   3512       while(*ochr_ptr != NOTACHAR);
   3513       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
   3514       break;
   3515 
   3516       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
   3517       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
   3518 
   3519       case OP_DIGIT:
   3520       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
   3521       break;
   3522 
   3523       case OP_NOT_DIGIT:
   3524       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
   3525       break;
   3526 
   3527       case OP_WHITESPACE:
   3528       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
   3529       break;
   3530 
   3531       case OP_NOT_WHITESPACE:
   3532       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
   3533       break;
   3534 
   3535       case OP_WORDCHAR:
   3536       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
   3537       break;
   3538 
   3539       case OP_NOT_WORDCHAR:
   3540       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
   3541       break;
   3542 
   3543       case OP_HSPACE:
   3544       switch(chr)
   3545         {
   3546         HSPACE_CASES: return FALSE;
   3547         default: break;
   3548         }
   3549       break;
   3550 
   3551       case OP_NOT_HSPACE:
   3552       switch(chr)
   3553         {
   3554         HSPACE_CASES: break;
   3555         default: return FALSE;
   3556         }
   3557       break;
   3558 
   3559       case OP_ANYNL:
   3560       case OP_VSPACE:
   3561       switch(chr)
   3562         {
   3563         VSPACE_CASES: return FALSE;
   3564         default: break;
   3565         }
   3566       break;
   3567 
   3568       case OP_NOT_VSPACE:
   3569       switch(chr)
   3570         {
   3571         VSPACE_CASES: break;
   3572         default: return FALSE;
   3573         }
   3574       break;
   3575 
   3576       case OP_DOLL:
   3577       case OP_EODN:
   3578       switch (chr)
   3579         {
   3580         case CHAR_CR:
   3581         case CHAR_LF:
   3582         case CHAR_VT:
   3583         case CHAR_FF:
   3584         case CHAR_NEL:
   3585 #ifndef EBCDIC
   3586         case 0x2028:
   3587         case 0x2029:
   3588 #endif  /* Not EBCDIC */
   3589         return FALSE;
   3590         }
   3591       break;
   3592 
   3593       case OP_EOD:    /* Can always possessify before \z */
   3594       break;
   3595 
   3596 #ifdef SUPPORT_UCP
   3597       case OP_PROP:
   3598       case OP_NOTPROP:
   3599       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
   3600             list_ptr[0] == OP_NOTPROP))
   3601         return FALSE;
   3602       break;
   3603 #endif
   3604 
   3605       case OP_NCLASS:
   3606       if (chr > 255) return FALSE;
   3607       /* Fall through */
   3608 
   3609       case OP_CLASS:
   3610       if (chr > 255) break;
   3611       class_bitset = (pcre_uint8 *)
   3612         ((list_ptr == list ? code : base_end) - list_ptr[2]);
   3613       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
   3614       break;
   3615 
   3616 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3617       case OP_XCLASS:
   3618       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
   3619           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
   3620       break;
   3621 #endif
   3622 
   3623       default:
   3624       return FALSE;
   3625       }
   3626 
   3627     chr_ptr++;
   3628     }
   3629   while(*chr_ptr != NOTACHAR);
   3630 
   3631   /* At least one character must be matched from this opcode. */
   3632 
   3633   if (list[1] == 0) return TRUE;
   3634   }
   3635 
   3636 /* Control never reaches here. There used to be a fail-save return FALSE; here,
   3637 but some compilers complain about an unreachable statement. */
   3638 
   3639 }
   3640 
   3641 
   3642 
   3643 /*************************************************
   3644 *    Scan compiled regex for auto-possession     *
   3645 *************************************************/
   3646 
   3647 /* Replaces single character iterations with their possessive alternatives
   3648 if appropriate. This function modifies the compiled opcode!
   3649 
   3650 Arguments:
   3651   code        points to start of the byte code
   3652   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
   3653   cd          static compile data
   3654 
   3655 Returns:      nothing
   3656 */
   3657 
   3658 static void
   3659 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
   3660 {
   3661 register pcre_uchar c;
   3662 const pcre_uchar *end;
   3663 pcre_uchar *repeat_opcode;
   3664 pcre_uint32 list[8];
   3665 int rec_limit;
   3666 
   3667 for (;;)
   3668   {
   3669   c = *code;
   3670 
   3671   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
   3672   it may compile without complaining, but may get into a loop here if the code
   3673   pointer points to a bad value. This is, of course a documentated possibility,
   3674   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
   3675   just give up on this optimization. */
   3676 
   3677   if (c >= OP_TABLE_LENGTH) return;
   3678 
   3679   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
   3680     {
   3681     c -= get_repeat_base(c) - OP_STAR;
   3682     end = (c <= OP_MINUPTO) ?
   3683       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
   3684     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
   3685 
   3686     rec_limit = 1000;
   3687     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
   3688       {
   3689       switch(c)
   3690         {
   3691         case OP_STAR:
   3692         *code += OP_POSSTAR - OP_STAR;
   3693         break;
   3694 
   3695         case OP_MINSTAR:
   3696         *code += OP_POSSTAR - OP_MINSTAR;
   3697         break;
   3698 
   3699         case OP_PLUS:
   3700         *code += OP_POSPLUS - OP_PLUS;
   3701         break;
   3702 
   3703         case OP_MINPLUS:
   3704         *code += OP_POSPLUS - OP_MINPLUS;
   3705         break;
   3706 
   3707         case OP_QUERY:
   3708         *code += OP_POSQUERY - OP_QUERY;
   3709         break;
   3710 
   3711         case OP_MINQUERY:
   3712         *code += OP_POSQUERY - OP_MINQUERY;
   3713         break;
   3714 
   3715         case OP_UPTO:
   3716         *code += OP_POSUPTO - OP_UPTO;
   3717         break;
   3718 
   3719         case OP_MINUPTO:
   3720         *code += OP_POSUPTO - OP_MINUPTO;
   3721         break;
   3722         }
   3723       }
   3724     c = *code;
   3725     }
   3726   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
   3727     {
   3728 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3729     if (c == OP_XCLASS)
   3730       repeat_opcode = code + GET(code, 1);
   3731     else
   3732 #endif
   3733       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
   3734 
   3735     c = *repeat_opcode;
   3736     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
   3737       {
   3738       /* end must not be NULL. */
   3739       end = get_chr_property_list(code, utf, cd->fcc, list);
   3740 
   3741       list[1] = (c & 1) == 0;
   3742 
   3743       rec_limit = 1000;
   3744       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
   3745         {
   3746         switch (c)
   3747           {
   3748           case OP_CRSTAR:
   3749           case OP_CRMINSTAR:
   3750           *repeat_opcode = OP_CRPOSSTAR;
   3751           break;
   3752 
   3753           case OP_CRPLUS:
   3754           case OP_CRMINPLUS:
   3755           *repeat_opcode = OP_CRPOSPLUS;
   3756           break;
   3757 
   3758           case OP_CRQUERY:
   3759           case OP_CRMINQUERY:
   3760           *repeat_opcode = OP_CRPOSQUERY;
   3761           break;
   3762 
   3763           case OP_CRRANGE:
   3764           case OP_CRMINRANGE:
   3765           *repeat_opcode = OP_CRPOSRANGE;
   3766           break;
   3767           }
   3768         }
   3769       }
   3770     c = *code;
   3771     }
   3772 
   3773   switch(c)
   3774     {
   3775     case OP_END:
   3776     return;
   3777 
   3778     case OP_TYPESTAR:
   3779     case OP_TYPEMINSTAR:
   3780     case OP_TYPEPLUS:
   3781     case OP_TYPEMINPLUS:
   3782     case OP_TYPEQUERY:
   3783     case OP_TYPEMINQUERY:
   3784     case OP_TYPEPOSSTAR:
   3785     case OP_TYPEPOSPLUS:
   3786     case OP_TYPEPOSQUERY:
   3787     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   3788     break;
   3789 
   3790     case OP_TYPEUPTO:
   3791     case OP_TYPEMINUPTO:
   3792     case OP_TYPEEXACT:
   3793     case OP_TYPEPOSUPTO:
   3794     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
   3795       code += 2;
   3796     break;
   3797 
   3798 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   3799     case OP_XCLASS:
   3800     code += GET(code, 1);
   3801     break;
   3802 #endif
   3803 
   3804     case OP_MARK:
   3805     case OP_PRUNE_ARG:
   3806     case OP_SKIP_ARG:
   3807     case OP_THEN_ARG:
   3808     code += code[1];
   3809     break;
   3810     }
   3811 
   3812   /* Add in the fixed length from the table */
   3813 
   3814   code += PRIV(OP_lengths)[c];
   3815 
   3816   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
   3817   a multi-byte character. The length in the table is a minimum, so we have to
   3818   arrange to skip the extra bytes. */
   3819 
   3820 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   3821   if (utf) switch(c)
   3822     {
   3823     case OP_CHAR:
   3824     case OP_CHARI:
   3825     case OP_NOT:
   3826     case OP_NOTI:
   3827     case OP_STAR:
   3828     case OP_MINSTAR:
   3829     case OP_PLUS:
   3830     case OP_MINPLUS:
   3831     case OP_QUERY:
   3832     case OP_MINQUERY:
   3833     case OP_UPTO:
   3834     case OP_MINUPTO:
   3835     case OP_EXACT:
   3836     case OP_POSSTAR:
   3837     case OP_POSPLUS:
   3838     case OP_POSQUERY:
   3839     case OP_POSUPTO:
   3840     case OP_STARI:
   3841     case OP_MINSTARI:
   3842     case OP_PLUSI:
   3843     case OP_MINPLUSI:
   3844     case OP_QUERYI:
   3845     case OP_MINQUERYI:
   3846     case OP_UPTOI:
   3847     case OP_MINUPTOI:
   3848     case OP_EXACTI:
   3849     case OP_POSSTARI:
   3850     case OP_POSPLUSI:
   3851     case OP_POSQUERYI:
   3852     case OP_POSUPTOI:
   3853     case OP_NOTSTAR:
   3854     case OP_NOTMINSTAR:
   3855     case OP_NOTPLUS:
   3856     case OP_NOTMINPLUS:
   3857     case OP_NOTQUERY:
   3858     case OP_NOTMINQUERY:
   3859     case OP_NOTUPTO:
   3860     case OP_NOTMINUPTO:
   3861     case OP_NOTEXACT:
   3862     case OP_NOTPOSSTAR:
   3863     case OP_NOTPOSPLUS:
   3864     case OP_NOTPOSQUERY:
   3865     case OP_NOTPOSUPTO:
   3866     case OP_NOTSTARI:
   3867     case OP_NOTMINSTARI:
   3868     case OP_NOTPLUSI:
   3869     case OP_NOTMINPLUSI:
   3870     case OP_NOTQUERYI:
   3871     case OP_NOTMINQUERYI:
   3872     case OP_NOTUPTOI:
   3873     case OP_NOTMINUPTOI:
   3874     case OP_NOTEXACTI:
   3875     case OP_NOTPOSSTARI:
   3876     case OP_NOTPOSPLUSI:
   3877     case OP_NOTPOSQUERYI:
   3878     case OP_NOTPOSUPTOI:
   3879     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
   3880     break;
   3881     }
   3882 #else
   3883   (void)(utf);  /* Keep compiler happy by referencing function argument */
   3884 #endif
   3885   }
   3886 }
   3887 
   3888 
   3889 
   3890 /*************************************************
   3891 *           Check for POSIX class syntax         *
   3892 *************************************************/
   3893 
   3894 /* This function is called when the sequence "[:" or "[." or "[=" is
   3895 encountered in a character class. It checks whether this is followed by a
   3896 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
   3897 reach an unescaped ']' without the special preceding character, return FALSE.
   3898 
   3899 Originally, this function only recognized a sequence of letters between the
   3900 terminators, but it seems that Perl recognizes any sequence of characters,
   3901 though of course unknown POSIX names are subsequently rejected. Perl gives an
   3902 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
   3903 didn't consider this to be a POSIX class. Likewise for [:1234:].
   3904 
   3905 The problem in trying to be exactly like Perl is in the handling of escapes. We
   3906 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
   3907 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
   3908 below handles the special cases \\ and \], but does not try to do any other
   3909 escape processing. This makes it different from Perl for cases such as
   3910 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
   3911 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
   3912 when Perl does, I think.
   3913 
   3914 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
   3915 It seems that the appearance of a nested POSIX class supersedes an apparent
   3916 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
   3917 a digit.
   3918 
   3919 In Perl, unescaped square brackets may also appear as part of class names. For
   3920 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
   3921 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
   3922 seem right at all. PCRE does not allow closing square brackets in POSIX class
   3923 names.
   3924 
   3925 Arguments:
   3926   ptr      pointer to the initial [
   3927   endptr   where to return the end pointer
   3928 
   3929 Returns:   TRUE or FALSE
   3930 */
   3931 
   3932 static BOOL
   3933 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
   3934 {
   3935 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
   3936 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
   3937 for (++ptr; *ptr != CHAR_NULL; ptr++)
   3938   {
   3939   if (*ptr == CHAR_BACKSLASH &&
   3940       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
   3941        ptr[1] == CHAR_BACKSLASH))
   3942     ptr++;
   3943   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
   3944             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
   3945   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   3946     {
   3947     *endptr = ptr;
   3948     return TRUE;
   3949     }
   3950   }
   3951 return FALSE;
   3952 }
   3953 
   3954 
   3955 
   3956 
   3957 /*************************************************
   3958 *          Check POSIX class name                *
   3959 *************************************************/
   3960 
   3961 /* This function is called to check the name given in a POSIX-style class entry
   3962 such as [:alnum:].
   3963 
   3964 Arguments:
   3965   ptr        points to the first letter
   3966   len        the length of the name
   3967 
   3968 Returns:     a value representing the name, or -1 if unknown
   3969 */
   3970 
   3971 static int
   3972 check_posix_name(const pcre_uchar *ptr, int len)
   3973 {
   3974 const char *pn = posix_names;
   3975 register int yield = 0;
   3976 while (posix_name_lengths[yield] != 0)
   3977   {
   3978   if (len == posix_name_lengths[yield] &&
   3979     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
   3980   pn += posix_name_lengths[yield] + 1;
   3981   yield++;
   3982   }
   3983 return -1;
   3984 }
   3985 
   3986 
   3987 /*************************************************
   3988 *    Adjust OP_RECURSE items in repeated group   *
   3989 *************************************************/
   3990 
   3991 /* OP_RECURSE items contain an offset from the start of the regex to the group
   3992 that is referenced. This means that groups can be replicated for fixed
   3993 repetition simply by copying (because the recursion is allowed to refer to
   3994 earlier groups that are outside the current group). However, when a group is
   3995 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
   3996 inserted before it, after it has been compiled. This means that any OP_RECURSE
   3997 items within it that refer to the group itself or any contained groups have to
   3998 have their offsets adjusted. That is one of the jobs of this function. Before it
   3999 is called, the partially compiled regex must be temporarily terminated with
   4000 OP_END.
   4001 
   4002 This function has been extended to cope with forward references for recursions
   4003 and subroutine calls. It must check the list of such references for the
   4004 group we are dealing with. If it finds that one of the recursions in the
   4005 current group is on this list, it does not adjust the value in the reference
   4006 (which is a group number). After the group has been scanned, all the offsets in
   4007 the forward reference list for the group are adjusted.
   4008 
   4009 Arguments:
   4010   group      points to the start of the group
   4011   adjust     the amount by which the group is to be moved
   4012   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
   4013   cd         contains pointers to tables etc.
   4014   save_hwm_offset   the hwm forward reference offset at the start of the group
   4015 
   4016 Returns:     nothing
   4017 */
   4018 
   4019 static void
   4020 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
   4021   size_t save_hwm_offset)
   4022 {
   4023 int offset;
   4024 pcre_uchar *hc;
   4025 pcre_uchar *ptr = group;
   4026 
   4027 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
   4028   {
   4029   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
   4030        hc += LINK_SIZE)
   4031     {
   4032     offset = (int)GET(hc, 0);
   4033     if (cd->start_code + offset == ptr + 1) break;
   4034     }
   4035 
   4036   /* If we have not found this recursion on the forward reference list, adjust
   4037   the recursion's offset if it's after the start of this group. */
   4038 
   4039   if (hc >= cd->hwm)
   4040     {
   4041     offset = (int)GET(ptr, 1);
   4042     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
   4043     }
   4044 
   4045   ptr += 1 + LINK_SIZE;
   4046   }
   4047 
   4048 /* Now adjust all forward reference offsets for the group. */
   4049 
   4050 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
   4051      hc += LINK_SIZE)
   4052   {
   4053   offset = (int)GET(hc, 0);
   4054   PUT(hc, 0, offset + adjust);
   4055   }
   4056 }
   4057 
   4058 
   4059 
   4060 /*************************************************
   4061 *        Insert an automatic callout point       *
   4062 *************************************************/
   4063 
   4064 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
   4065 callout points before each pattern item.
   4066 
   4067 Arguments:
   4068   code           current code pointer
   4069   ptr            current pattern pointer
   4070   cd             pointers to tables etc
   4071 
   4072 Returns:         new code pointer
   4073 */
   4074 
   4075 static pcre_uchar *
   4076 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
   4077 {
   4078 *code++ = OP_CALLOUT;
   4079 *code++ = 255;
   4080 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
   4081 PUT(code, LINK_SIZE, 0);                       /* Default length */
   4082 return code + 2 * LINK_SIZE;
   4083 }
   4084 
   4085 
   4086 
   4087 /*************************************************
   4088 *         Complete a callout item                *
   4089 *************************************************/
   4090 
   4091 /* A callout item contains the length of the next item in the pattern, which
   4092 we can't fill in till after we have reached the relevant point. This is used
   4093 for both automatic and manual callouts.
   4094 
   4095 Arguments:
   4096   previous_callout   points to previous callout item
   4097   ptr                current pattern pointer
   4098   cd                 pointers to tables etc
   4099 
   4100 Returns:             nothing
   4101 */
   4102 
   4103 static void
   4104 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
   4105 {
   4106 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
   4107 PUT(previous_callout, 2 + LINK_SIZE, length);
   4108 }
   4109 
   4110 
   4111 
   4112 #ifdef SUPPORT_UCP
   4113 /*************************************************
   4114 *           Get othercase range                  *
   4115 *************************************************/
   4116 
   4117 /* This function is passed the start and end of a class range, in UTF-8 mode
   4118 with UCP support. It searches up the characters, looking for ranges of
   4119 characters in the "other" case. Each call returns the next one, updating the
   4120 start address. A character with multiple other cases is returned on its own
   4121 with a special return value.
   4122 
   4123 Arguments:
   4124   cptr        points to starting character value; updated
   4125   d           end value
   4126   ocptr       where to put start of othercase range
   4127   odptr       where to put end of othercase range
   4128 
   4129 Yield:        -1 when no more
   4130                0 when a range is returned
   4131               >0 the CASESET offset for char with multiple other cases
   4132                 in this case, ocptr contains the original
   4133 */
   4134 
   4135 static int
   4136 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
   4137   pcre_uint32 *odptr)
   4138 {
   4139 pcre_uint32 c, othercase, next;
   4140 unsigned int co;
   4141 
   4142 /* Find the first character that has an other case. If it has multiple other
   4143 cases, return its case offset value. */
   4144 
   4145 for (c = *cptr; c <= d; c++)
   4146   {
   4147   if ((co = UCD_CASESET(c)) != 0)
   4148     {
   4149     *ocptr = c++;   /* Character that has the set */
   4150     *cptr = c;      /* Rest of input range */
   4151     return (int)co;
   4152     }
   4153   if ((othercase = UCD_OTHERCASE(c)) != c) break;
   4154   }
   4155 
   4156 if (c > d) return -1;  /* Reached end of range */
   4157 
   4158 /* Found a character that has a single other case. Search for the end of the
   4159 range, which is either the end of the input range, or a character that has zero
   4160 or more than one other cases. */
   4161 
   4162 *ocptr = othercase;
   4163 next = othercase + 1;
   4164 
   4165 for (++c; c <= d; c++)
   4166   {
   4167   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
   4168   next++;
   4169   }
   4170 
   4171 *odptr = next - 1;     /* End of othercase range */
   4172 *cptr = c;             /* Rest of input range */
   4173 return 0;
   4174 }
   4175 #endif  /* SUPPORT_UCP */
   4176 
   4177 
   4178 
   4179 /*************************************************
   4180 *        Add a character or range to a class     *
   4181 *************************************************/
   4182 
   4183 /* This function packages up the logic of adding a character or range of
   4184 characters to a class. The character values in the arguments will be within the
   4185 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
   4186 mutually recursive with the function immediately below.
   4187 
   4188 Arguments:
   4189   classbits     the bit map for characters < 256
   4190   uchardptr     points to the pointer for extra data
   4191   options       the options word
   4192   cd            contains pointers to tables etc.
   4193   start         start of range character
   4194   end           end of range character
   4195 
   4196 Returns:        the number of < 256 characters added
   4197                 the pointer to extra data is updated
   4198 */
   4199 
   4200 static int
   4201 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
   4202   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
   4203 {
   4204 pcre_uint32 c;
   4205 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
   4206 int n8 = 0;
   4207 
   4208 /* If caseless matching is required, scan the range and process alternate
   4209 cases. In Unicode, there are 8-bit characters that have alternate cases that
   4210 are greater than 255 and vice-versa. Sometimes we can just extend the original
   4211 range. */
   4212 
   4213 if ((options & PCRE_CASELESS) != 0)
   4214   {
   4215 #ifdef SUPPORT_UCP
   4216   if ((options & PCRE_UTF8) != 0)
   4217     {
   4218     int rc;
   4219     pcre_uint32 oc, od;
   4220 
   4221     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
   4222     c = start;
   4223 
   4224     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
   4225       {
   4226       /* Handle a single character that has more than one other case. */
   4227 
   4228       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
   4229         PRIV(ucd_caseless_sets) + rc, oc);
   4230 
   4231       /* Do nothing if the other case range is within the original range. */
   4232 
   4233       else if (oc >= start && od <= end) continue;
   4234 
   4235       /* Extend the original range if there is overlap, noting that if oc < c, we
   4236       can't have od > end because a subrange is always shorter than the basic
   4237       range. Otherwise, use a recursive call to add the additional range. */
   4238 
   4239       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
   4240       else if (od > end && oc <= end + 1)
   4241         {
   4242         end = od;       /* Extend upwards */
   4243         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
   4244         }
   4245       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
   4246       }
   4247     }
   4248   else
   4249 #endif  /* SUPPORT_UCP */
   4250 
   4251   /* Not UTF-mode, or no UCP */
   4252 
   4253   for (c = start; c <= classbits_end; c++)
   4254     {
   4255     SETBIT(classbits, cd->fcc[c]);
   4256     n8++;
   4257     }
   4258   }
   4259 
   4260 /* Now handle the original range. Adjust the final value according to the bit
   4261 length - this means that the same lists of (e.g.) horizontal spaces can be used
   4262 in all cases. */
   4263 
   4264 #if defined COMPILE_PCRE8
   4265 #ifdef SUPPORT_UTF
   4266   if ((options & PCRE_UTF8) == 0)
   4267 #endif
   4268   if (end > 0xff) end = 0xff;
   4269 
   4270 #elif defined COMPILE_PCRE16
   4271 #ifdef SUPPORT_UTF
   4272   if ((options & PCRE_UTF16) == 0)
   4273 #endif
   4274   if (end > 0xffff) end = 0xffff;
   4275 
   4276 #endif /* COMPILE_PCRE[8|16] */
   4277 
   4278 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
   4279 
   4280 for (c = start; c <= classbits_end; c++)
   4281   {
   4282   /* Regardless of start, c will always be <= 255. */
   4283   SETBIT(classbits, c);
   4284   n8++;
   4285   }
   4286 
   4287 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4288 if (start <= 0xff) start = 0xff + 1;
   4289 
   4290 if (end >= start)
   4291   {
   4292   pcre_uchar *uchardata = *uchardptr;
   4293 #ifdef SUPPORT_UTF
   4294   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
   4295     {
   4296     if (start < end)
   4297       {
   4298       *uchardata++ = XCL_RANGE;
   4299       uchardata += PRIV(ord2utf)(start, uchardata);
   4300       uchardata += PRIV(ord2utf)(end, uchardata);
   4301       }
   4302     else if (start == end)
   4303       {
   4304       *uchardata++ = XCL_SINGLE;
   4305       uchardata += PRIV(ord2utf)(start, uchardata);
   4306       }
   4307     }
   4308   else
   4309 #endif  /* SUPPORT_UTF */
   4310 
   4311   /* Without UTF support, character values are constrained by the bit length,
   4312   and can only be > 256 for 16-bit and 32-bit libraries. */
   4313 
   4314 #ifdef COMPILE_PCRE8
   4315     {}
   4316 #else
   4317   if (start < end)
   4318     {
   4319     *uchardata++ = XCL_RANGE;
   4320     *uchardata++ = start;
   4321     *uchardata++ = end;
   4322     }
   4323   else if (start == end)
   4324     {
   4325     *uchardata++ = XCL_SINGLE;
   4326     *uchardata++ = start;
   4327     }
   4328 #endif
   4329 
   4330   *uchardptr = uchardata;   /* Updata extra data pointer */
   4331   }
   4332 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
   4333 
   4334 return n8;    /* Number of 8-bit characters */
   4335 }
   4336 
   4337 
   4338 
   4339 
   4340 /*************************************************
   4341 *        Add a list of characters to a class     *
   4342 *************************************************/
   4343 
   4344 /* This function is used for adding a list of case-equivalent characters to a
   4345 class, and also for adding a list of horizontal or vertical whitespace. If the
   4346 list is in order (which it should be), ranges of characters are detected and
   4347 handled appropriately. This function is mutually recursive with the function
   4348 above.
   4349 
   4350 Arguments:
   4351   classbits     the bit map for characters < 256
   4352   uchardptr     points to the pointer for extra data
   4353   options       the options word
   4354   cd            contains pointers to tables etc.
   4355   p             points to row of 32-bit values, terminated by NOTACHAR
   4356   except        character to omit; this is used when adding lists of
   4357                   case-equivalent characters to avoid including the one we
   4358                   already know about
   4359 
   4360 Returns:        the number of < 256 characters added
   4361                 the pointer to extra data is updated
   4362 */
   4363 
   4364 static int
   4365 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
   4366   compile_data *cd, const pcre_uint32 *p, unsigned int except)
   4367 {
   4368 int n8 = 0;
   4369 while (p[0] < NOTACHAR)
   4370   {
   4371   int n = 0;
   4372   if (p[0] != except)
   4373     {
   4374     while(p[n+1] == p[0] + n + 1) n++;
   4375     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
   4376     }
   4377   p += n + 1;
   4378   }
   4379 return n8;
   4380 }
   4381 
   4382 
   4383 
   4384 /*************************************************
   4385 *    Add characters not in a list to a class     *
   4386 *************************************************/
   4387 
   4388 /* This function is used for adding the complement of a list of horizontal or
   4389 vertical whitespace to a class. The list must be in order.
   4390 
   4391 Arguments:
   4392   classbits     the bit map for characters < 256
   4393   uchardptr     points to the pointer for extra data
   4394   options       the options word
   4395   cd            contains pointers to tables etc.
   4396   p             points to row of 32-bit values, terminated by NOTACHAR
   4397 
   4398 Returns:        the number of < 256 characters added
   4399                 the pointer to extra data is updated
   4400 */
   4401 
   4402 static int
   4403 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
   4404   int options, compile_data *cd, const pcre_uint32 *p)
   4405 {
   4406 BOOL utf = (options & PCRE_UTF8) != 0;
   4407 int n8 = 0;
   4408 if (p[0] > 0)
   4409   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
   4410 while (p[0] < NOTACHAR)
   4411   {
   4412   while (p[1] == p[0] + 1) p++;
   4413   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
   4414     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
   4415   p++;
   4416   }
   4417 return n8;
   4418 }
   4419 
   4420 
   4421 
   4422 /*************************************************
   4423 *           Compile one branch                   *
   4424 *************************************************/
   4425 
   4426 /* Scan the pattern, compiling it into a vector. If the options are
   4427 changed during the branch, the pointer is used to change the external options
   4428 bits. This function is used during the pre-compile phase when we are trying
   4429 to find out the amount of memory needed, as well as during the real compile
   4430 phase. The value of lengthptr distinguishes the two phases.
   4431 
   4432 Arguments:
   4433   optionsptr        pointer to the option bits
   4434   codeptr           points to the pointer to the current code point
   4435   ptrptr            points to the current pattern pointer
   4436   errorcodeptr      points to error code variable
   4437   firstcharptr      place to put the first required character
   4438   firstcharflagsptr place to put the first character flags, or a negative number
   4439   reqcharptr        place to put the last required character
   4440   reqcharflagsptr   place to put the last required character flags, or a negative number
   4441   bcptr             points to current branch chain
   4442   cond_depth        conditional nesting depth
   4443   cd                contains pointers to tables etc.
   4444   lengthptr         NULL during the real compile phase
   4445                     points to length accumulator during pre-compile phase
   4446 
   4447 Returns:            TRUE on success
   4448                     FALSE, with *errorcodeptr set non-zero on error
   4449 */
   4450 
   4451 static BOOL
   4452 compile_branch(int *optionsptr, pcre_uchar **codeptr,
   4453   const pcre_uchar **ptrptr, int *errorcodeptr,
   4454   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
   4455   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
   4456   branch_chain *bcptr, int cond_depth,
   4457   compile_data *cd, int *lengthptr)
   4458 {
   4459 int repeat_type, op_type;
   4460 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
   4461 int bravalue = 0;
   4462 int greedy_default, greedy_non_default;
   4463 pcre_uint32 firstchar, reqchar;
   4464 pcre_int32 firstcharflags, reqcharflags;
   4465 pcre_uint32 zeroreqchar, zerofirstchar;
   4466 pcre_int32 zeroreqcharflags, zerofirstcharflags;
   4467 pcre_int32 req_caseopt, reqvary, tempreqvary;
   4468 int options = *optionsptr;               /* May change dynamically */
   4469 int after_manual_callout = 0;
   4470 int length_prevgroup = 0;
   4471 register pcre_uint32 c;
   4472 int escape;
   4473 register pcre_uchar *code = *codeptr;
   4474 pcre_uchar *last_code = code;
   4475 pcre_uchar *orig_code = code;
   4476 pcre_uchar *tempcode;
   4477 BOOL inescq = FALSE;
   4478 BOOL groupsetfirstchar = FALSE;
   4479 const pcre_uchar *ptr = *ptrptr;
   4480 const pcre_uchar *tempptr;
   4481 const pcre_uchar *nestptr = NULL;
   4482 pcre_uchar *previous = NULL;
   4483 pcre_uchar *previous_callout = NULL;
   4484 size_t item_hwm_offset = 0;
   4485 pcre_uint8 classbits[32];
   4486 
   4487 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
   4488 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
   4489 dynamically as we process the pattern. */
   4490 
   4491 #ifdef SUPPORT_UTF
   4492 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
   4493 BOOL utf = (options & PCRE_UTF8) != 0;
   4494 #ifndef COMPILE_PCRE32
   4495 pcre_uchar utf_chars[6];
   4496 #endif
   4497 #else
   4498 BOOL utf = FALSE;
   4499 #endif
   4500 
   4501 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
   4502 class_uchardata always so that it can be passed to add_to_class() always,
   4503 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
   4504 alternative calls for the different cases. */
   4505 
   4506 pcre_uchar *class_uchardata;
   4507 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4508 BOOL xclass;
   4509 pcre_uchar *class_uchardata_base;
   4510 #endif
   4511 
   4512 #ifdef PCRE_DEBUG
   4513 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
   4514 #endif
   4515 
   4516 /* Set up the default and non-default settings for greediness */
   4517 
   4518 greedy_default = ((options & PCRE_UNGREEDY) != 0);
   4519 greedy_non_default = greedy_default ^ 1;
   4520 
   4521 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
   4522 matching encountered yet". It gets changed to REQ_NONE if we hit something that
   4523 matches a non-fixed char first char; reqchar just remains unset if we never
   4524 find one.
   4525 
   4526 When we hit a repeat whose minimum is zero, we may have to adjust these values
   4527 to take the zero repeat into account. This is implemented by setting them to
   4528 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
   4529 item types that can be repeated set these backoff variables appropriately. */
   4530 
   4531 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
   4532 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
   4533 
   4534 /* The variable req_caseopt contains either the REQ_CASELESS value
   4535 or zero, according to the current setting of the caseless flag. The
   4536 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
   4537 firstchar or reqchar variables to record the case status of the
   4538 value. This is used only for ASCII characters. */
   4539 
   4540 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
   4541 
   4542 /* Switch on next character until the end of the branch */
   4543 
   4544 for (;; ptr++)
   4545   {
   4546   BOOL negate_class;
   4547   BOOL should_flip_negation;
   4548   BOOL possessive_quantifier;
   4549   BOOL is_quantifier;
   4550   BOOL is_recurse;
   4551   BOOL reset_bracount;
   4552   int class_has_8bitchar;
   4553   int class_one_char;
   4554 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4555   BOOL xclass_has_prop;
   4556 #endif
   4557   int newoptions;
   4558   int recno;
   4559   int refsign;
   4560   int skipbytes;
   4561   pcre_uint32 subreqchar, subfirstchar;
   4562   pcre_int32 subreqcharflags, subfirstcharflags;
   4563   int terminator;
   4564   unsigned int mclength;
   4565   unsigned int tempbracount;
   4566   pcre_uint32 ec;
   4567   pcre_uchar mcbuffer[8];
   4568 
   4569   /* Get next character in the pattern */
   4570 
   4571   c = *ptr;
   4572 
   4573   /* If we are at the end of a nested substitution, revert to the outer level
   4574   string. Nesting only happens one level deep. */
   4575 
   4576   if (c == CHAR_NULL && nestptr != NULL)
   4577     {
   4578     ptr = nestptr;
   4579     nestptr = NULL;
   4580     c = *ptr;
   4581     }
   4582 
   4583   /* If we are in the pre-compile phase, accumulate the length used for the
   4584   previous cycle of this loop. */
   4585 
   4586   if (lengthptr != NULL)
   4587     {
   4588 #ifdef PCRE_DEBUG
   4589     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
   4590 #endif
   4591     if (code > cd->start_workspace + cd->workspace_size -
   4592         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
   4593       {
   4594       *errorcodeptr = ERR52;
   4595       goto FAILED;
   4596       }
   4597 
   4598     /* There is at least one situation where code goes backwards: this is the
   4599     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
   4600     the class is simply eliminated. However, it is created first, so we have to
   4601     allow memory for it. Therefore, don't ever reduce the length at this point.
   4602     */
   4603 
   4604     if (code < last_code) code = last_code;
   4605 
   4606     /* Paranoid check for integer overflow */
   4607 
   4608     if (OFLOW_MAX - *lengthptr < code - last_code)
   4609       {
   4610       *errorcodeptr = ERR20;
   4611       goto FAILED;
   4612       }
   4613 
   4614     *lengthptr += (int)(code - last_code);
   4615     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
   4616       (int)(code - last_code), c, c));
   4617 
   4618     /* If "previous" is set and it is not at the start of the work space, move
   4619     it back to there, in order to avoid filling up the work space. Otherwise,
   4620     if "previous" is NULL, reset the current code pointer to the start. */
   4621 
   4622     if (previous != NULL)
   4623       {
   4624       if (previous > orig_code)
   4625         {
   4626         memmove(orig_code, previous, IN_UCHARS(code - previous));
   4627         code -= previous - orig_code;
   4628         previous = orig_code;
   4629         }
   4630       }
   4631     else code = orig_code;
   4632 
   4633     /* Remember where this code item starts so we can pick up the length
   4634     next time round. */
   4635 
   4636     last_code = code;
   4637     }
   4638 
   4639   /* In the real compile phase, just check the workspace used by the forward
   4640   reference list. */
   4641 
   4642   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
   4643     {
   4644     *errorcodeptr = ERR52;
   4645     goto FAILED;
   4646     }
   4647 
   4648   /* If in \Q...\E, check for the end; if not, we have a literal */
   4649 
   4650   if (inescq && c != CHAR_NULL)
   4651     {
   4652     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   4653       {
   4654       inescq = FALSE;
   4655       ptr++;
   4656       continue;
   4657       }
   4658     else
   4659       {
   4660       if (previous_callout != NULL)
   4661         {
   4662         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
   4663           complete_callout(previous_callout, ptr, cd);
   4664         previous_callout = NULL;
   4665         }
   4666       if ((options & PCRE_AUTO_CALLOUT) != 0)
   4667         {
   4668         previous_callout = code;
   4669         code = auto_callout(code, ptr, cd);
   4670         }
   4671       goto NORMAL_CHAR;
   4672       }
   4673     /* Control does not reach here. */
   4674     }
   4675 
   4676   /* In extended mode, skip white space and comments. We need a loop in order
   4677   to check for more white space and more comments after a comment. */
   4678 
   4679   if ((options & PCRE_EXTENDED) != 0)
   4680     {
   4681     for (;;)
   4682       {
   4683       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
   4684       if (c != CHAR_NUMBER_SIGN) break;
   4685       ptr++;
   4686       while (*ptr != CHAR_NULL)
   4687         {
   4688         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
   4689           {                          /* IS_NEWLINE sets cd->nllen. */
   4690           ptr += cd->nllen;
   4691           break;
   4692           }
   4693         ptr++;
   4694 #ifdef SUPPORT_UTF
   4695         if (utf) FORWARDCHAR(ptr);
   4696 #endif
   4697         }
   4698       c = *ptr;     /* Either NULL or the char after a newline */
   4699       }
   4700     }
   4701 
   4702   /* See if the next thing is a quantifier. */
   4703 
   4704   is_quantifier =
   4705     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
   4706     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
   4707 
   4708   /* Fill in length of a previous callout, except when the next thing is a
   4709   quantifier or when processing a property substitution string in UCP mode. */
   4710 
   4711   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
   4712        after_manual_callout-- <= 0)
   4713     {
   4714     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
   4715       complete_callout(previous_callout, ptr, cd);
   4716     previous_callout = NULL;
   4717     }
   4718 
   4719   /* Create auto callout, except for quantifiers, or while processing property
   4720   strings that are substituted for \w etc in UCP mode. */
   4721 
   4722   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
   4723     {
   4724     previous_callout = code;
   4725     code = auto_callout(code, ptr, cd);
   4726     }
   4727 
   4728   /* Process the next pattern item. */
   4729 
   4730   switch(c)
   4731     {
   4732     /* ===================================================================*/
   4733     case CHAR_NULL:                /* The branch terminates at string end */
   4734     case CHAR_VERTICAL_LINE:       /* or | or ) */
   4735     case CHAR_RIGHT_PARENTHESIS:
   4736     *firstcharptr = firstchar;
   4737     *firstcharflagsptr = firstcharflags;
   4738     *reqcharptr = reqchar;
   4739     *reqcharflagsptr = reqcharflags;
   4740     *codeptr = code;
   4741     *ptrptr = ptr;
   4742     if (lengthptr != NULL)
   4743       {
   4744       if (OFLOW_MAX - *lengthptr < code - last_code)
   4745         {
   4746         *errorcodeptr = ERR20;
   4747         goto FAILED;
   4748         }
   4749       *lengthptr += (int)(code - last_code);   /* To include callout length */
   4750       DPRINTF((">> end branch\n"));
   4751       }
   4752     return TRUE;
   4753 
   4754 
   4755     /* ===================================================================*/
   4756     /* Handle single-character metacharacters. In multiline mode, ^ disables
   4757     the setting of any following char as a first character. */
   4758 
   4759     case CHAR_CIRCUMFLEX_ACCENT:
   4760     previous = NULL;
   4761     if ((options & PCRE_MULTILINE) != 0)
   4762       {
   4763       if (firstcharflags == REQ_UNSET)
   4764         zerofirstcharflags = firstcharflags = REQ_NONE;
   4765       *code++ = OP_CIRCM;
   4766       }
   4767     else *code++ = OP_CIRC;
   4768     break;
   4769 
   4770     case CHAR_DOLLAR_SIGN:
   4771     previous = NULL;
   4772     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
   4773     break;
   4774 
   4775     /* There can never be a first char if '.' is first, whatever happens about
   4776     repeats. The value of reqchar doesn't change either. */
   4777 
   4778     case CHAR_DOT:
   4779     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   4780     zerofirstchar = firstchar;
   4781     zerofirstcharflags = firstcharflags;
   4782     zeroreqchar = reqchar;
   4783     zeroreqcharflags = reqcharflags;
   4784     previous = code;
   4785     item_hwm_offset = cd->hwm - cd->start_workspace;
   4786     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
   4787     break;
   4788 
   4789 
   4790     /* ===================================================================*/
   4791     /* Character classes. If the included characters are all < 256, we build a
   4792     32-byte bitmap of the permitted characters, except in the special case
   4793     where there is only one such character. For negated classes, we build the
   4794     map as usual, then invert it at the end. However, we use a different opcode
   4795     so that data characters > 255 can be handled correctly.
   4796 
   4797     If the class contains characters outside the 0-255 range, a different
   4798     opcode is compiled. It may optionally have a bit map for characters < 256,
   4799     but those above are are explicitly listed afterwards. A flag byte tells
   4800     whether the bitmap is present, and whether this is a negated class or not.
   4801 
   4802     In JavaScript compatibility mode, an isolated ']' causes an error. In
   4803     default (Perl) mode, it is treated as a data character. */
   4804 
   4805     case CHAR_RIGHT_SQUARE_BRACKET:
   4806     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
   4807       {
   4808       *errorcodeptr = ERR64;
   4809       goto FAILED;
   4810       }
   4811     goto NORMAL_CHAR;
   4812 
   4813     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
   4814     used for "start of word" and "end of word". As these are otherwise illegal
   4815     sequences, we don't break anything by recognizing them. They are replaced
   4816     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
   4817     erroneous and are handled by the normal code below. */
   4818 
   4819     case CHAR_LEFT_SQUARE_BRACKET:
   4820     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
   4821       {
   4822       nestptr = ptr + 7;
   4823       ptr = sub_start_of_word - 1;
   4824       continue;
   4825       }
   4826 
   4827     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
   4828       {
   4829       nestptr = ptr + 7;
   4830       ptr = sub_end_of_word - 1;
   4831       continue;
   4832       }
   4833 
   4834     /* Handle a real character class. */
   4835 
   4836     previous = code;
   4837     item_hwm_offset = cd->hwm - cd->start_workspace;
   4838 
   4839     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
   4840     they are encountered at the top level, so we'll do that too. */
   4841 
   4842     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   4843          ptr[1] == CHAR_EQUALS_SIGN) &&
   4844         check_posix_syntax(ptr, &tempptr))
   4845       {
   4846       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
   4847       goto FAILED;
   4848       }
   4849 
   4850     /* If the first character is '^', set the negation flag and skip it. Also,
   4851     if the first few characters (either before or after ^) are \Q\E or \E we
   4852     skip them too. This makes for compatibility with Perl. */
   4853 
   4854     negate_class = FALSE;
   4855     for (;;)
   4856       {
   4857       c = *(++ptr);
   4858       if (c == CHAR_BACKSLASH)
   4859         {
   4860         if (ptr[1] == CHAR_E)
   4861           ptr++;
   4862         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
   4863           ptr += 3;
   4864         else
   4865           break;
   4866         }
   4867       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
   4868         negate_class = TRUE;
   4869       else break;
   4870       }
   4871 
   4872     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
   4873     an initial ']' is taken as a data character -- the code below handles
   4874     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
   4875     [^] must match any character, so generate OP_ALLANY. */
   4876 
   4877     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
   4878         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
   4879       {
   4880       *code++ = negate_class? OP_ALLANY : OP_FAIL;
   4881       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   4882       zerofirstchar = firstchar;
   4883       zerofirstcharflags = firstcharflags;
   4884       break;
   4885       }
   4886 
   4887     /* If a class contains a negative special such as \S, we need to flip the
   4888     negation flag at the end, so that support for characters > 255 works
   4889     correctly (they are all included in the class). */
   4890 
   4891     should_flip_negation = FALSE;
   4892 
   4893     /* Extended class (xclass) will be used when characters > 255
   4894     might match. */
   4895 
   4896 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4897     xclass = FALSE;
   4898     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
   4899     class_uchardata_base = class_uchardata;   /* Save the start */
   4900 #endif
   4901 
   4902     /* For optimization purposes, we track some properties of the class:
   4903     class_has_8bitchar will be non-zero if the class contains at least one <
   4904     256 character; class_one_char will be 1 if the class contains just one
   4905     character; xclass_has_prop will be TRUE if unicode property checks
   4906     are present in the class. */
   4907 
   4908     class_has_8bitchar = 0;
   4909     class_one_char = 0;
   4910 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4911     xclass_has_prop = FALSE;
   4912 #endif
   4913 
   4914     /* Initialize the 32-char bit map to all zeros. We build the map in a
   4915     temporary bit of memory, in case the class contains fewer than two
   4916     8-bit characters because in that case the compiled code doesn't use the bit
   4917     map. */
   4918 
   4919     memset(classbits, 0, 32 * sizeof(pcre_uint8));
   4920 
   4921     /* Process characters until ] is reached. By writing this as a "do" it
   4922     means that an initial ] is taken as a data character. At the start of the
   4923     loop, c contains the first byte of the character. */
   4924 
   4925     if (c != CHAR_NULL) do
   4926       {
   4927       const pcre_uchar *oldptr;
   4928 
   4929 #ifdef SUPPORT_UTF
   4930       if (utf && HAS_EXTRALEN(c))
   4931         {                           /* Braces are required because the */
   4932         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
   4933         }
   4934 #endif
   4935 
   4936 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   4937       /* In the pre-compile phase, accumulate the length of any extra
   4938       data and reset the pointer. This is so that very large classes that
   4939       contain a zillion > 255 characters no longer overwrite the work space
   4940       (which is on the stack). We have to remember that there was XCLASS data,
   4941       however. */
   4942 
   4943       if (class_uchardata > class_uchardata_base) xclass = TRUE;
   4944 
   4945       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
   4946         {
   4947         *lengthptr += (int)(class_uchardata - class_uchardata_base);
   4948         class_uchardata = class_uchardata_base;
   4949         }
   4950 #endif
   4951 
   4952       /* Inside \Q...\E everything is literal except \E */
   4953 
   4954       if (inescq)
   4955         {
   4956         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
   4957           {
   4958           inescq = FALSE;                   /* Reset literal state */
   4959           ptr++;                            /* Skip the 'E' */
   4960           continue;                         /* Carry on with next */
   4961           }
   4962         goto CHECK_RANGE;                   /* Could be range if \E follows */
   4963         }
   4964 
   4965       /* Handle POSIX class names. Perl allows a negation extension of the
   4966       form [:^name:]. A square bracket that doesn't match the syntax is
   4967       treated as a literal. We also recognize the POSIX constructions
   4968       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
   4969       5.6 and 5.8 do. */
   4970 
   4971       if (c == CHAR_LEFT_SQUARE_BRACKET &&
   4972           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   4973            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
   4974         {
   4975         BOOL local_negate = FALSE;
   4976         int posix_class, taboffset, tabopt;
   4977         register const pcre_uint8 *cbits = cd->cbits;
   4978         pcre_uint8 pbits[32];
   4979 
   4980         if (ptr[1] != CHAR_COLON)
   4981           {
   4982           *errorcodeptr = ERR31;
   4983           goto FAILED;
   4984           }
   4985 
   4986         ptr += 2;
   4987         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
   4988           {
   4989           local_negate = TRUE;
   4990           should_flip_negation = TRUE;  /* Note negative special */
   4991           ptr++;
   4992           }
   4993 
   4994         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
   4995         if (posix_class < 0)
   4996           {
   4997           *errorcodeptr = ERR30;
   4998           goto FAILED;
   4999           }
   5000 
   5001         /* If matching is caseless, upper and lower are converted to
   5002         alpha. This relies on the fact that the class table starts with
   5003         alpha, lower, upper as the first 3 entries. */
   5004 
   5005         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
   5006           posix_class = 0;
   5007 
   5008         /* When PCRE_UCP is set, some of the POSIX classes are converted to
   5009         different escape sequences that use Unicode properties \p or \P. Others
   5010         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
   5011         directly. */
   5012 
   5013 #ifdef SUPPORT_UCP
   5014         if ((options & PCRE_UCP) != 0)
   5015           {
   5016           unsigned int ptype = 0;
   5017           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
   5018 
   5019           /* The posix_substitutes table specifies which POSIX classes can be
   5020           converted to \p or \P items. */
   5021 
   5022           if (posix_substitutes[pc] != NULL)
   5023             {
   5024             nestptr = tempptr + 1;
   5025             ptr = posix_substitutes[pc] - 1;
   5026             continue;
   5027             }
   5028 
   5029           /* There are three other classes that generate special property calls
   5030           that are recognized only in an XCLASS. */
   5031 
   5032           else switch(posix_class)
   5033             {
   5034             case PC_GRAPH:
   5035             ptype = PT_PXGRAPH;
   5036             /* Fall through */
   5037             case PC_PRINT:
   5038             if (ptype == 0) ptype = PT_PXPRINT;
   5039             /* Fall through */
   5040             case PC_PUNCT:
   5041             if (ptype == 0) ptype = PT_PXPUNCT;
   5042             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
   5043             *class_uchardata++ = ptype;
   5044             *class_uchardata++ = 0;
   5045             xclass_has_prop = TRUE;
   5046             ptr = tempptr + 1;
   5047             continue;
   5048 
   5049             /* For the other POSIX classes (ascii, xdigit) we are going to fall
   5050             through to the non-UCP case and build a bit map for characters with
   5051             code points less than 256. If we are in a negated POSIX class
   5052             within a non-negated overall class, characters with code points
   5053             greater than 255 must all match. In the special case where we have
   5054             not yet generated any xclass data, and this is the final item in
   5055             the overall class, we need do nothing: later on, the opcode
   5056             OP_NCLASS will be used to indicate that characters greater than 255
   5057             are acceptable. If we have already seen an xclass item or one may
   5058             follow (we have to assume that it might if this is not the end of
   5059             the class), explicitly match all wide codepoints. */
   5060 
   5061             default:
   5062             if (!negate_class && local_negate &&
   5063                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
   5064               {
   5065               *class_uchardata++ = XCL_RANGE;
   5066               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
   5067               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
   5068               }
   5069             break;
   5070             }
   5071           }
   5072 #endif
   5073         /* In the non-UCP case, or when UCP makes no difference, we build the
   5074         bit map for the POSIX class in a chunk of local store because we may be
   5075         adding and subtracting from it, and we don't want to subtract bits that
   5076         may be in the main map already. At the end we or the result into the
   5077         bit map that is being built. */
   5078 
   5079         posix_class *= 3;
   5080 
   5081         /* Copy in the first table (always present) */
   5082 
   5083         memcpy(pbits, cbits + posix_class_maps[posix_class],
   5084           32 * sizeof(pcre_uint8));
   5085 
   5086         /* If there is a second table, add or remove it as required. */
   5087 
   5088         taboffset = posix_class_maps[posix_class + 1];
   5089         tabopt = posix_class_maps[posix_class + 2];
   5090 
   5091         if (taboffset >= 0)
   5092           {
   5093           if (tabopt >= 0)
   5094             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
   5095           else
   5096             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
   5097           }
   5098 
   5099         /* Now see if we need to remove any special characters. An option
   5100         value of 1 removes vertical space and 2 removes underscore. */
   5101 
   5102         if (tabopt < 0) tabopt = -tabopt;
   5103         if (tabopt == 1) pbits[1] &= ~0x3c;
   5104           else if (tabopt == 2) pbits[11] &= 0x7f;
   5105 
   5106         /* Add the POSIX table or its complement into the main table that is
   5107         being built and we are done. */
   5108 
   5109         if (local_negate)
   5110           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
   5111         else
   5112           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
   5113 
   5114         ptr = tempptr + 1;
   5115         /* Every class contains at least one < 256 character. */
   5116         class_has_8bitchar = 1;
   5117         /* Every class contains at least two characters. */
   5118         class_one_char = 2;
   5119         continue;    /* End of POSIX syntax handling */
   5120         }
   5121 
   5122       /* Backslash may introduce a single character, or it may introduce one
   5123       of the specials, which just set a flag. The sequence \b is a special
   5124       case. Inside a class (and only there) it is treated as backspace. We
   5125       assume that other escapes have more than one character in them, so
   5126       speculatively set both class_has_8bitchar and class_one_char bigger
   5127       than one. Unrecognized escapes fall through and are either treated
   5128       as literal characters (by default), or are faulted if
   5129       PCRE_EXTRA is set. */
   5130 
   5131       if (c == CHAR_BACKSLASH)
   5132         {
   5133         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
   5134           TRUE);
   5135         if (*errorcodeptr != 0) goto FAILED;
   5136         if (escape == 0) c = ec;
   5137         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
   5138         else if (escape == ESC_N)          /* \N is not supported in a class */
   5139           {
   5140           *errorcodeptr = ERR71;
   5141           goto FAILED;
   5142           }
   5143         else if (escape == ESC_Q)            /* Handle start of quoted string */
   5144           {
   5145           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   5146             {
   5147             ptr += 2; /* avoid empty string */
   5148             }
   5149           else inescq = TRUE;
   5150           continue;
   5151           }
   5152         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
   5153 
   5154         else
   5155           {
   5156           register const pcre_uint8 *cbits = cd->cbits;
   5157           /* Every class contains at least two < 256 characters. */
   5158           class_has_8bitchar++;
   5159           /* Every class contains at least two characters. */
   5160           class_one_char += 2;
   5161 
   5162           switch (escape)
   5163             {
   5164 #ifdef SUPPORT_UCP
   5165             case ESC_du:     /* These are the values given for \d etc */
   5166             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
   5167             case ESC_wu:     /* escape sequence with an appropriate \p */
   5168             case ESC_WU:     /* or \P to test Unicode properties instead */
   5169             case ESC_su:     /* of the default ASCII testing. */
   5170             case ESC_SU:
   5171             nestptr = ptr;
   5172             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
   5173             class_has_8bitchar--;                /* Undo! */
   5174             continue;
   5175 #endif
   5176             case ESC_d:
   5177             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
   5178             continue;
   5179 
   5180             case ESC_D:
   5181             should_flip_negation = TRUE;
   5182             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
   5183             continue;
   5184 
   5185             case ESC_w:
   5186             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
   5187             continue;
   5188 
   5189             case ESC_W:
   5190             should_flip_negation = TRUE;
   5191             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
   5192             continue;
   5193 
   5194             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
   5195             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
   5196             previously set by something earlier in the character class.
   5197             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
   5198             we could just adjust the appropriate bit. From PCRE 8.34 we no
   5199             longer treat \s and \S specially. */
   5200 
   5201             case ESC_s:
   5202             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
   5203             continue;
   5204 
   5205             case ESC_S:
   5206             should_flip_negation = TRUE;
   5207             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
   5208             continue;
   5209 
   5210             /* The rest apply in both UCP and non-UCP cases. */
   5211 
   5212             case ESC_h:
   5213             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
   5214               PRIV(hspace_list), NOTACHAR);
   5215             continue;
   5216 
   5217             case ESC_H:
   5218             (void)add_not_list_to_class(classbits, &class_uchardata, options,
   5219               cd, PRIV(hspace_list));
   5220             continue;
   5221 
   5222             case ESC_v:
   5223             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
   5224               PRIV(vspace_list), NOTACHAR);
   5225             continue;
   5226 
   5227             case ESC_V:
   5228             (void)add_not_list_to_class(classbits, &class_uchardata, options,
   5229               cd, PRIV(vspace_list));
   5230             continue;
   5231 
   5232             case ESC_p:
   5233             case ESC_P:
   5234 #ifdef SUPPORT_UCP
   5235               {
   5236               BOOL negated;
   5237               unsigned int ptype = 0, pdata = 0;
   5238               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
   5239                 goto FAILED;
   5240               *class_uchardata++ = ((escape == ESC_p) != negated)?
   5241                 XCL_PROP : XCL_NOTPROP;
   5242               *class_uchardata++ = ptype;
   5243               *class_uchardata++ = pdata;
   5244               xclass_has_prop = TRUE;
   5245               class_has_8bitchar--;                /* Undo! */
   5246               continue;
   5247               }
   5248 #else
   5249             *errorcodeptr = ERR45;
   5250             goto FAILED;
   5251 #endif
   5252             /* Unrecognized escapes are faulted if PCRE is running in its
   5253             strict mode. By default, for compatibility with Perl, they are
   5254             treated as literals. */
   5255 
   5256             default:
   5257             if ((options & PCRE_EXTRA) != 0)
   5258               {
   5259               *errorcodeptr = ERR7;
   5260               goto FAILED;
   5261               }
   5262             class_has_8bitchar--;    /* Undo the speculative increase. */
   5263             class_one_char -= 2;     /* Undo the speculative increase. */
   5264             c = *ptr;                /* Get the final character and fall through */
   5265             break;
   5266             }
   5267           }
   5268 
   5269         /* Fall through if the escape just defined a single character (c >= 0).
   5270         This may be greater than 256. */
   5271 
   5272         escape = 0;
   5273 
   5274         }   /* End of backslash handling */
   5275 
   5276       /* A character may be followed by '-' to form a range. However, Perl does
   5277       not permit ']' to be the end of the range. A '-' character at the end is
   5278       treated as a literal. Perl ignores orphaned \E sequences entirely. The
   5279       code for handling \Q and \E is messy. */
   5280 
   5281       CHECK_RANGE:
   5282       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   5283         {
   5284         inescq = FALSE;
   5285         ptr += 2;
   5286         }
   5287       oldptr = ptr;
   5288 
   5289       /* Remember if \r or \n were explicitly used */
   5290 
   5291       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
   5292 
   5293       /* Check for range */
   5294 
   5295       if (!inescq && ptr[1] == CHAR_MINUS)
   5296         {
   5297         pcre_uint32 d;
   5298         ptr += 2;
   5299         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
   5300 
   5301         /* If we hit \Q (not followed by \E) at this point, go into escaped
   5302         mode. */
   5303 
   5304         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
   5305           {
   5306           ptr += 2;
   5307           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   5308             { ptr += 2; continue; }
   5309           inescq = TRUE;
   5310           break;
   5311           }
   5312 
   5313         /* Minus (hyphen) at the end of a class is treated as a literal, so put
   5314         back the pointer and jump to handle the character that preceded it. */
   5315 
   5316         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
   5317           {
   5318           ptr = oldptr;
   5319           goto CLASS_SINGLE_CHARACTER;
   5320           }
   5321 
   5322         /* Otherwise, we have a potential range; pick up the next character */
   5323 
   5324 #ifdef SUPPORT_UTF
   5325         if (utf)
   5326           {                           /* Braces are required because the */
   5327           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
   5328           }
   5329         else
   5330 #endif
   5331         d = *ptr;  /* Not UTF-8 mode */
   5332 
   5333         /* The second part of a range can be a single-character escape
   5334         sequence, but not any of the other escapes. Perl treats a hyphen as a
   5335         literal in such circumstances. However, in Perl's warning mode, a
   5336         warning is given, so PCRE now faults it as it is almost certainly a
   5337         mistake on the user's part. */
   5338 
   5339         if (!inescq)
   5340           {
   5341           if (d == CHAR_BACKSLASH)
   5342             {
   5343             int descape;
   5344             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
   5345             if (*errorcodeptr != 0) goto FAILED;
   5346 
   5347             /* 0 means a character was put into d; \b is backspace; any other
   5348             special causes an error. */
   5349 
   5350             if (descape != 0)
   5351               {
   5352               if (descape == ESC_b) d = CHAR_BS; else
   5353                 {
   5354                 *errorcodeptr = ERR83;
   5355                 goto FAILED;
   5356                 }
   5357               }
   5358             }
   5359 
   5360           /* A hyphen followed by a POSIX class is treated in the same way. */
   5361 
   5362           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
   5363                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   5364                     ptr[1] == CHAR_EQUALS_SIGN) &&
   5365                    check_posix_syntax(ptr, &tempptr))
   5366             {
   5367             *errorcodeptr = ERR83;
   5368             goto FAILED;
   5369             }
   5370           }
   5371 
   5372         /* Check that the two values are in the correct order. Optimize
   5373         one-character ranges. */
   5374 
   5375         if (d < c)
   5376           {
   5377           *errorcodeptr = ERR8;
   5378           goto FAILED;
   5379           }
   5380         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
   5381 
   5382         /* We have found a character range, so single character optimizations
   5383         cannot be done anymore. Any value greater than 1 indicates that there
   5384         is more than one character. */
   5385 
   5386         class_one_char = 2;
   5387 
   5388         /* Remember an explicit \r or \n, and add the range to the class. */
   5389 
   5390         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
   5391 
   5392         class_has_8bitchar +=
   5393           add_to_class(classbits, &class_uchardata, options, cd, c, d);
   5394 
   5395         continue;   /* Go get the next char in the class */
   5396         }
   5397 
   5398       /* Handle a single character - we can get here for a normal non-escape
   5399       char, or after \ that introduces a single character or for an apparent
   5400       range that isn't. Only the value 1 matters for class_one_char, so don't
   5401       increase it if it is already 2 or more ... just in case there's a class
   5402       with a zillion characters in it. */
   5403 
   5404       CLASS_SINGLE_CHARACTER:
   5405       if (class_one_char < 2) class_one_char++;
   5406 
   5407       /* If xclass_has_prop is false and class_one_char is 1, we have the first
   5408       single character in the class, and there have been no prior ranges, or
   5409       XCLASS items generated by escapes. If this is the final character in the
   5410       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
   5411       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
   5412       can cause firstchar to be set. Otherwise, there can be no first char if
   5413       this item is first, whatever repeat count may follow. In the case of
   5414       reqchar, save the previous value for reinstating. */
   5415 
   5416       if (!inescq &&
   5417 #ifdef SUPPORT_UCP
   5418           !xclass_has_prop &&
   5419 #endif
   5420           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   5421         {
   5422         ptr++;
   5423         zeroreqchar = reqchar;
   5424         zeroreqcharflags = reqcharflags;
   5425 
   5426         if (negate_class)
   5427           {
   5428 #ifdef SUPPORT_UCP
   5429           int d;
   5430 #endif
   5431           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   5432           zerofirstchar = firstchar;
   5433           zerofirstcharflags = firstcharflags;
   5434 
   5435           /* For caseless UTF-8 mode when UCP support is available, check
   5436           whether this character has more than one other case. If so, generate
   5437           a special OP_NOTPROP item instead of OP_NOTI. */
   5438 
   5439 #ifdef SUPPORT_UCP
   5440           if (utf && (options & PCRE_CASELESS) != 0 &&
   5441               (d = UCD_CASESET(c)) != 0)
   5442             {
   5443             *code++ = OP_NOTPROP;
   5444             *code++ = PT_CLIST;
   5445             *code++ = d;
   5446             }
   5447           else
   5448 #endif
   5449           /* Char has only one other case, or UCP not available */
   5450 
   5451             {
   5452             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
   5453 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5454             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
   5455               code += PRIV(ord2utf)(c, code);
   5456             else
   5457 #endif
   5458               *code++ = c;
   5459             }
   5460 
   5461           /* We are finished with this character class */
   5462 
   5463           goto END_CLASS;
   5464           }
   5465 
   5466         /* For a single, positive character, get the value into mcbuffer, and
   5467         then we can handle this with the normal one-character code. */
   5468 
   5469 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
   5470         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
   5471           mclength = PRIV(ord2utf)(c, mcbuffer);
   5472         else
   5473 #endif
   5474           {
   5475           mcbuffer[0] = c;
   5476           mclength = 1;
   5477           }
   5478         goto ONE_CHAR;
   5479         }       /* End of 1-char optimization */
   5480 
   5481       /* There is more than one character in the class, or an XCLASS item
   5482       has been generated. Add this character to the class. */
   5483 
   5484       class_has_8bitchar +=
   5485         add_to_class(classbits, &class_uchardata, options, cd, c, c);
   5486       }
   5487 
   5488     /* Loop until ']' reached. This "while" is the end of the "do" far above.
   5489     If we are at the end of an internal nested string, revert to the outer
   5490     string. */
   5491 
   5492     while (((c = *(++ptr)) != CHAR_NULL ||
   5493            (nestptr != NULL &&
   5494              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
   5495            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
   5496 
   5497     /* Check for missing terminating ']' */
   5498 
   5499     if (c == CHAR_NULL)
   5500       {
   5501       *errorcodeptr = ERR6;
   5502       goto FAILED;
   5503       }
   5504 
   5505     /* We will need an XCLASS if data has been placed in class_uchardata. In
   5506     the second phase this is a sufficient test. However, in the pre-compile
   5507     phase, class_uchardata gets emptied to prevent workspace overflow, so
   5508     only if the very last character in the class needs XCLASS will it contain
   5509     anything at this point. For this reason, xclass gets set TRUE above when
   5510     class_uchardata is emptied, and that's why this code is the way it is here
   5511     instead of just doing a test on class_uchardata below. */
   5512 
   5513 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   5514     if (class_uchardata > class_uchardata_base) xclass = TRUE;
   5515 #endif
   5516 
   5517     /* If this is the first thing in the branch, there can be no first char
   5518     setting, whatever the repeat count. Any reqchar setting must remain
   5519     unchanged after any kind of repeat. */
   5520 
   5521     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
   5522     zerofirstchar = firstchar;
   5523     zerofirstcharflags = firstcharflags;
   5524     zeroreqchar = reqchar;
   5525     zeroreqcharflags = reqcharflags;
   5526 
   5527     /* If there are characters with values > 255, we have to compile an
   5528     extended class, with its own opcode, unless there was a negated special
   5529     such as \S in the class, and PCRE_UCP is not set, because in that case all
   5530     characters > 255 are in the class, so any that were explicitly given as
   5531     well can be ignored. If (when there are explicit characters > 255 that must
   5532     be listed) there are no characters < 256, we can omit the bitmap in the
   5533     actual compiled code. */
   5534 
   5535 #ifdef SUPPORT_UTF
   5536     if (xclass && (xclass_has_prop || !should_flip_negation ||
   5537         (options & PCRE_UCP) != 0))
   5538 #elif !defined COMPILE_PCRE8
   5539     if (xclass && (xclass_has_prop || !should_flip_negation))
   5540 #endif
   5541 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   5542       {
   5543       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
   5544       *code++ = OP_XCLASS;
   5545       code += LINK_SIZE;
   5546       *code = negate_class? XCL_NOT:0;
   5547       if (xclass_has_prop) *code |= XCL_HASPROP;
   5548 
   5549       /* If the map is required, move up the extra data to make room for it;
   5550       otherwise just move the code pointer to the end of the extra data. */
   5551 
   5552       if (class_has_8bitchar > 0)
   5553         {
   5554         *code++ |= XCL_MAP;
   5555         memmove(code + (32 / sizeof(pcre_uchar)), code,
   5556           IN_UCHARS(class_uchardata - code));
   5557         if (negate_class && !xclass_has_prop)
   5558           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
   5559         memcpy(code, classbits, 32);
   5560         code = class_uchardata + (32 / sizeof(pcre_uchar));
   5561         }
   5562       else code = class_uchardata;
   5563 
   5564       /* Now fill in the complete length of the item */
   5565 
   5566       PUT(previous, 1, (int)(code - previous));
   5567       break;   /* End of class handling */
   5568       }
   5569 
   5570     /* Even though any XCLASS list is now discarded, we must allow for
   5571     its memory. */
   5572 
   5573     if (lengthptr != NULL)
   5574       *lengthptr += (int)(class_uchardata - class_uchardata_base);
   5575 #endif
   5576 
   5577     /* If there are no characters > 255, or they are all to be included or
   5578     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
   5579     whole class was negated and whether there were negative specials such as \S
   5580     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
   5581     negating it if necessary. */
   5582 
   5583     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
   5584     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
   5585       {
   5586       if (negate_class)
   5587         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
   5588       memcpy(code, classbits, 32);
   5589       }
   5590     code += 32 / sizeof(pcre_uchar);
   5591 
   5592     END_CLASS:
   5593     break;
   5594 
   5595 
   5596     /* ===================================================================*/
   5597     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
   5598     has been tested above. */
   5599 
   5600     case CHAR_LEFT_CURLY_BRACKET:
   5601     if (!is_quantifier) goto NORMAL_CHAR;
   5602     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
   5603     if (*errorcodeptr != 0) goto FAILED;
   5604     goto REPEAT;
   5605 
   5606     case CHAR_ASTERISK:
   5607     repeat_min = 0;
   5608     repeat_max = -1;
   5609     goto REPEAT;
   5610 
   5611     case CHAR_PLUS:
   5612     repeat_min = 1;
   5613     repeat_max = -1;
   5614     goto REPEAT;
   5615 
   5616     case CHAR_QUESTION_MARK:
   5617     repeat_min = 0;
   5618     repeat_max = 1;
   5619 
   5620     REPEAT:
   5621     if (previous == NULL)
   5622       {
   5623       *errorcodeptr = ERR9;
   5624       goto FAILED;
   5625       }
   5626 
   5627     if (repeat_min == 0)
   5628       {
   5629       firstchar = zerofirstchar;    /* Adjust for zero repeat */
   5630       firstcharflags = zerofirstcharflags;
   5631       reqchar = zeroreqchar;        /* Ditto */
   5632       reqcharflags = zeroreqcharflags;
   5633       }
   5634 
   5635     /* Remember whether this is a variable length repeat */
   5636 
   5637     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
   5638 
   5639     op_type = 0;                    /* Default single-char op codes */
   5640     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
   5641 
   5642     /* Save start of previous item, in case we have to move it up in order to
   5643     insert something before it. */
   5644 
   5645     tempcode = previous;
   5646 
   5647     /* Before checking for a possessive quantifier, we must skip over
   5648     whitespace and comments in extended mode because Perl allows white space at
   5649     this point. */
   5650 
   5651     if ((options & PCRE_EXTENDED) != 0)
   5652       {
   5653       const pcre_uchar *p = ptr + 1;
   5654       for (;;)
   5655         {
   5656         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
   5657         if (*p != CHAR_NUMBER_SIGN) break;
   5658         p++;
   5659         while (*p != CHAR_NULL)
   5660           {
   5661           if (