Home | History | Annotate | Download | only in pcre
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9            Copyright (c) 1997-2010 University of Cambridge
     10 
     11 -----------------------------------------------------------------------------
     12 Redistribution and use in source and binary forms, with or without
     13 modification, are permitted provided that the following conditions are met:
     14 
     15     * Redistributions of source code must retain the above copyright notice,
     16       this list of conditions and the following disclaimer.
     17 
     18     * Redistributions in binary form must reproduce the above copyright
     19       notice, this list of conditions and the following disclaimer in the
     20       documentation and/or other materials provided with the distribution.
     21 
     22     * Neither the name of the University of Cambridge nor the names of its
     23       contributors may be used to endorse or promote products derived from
     24       this software without specific prior written permission.
     25 
     26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36 POSSIBILITY OF SUCH DAMAGE.
     37 -----------------------------------------------------------------------------
     38 */
     39 
     40 
     41 /* This module contains the external function pcre_compile(), along with
     42 supporting internal functions that are not used by other modules. */
     43 
     44 
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 
     49 #define NLBLOCK cd             /* Block containing newline information */
     50 #define PSSTART start_pattern  /* Field containing processed string start */
     51 #define PSEND   end_pattern    /* Field containing processed string end */
     52 
     53 #include "pcre_internal.h"
     54 
     55 
     56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
     57 also used by pcretest. PCRE_DEBUG is not defined when building a production
     58 library. */
     59 
     60 #ifdef PCRE_DEBUG
     61 #include "pcre_printint.src"
     62 #endif
     63 
     64 
     65 /* Macro for setting individual bits in class bitmaps. */
     66 
     67 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
     68 
     69 /* Maximum length value to check against when making sure that the integer that
     70 holds the compiled pattern length does not overflow. We make it a bit less than
     71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
     72 to check them every time. */
     73 
     74 #define OFLOW_MAX (INT_MAX - 20)
     75 
     76 
     77 /*************************************************
     78 *      Code parameters and static tables         *
     79 *************************************************/
     80 
     81 /* This value specifies the size of stack workspace that is used during the
     82 first pre-compile phase that determines how much memory is required. The regex
     83 is partly compiled into this space, but the compiled parts are discarded as
     84 soon as they can be, so that hopefully there will never be an overrun. The code
     85 does, however, check for an overrun. The largest amount I've seen used is 218,
     86 so this number is very generous.
     87 
     88 The same workspace is used during the second, actual compile phase for
     89 remembering forward references to groups so that they can be filled in at the
     90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
     91 is 4 there is plenty of room. */
     92 
     93 #define COMPILE_WORK_SIZE (4096)
     94 
     95 /* The overrun tests check for a slightly smaller size so that they detect the
     96 overrun before it actually does run off the end of the data block. */
     97 
     98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
     99 
    100 
    101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
    102 are simple data values; negative values are for special things like \d and so
    103 on. Zero means further processing is needed (for things like \x), or the escape
    104 is invalid. */
    105 
    106 #ifndef EBCDIC
    107 
    108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
    109 in UTF-8 mode. */
    110 
    111 static const short int escapes[] = {
    112      0,                       0,
    113      0,                       0,
    114      0,                       0,
    115      0,                       0,
    116      0,                       0,
    117      CHAR_COLON,              CHAR_SEMICOLON,
    118      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
    119      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
    120      CHAR_COMMERCIAL_AT,      -ESC_A,
    121      -ESC_B,                  -ESC_C,
    122      -ESC_D,                  -ESC_E,
    123      0,                       -ESC_G,
    124      -ESC_H,                  0,
    125      0,                       -ESC_K,
    126      0,                       0,
    127      -ESC_N,                  0,
    128      -ESC_P,                  -ESC_Q,
    129      -ESC_R,                  -ESC_S,
    130      0,                       0,
    131      -ESC_V,                  -ESC_W,
    132      -ESC_X,                  0,
    133      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
    134      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
    135      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
    136      CHAR_GRAVE_ACCENT,       7,
    137      -ESC_b,                  0,
    138      -ESC_d,                  ESC_e,
    139      ESC_f,                   0,
    140      -ESC_h,                  0,
    141      0,                       -ESC_k,
    142      0,                       0,
    143      ESC_n,                   0,
    144      -ESC_p,                  0,
    145      ESC_r,                   -ESC_s,
    146      ESC_tee,                 0,
    147      -ESC_v,                  -ESC_w,
    148      0,                       0,
    149      -ESC_z
    150 };
    151 
    152 #else
    153 
    154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
    155 
    156 static const short int escapes[] = {
    157 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
    158 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
    159 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
    160 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
    161 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
    162 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
    163 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
    164 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
    165 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
    166 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
    167 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
    168 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
    169 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
    170 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
    171 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
    172 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
    173 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
    174 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
    175 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
    176 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
    177 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
    178 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
    179 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
    180 };
    181 #endif
    182 
    183 
    184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
    185 searched linearly. Put all the names into a single string, in order to reduce
    186 the number of relocations when a shared library is dynamically linked. The
    187 string is built from string macros so that it works in UTF-8 mode on EBCDIC
    188 platforms. */
    189 
    190 typedef struct verbitem {
    191   int   len;                 /* Length of verb name */
    192   int   op;                  /* Op when no arg, or -1 if arg mandatory */
    193   int   op_arg;              /* Op when arg present, or -1 if not allowed */
    194 } verbitem;
    195 
    196 static const char verbnames[] =
    197   "\0"                       /* Empty name is a shorthand for MARK */
    198   STRING_MARK0
    199   STRING_ACCEPT0
    200   STRING_COMMIT0
    201   STRING_F0
    202   STRING_FAIL0
    203   STRING_PRUNE0
    204   STRING_SKIP0
    205   STRING_THEN;
    206 
    207 static const verbitem verbs[] = {
    208   { 0, -1,        OP_MARK },
    209   { 4, -1,        OP_MARK },
    210   { 6, OP_ACCEPT, -1 },
    211   { 6, OP_COMMIT, -1 },
    212   { 1, OP_FAIL,   -1 },
    213   { 4, OP_FAIL,   -1 },
    214   { 5, OP_PRUNE,  OP_PRUNE_ARG },
    215   { 4, OP_SKIP,   OP_SKIP_ARG  },
    216   { 4, OP_THEN,   OP_THEN_ARG  }
    217 };
    218 
    219 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
    220 
    221 
    222 /* Tables of names of POSIX character classes and their lengths. The names are
    223 now all in a single string, to reduce the number of relocations when a shared
    224 library is dynamically loaded. The list of lengths is terminated by a zero
    225 length entry. The first three must be alpha, lower, upper, as this is assumed
    226 for handling case independence. */
    227 
    228 static const char posix_names[] =
    229   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
    230   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
    231   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
    232   STRING_word0  STRING_xdigit;
    233 
    234 static const uschar posix_name_lengths[] = {
    235   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
    236 
    237 /* Table of class bit maps for each POSIX class. Each class is formed from a
    238 base map, with an optional addition or removal of another map. Then, for some
    239 classes, there is some additional tweaking: for [:blank:] the vertical space
    240 characters are removed, and for [:alpha:] and [:alnum:] the underscore
    241 character is removed. The triples in the table consist of the base map offset,
    242 second map offset or -1 if no second map, and a non-negative value for map
    243 addition or a negative value for map subtraction (if there are two maps). The
    244 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
    245 remove vertical space characters, 2 => remove underscore. */
    246 
    247 static const int posix_class_maps[] = {
    248   cbit_word,  cbit_digit, -2,             /* alpha */
    249   cbit_lower, -1,          0,             /* lower */
    250   cbit_upper, -1,          0,             /* upper */
    251   cbit_word,  -1,          2,             /* alnum - word without underscore */
    252   cbit_print, cbit_cntrl,  0,             /* ascii */
    253   cbit_space, -1,          1,             /* blank - a GNU extension */
    254   cbit_cntrl, -1,          0,             /* cntrl */
    255   cbit_digit, -1,          0,             /* digit */
    256   cbit_graph, -1,          0,             /* graph */
    257   cbit_print, -1,          0,             /* print */
    258   cbit_punct, -1,          0,             /* punct */
    259   cbit_space, -1,          0,             /* space */
    260   cbit_word,  -1,          0,             /* word - a Perl extension */
    261   cbit_xdigit,-1,          0              /* xdigit */
    262 };
    263 
    264 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
    265 substitutes must be in the order of the names, defined above, and there are
    266 both positive and negative cases. NULL means no substitute. */
    267 
    268 #ifdef SUPPORT_UCP
    269 static const uschar *substitutes[] = {
    270   (uschar *)"\\P{Nd}",    /* \D */
    271   (uschar *)"\\p{Nd}",    /* \d */
    272   (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
    273   (uschar *)"\\p{Xsp}",   /* \s */
    274   (uschar *)"\\P{Xwd}",   /* \W */
    275   (uschar *)"\\p{Xwd}"    /* \w */
    276 };
    277 
    278 static const uschar *posix_substitutes[] = {
    279   (uschar *)"\\p{L}",     /* alpha */
    280   (uschar *)"\\p{Ll}",    /* lower */
    281   (uschar *)"\\p{Lu}",    /* upper */
    282   (uschar *)"\\p{Xan}",   /* alnum */
    283   NULL,                   /* ascii */
    284   (uschar *)"\\h",        /* blank */
    285   NULL,                   /* cntrl */
    286   (uschar *)"\\p{Nd}",    /* digit */
    287   NULL,                   /* graph */
    288   NULL,                   /* print */
    289   NULL,                   /* punct */
    290   (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
    291   (uschar *)"\\p{Xwd}",   /* word */
    292   NULL,                   /* xdigit */
    293   /* Negated cases */
    294   (uschar *)"\\P{L}",     /* ^alpha */
    295   (uschar *)"\\P{Ll}",    /* ^lower */
    296   (uschar *)"\\P{Lu}",    /* ^upper */
    297   (uschar *)"\\P{Xan}",   /* ^alnum */
    298   NULL,                   /* ^ascii */
    299   (uschar *)"\\H",        /* ^blank */
    300   NULL,                   /* ^cntrl */
    301   (uschar *)"\\P{Nd}",    /* ^digit */
    302   NULL,                   /* ^graph */
    303   NULL,                   /* ^print */
    304   NULL,                   /* ^punct */
    305   (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
    306   (uschar *)"\\P{Xwd}",   /* ^word */
    307   NULL                    /* ^xdigit */
    308 };
    309 #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
    310 #endif
    311 
    312 #define STRING(a)  # a
    313 #define XSTRING(s) STRING(s)
    314 
    315 /* The texts of compile-time error messages. These are "char *" because they
    316 are passed to the outside world. Do not ever re-use any error number, because
    317 they are documented. Always add a new error instead. Messages marked DEAD below
    318 are no longer used. This used to be a table of strings, but in order to reduce
    319 the number of relocations needed when a shared library is loaded dynamically,
    320 it is now one long string. We cannot use a table of offsets, because the
    321 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
    322 simply count through to the one we want - this isn't a performance issue
    323 because these strings are used only when there is a compilation error.
    324 
    325 Each substring ends with \0 to insert a null character. This includes the final
    326 substring, so that the whole string ends with \0\0, which can be detected when
    327 counting through. */
    328 
    329 static const char error_texts[] =
    330   "no error\0"
    331   "\\ at end of pattern\0"
    332   "\\c at end of pattern\0"
    333   "unrecognized character follows \\\0"
    334   "numbers out of order in {} quantifier\0"
    335   /* 5 */
    336   "number too big in {} quantifier\0"
    337   "missing terminating ] for character class\0"
    338   "invalid escape sequence in character class\0"
    339   "range out of order in character class\0"
    340   "nothing to repeat\0"
    341   /* 10 */
    342   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
    343   "internal error: unexpected repeat\0"
    344   "unrecognized character after (? or (?-\0"
    345   "POSIX named classes are supported only within a class\0"
    346   "missing )\0"
    347   /* 15 */
    348   "reference to non-existent subpattern\0"
    349   "erroffset passed as NULL\0"
    350   "unknown option bit(s) set\0"
    351   "missing ) after comment\0"
    352   "parentheses nested too deeply\0"  /** DEAD **/
    353   /* 20 */
    354   "regular expression is too large\0"
    355   "failed to get memory\0"
    356   "unmatched parentheses\0"
    357   "internal error: code overflow\0"
    358   "unrecognized character after (?<\0"
    359   /* 25 */
    360   "lookbehind assertion is not fixed length\0"
    361   "malformed number or name after (?(\0"
    362   "conditional group contains more than two branches\0"
    363   "assertion expected after (?(\0"
    364   "(?R or (?[+-]digits must be followed by )\0"
    365   /* 30 */
    366   "unknown POSIX class name\0"
    367   "POSIX collating elements are not supported\0"
    368   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
    369   "spare error\0"  /** DEAD **/
    370   "character value in \\x{...} sequence is too large\0"
    371   /* 35 */
    372   "invalid condition (?(0)\0"
    373   "\\C not allowed in lookbehind assertion\0"
    374   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
    375   "number after (?C is > 255\0"
    376   "closing ) for (?C expected\0"
    377   /* 40 */
    378   "recursive call could loop indefinitely\0"
    379   "unrecognized character after (?P\0"
    380   "syntax error in subpattern name (missing terminator)\0"
    381   "two named subpatterns have the same name\0"
    382   "invalid UTF-8 string\0"
    383   /* 45 */
    384   "support for \\P, \\p, and \\X has not been compiled\0"
    385   "malformed \\P or \\p sequence\0"
    386   "unknown property name after \\P or \\p\0"
    387   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
    388   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
    389   /* 50 */
    390   "repeated subpattern is too long\0"    /** DEAD **/
    391   "octal value is greater than \\377 (not in UTF-8 mode)\0"
    392   "internal error: overran compiling workspace\0"
    393   "internal error: previously-checked referenced subpattern not found\0"
    394   "DEFINE group contains more than one branch\0"
    395   /* 55 */
    396   "repeating a DEFINE group is not allowed\0"
    397   "inconsistent NEWLINE options\0"
    398   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
    399   "a numbered reference must not be zero\0"
    400   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
    401   /* 60 */
    402   "(*VERB) not recognized\0"
    403   "number is too big\0"
    404   "subpattern name expected\0"
    405   "digit expected after (?+\0"
    406   "] is an invalid data character in JavaScript compatibility mode\0"
    407   /* 65 */
    408   "different names for subpatterns of the same number are not allowed\0"
    409   "(*MARK) must have an argument\0"
    410   "this version of PCRE is not compiled with PCRE_UCP support\0"
    411   "\\c must be followed by an ASCII character\0"
    412   ;
    413 
    414 /* Table to identify digits and hex digits. This is used when compiling
    415 patterns. Note that the tables in chartables are dependent on the locale, and
    416 may mark arbitrary characters as digits - but the PCRE compiling code expects
     417 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
    418 a private table here. It costs 256 bytes, but it is a lot faster than doing
    419 character value tests (at least in some simple cases I timed), and in some
    420 applications one wants PCRE to compile efficiently as well as match
    421 efficiently.
    422 
    423 For convenience, we use the same bit definitions as in chartables:
    424 
    425   0x04   decimal digit
    426   0x08   hexadecimal digit
    427 
    428 Then we can use ctype_digit and ctype_xdigit in the code. */
    429 
    430 #ifndef EBCDIC
    431 
    432 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
    433 UTF-8 mode. */
    434 
    435 static const unsigned char digitab[] =
    436   {
    437   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
    438   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
    439   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
    440   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
    441   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
    442   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
    443   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
    444   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
    445   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
    446   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
    447   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
    448   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
    449   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
    450   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
    451   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
    452   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
    453   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
    454   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
    455   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
    456   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
    457   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
    458   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
    459   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
    460   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
    461   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
    462   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
    463   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
    464   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
    465   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
    466   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
    467   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
    468   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
    469 
    470 #else
    471 
    472 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
    473 
    474 static const unsigned char digitab[] =
    475   {
    476   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
    477   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
    478   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
    479   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
    480   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
    481   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
    482   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
    483   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
    484   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
    485   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
    486   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
    487   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
    488   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
    489   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
    490   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
    491   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
    492   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
    493   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
    494   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
    495   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
    496   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
    497   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
    498   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
    499   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
    500   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
    501   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
    502   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
    503   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
    504   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
    505   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
    506   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
    507   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
    508 
    509 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
    510   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
    511   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
    512   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
    513   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
    514   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
    515   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
    516   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
    517   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
    518   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
    519   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
    520   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
    521   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
    522   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
    523   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
    524   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
    525   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
    526   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
    527   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
    528   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
    529   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
    530   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
    531   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
    532   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
    533   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
    534   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
    535   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
    536   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
    537   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
    538   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
    539   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
    540   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
    541   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
    542 #endif
    543 
    544 
    545 /* Definition to allow mutual recursion */
    546 
    547 static BOOL
    548   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
    549     int *, int *, branch_chain *, compile_data *, int *);
    550 
    551 
    552 
    553 /*************************************************
    554 *            Find an error text                  *
    555 *************************************************/
    556 
    557 /* The error texts are now all in one long string, to save on relocations. As
    558 some of the text is of unknown length, we can't use a table of offsets.
    559 Instead, just count through the strings. This is not a performance issue
    560 because it happens only when there has been a compilation error.
    561 
    562 Argument:   the error number
    563 Returns:    pointer to the error string
    564 */
    565 
    566 static const char *
    567 find_error_text(int n)
    568 {
    569 const char *s = error_texts;
    570 for (; n > 0; n--)
    571   {
    572   while (*s++ != 0) {};
    573   if (*s == 0) return "Error text not found (please report)";
    574   }
    575 return s;
    576 }
    577 
    578 
    579 /*************************************************
    580 *            Handle escapes                      *
    581 *************************************************/
    582 
    583 /* This function is called when a \ has been encountered. It either returns a
    584 positive value for a simple escape such as \n, or a negative value which
    585 encodes one of the more complicated things such as \d. A backreference to group
    586 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
    587 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
    588 ptr is pointing at the \. On exit, it is on the final character of the escape
    589 sequence.
    590 
    591 Arguments:
    592   ptrptr         points to the pattern position pointer
    593   errorcodeptr   points to the errorcode variable
    594   bracount       number of previous extracting brackets
    595   options        the options bits
    596   isclass        TRUE if inside a character class
    597 
    598 Returns:         zero or positive => a data character
    599                  negative => a special escape sequence
    600                  on error, errorcodeptr is set
    601 */
    602 
    603 static int
    604 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
    605   int options, BOOL isclass)
    606 {
    607 BOOL utf8 = (options & PCRE_UTF8) != 0;
    608 const uschar *ptr = *ptrptr + 1;
    609 int c, i;
    610 
    611 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
    612 ptr--;                            /* Set pointer back to the last byte */
    613 
    614 /* If backslash is at the end of the pattern, it's an error. */
    615 
    616 if (c == 0) *errorcodeptr = ERR1;
    617 
    618 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
    619 in a table. A non-zero result is something that can be returned immediately.
    620 Otherwise further processing may be required. */
    621 
    622 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
    623 else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
    624 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
    625 
    626 #else           /* EBCDIC coding */
    627 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
    628 else if ((i = escapes[c - 0x48]) != 0)  c = i;
    629 #endif
    630 
    631 /* Escapes that need further processing, or are illegal. */
    632 
    633 else
    634   {
    635   const uschar *oldptr;
    636   BOOL braced, negated;
    637 
    638   switch (c)
    639     {
    640     /* A number of Perl escapes are not handled by PCRE. We give an explicit
    641     error. */
    642 
    643     case CHAR_l:
    644     case CHAR_L:
    645     case CHAR_u:
    646     case CHAR_U:
    647     *errorcodeptr = ERR37;
    648     break;
    649 
    650     /* \g must be followed by one of a number of specific things:
    651 
    652     (1) A number, either plain or braced. If positive, it is an absolute
    653     backreference. If negative, it is a relative backreference. This is a Perl
    654     5.10 feature.
    655 
    656     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
    657     is part of Perl's movement towards a unified syntax for back references. As
    658     this is synonymous with \k{name}, we fudge it up by pretending it really
    659     was \k.
    660 
    661     (3) For Oniguruma compatibility we also support \g followed by a name or a
    662     number either in angle brackets or in single quotes. However, these are
    663     (possibly recursive) subroutine calls, _not_ backreferences. Just return
    664     the -ESC_g code (cf \k). */
    665 
    666     case CHAR_g:
    667     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
    668       {
    669       c = -ESC_g;
    670       break;
    671       }
    672 
    673     /* Handle the Perl-compatible cases */
    674 
    675     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
    676       {
    677       const uschar *p;
    678       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
    679         if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
    680       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
    681         {
    682         c = -ESC_k;
    683         break;
    684         }
    685       braced = TRUE;
    686       ptr++;
    687       }
    688     else braced = FALSE;
    689 
    690     if (ptr[1] == CHAR_MINUS)
    691       {
    692       negated = TRUE;
    693       ptr++;
    694       }
    695     else negated = FALSE;
    696 
    697     c = 0;
    698     while ((digitab[ptr[1]] & ctype_digit) != 0)
    699       c = c * 10 + *(++ptr) - CHAR_0;
    700 
    701     if (c < 0)   /* Integer overflow */
    702       {
    703       *errorcodeptr = ERR61;
    704       break;
    705       }
    706 
    707     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
    708       {
    709       *errorcodeptr = ERR57;
    710       break;
    711       }
    712 
    713     if (c == 0)
    714       {
    715       *errorcodeptr = ERR58;
    716       break;
    717       }
    718 
    719     if (negated)
    720       {
    721       if (c > bracount)
    722         {
    723         *errorcodeptr = ERR15;
    724         break;
    725         }
    726       c = bracount - (c - 1);
    727       }
    728 
    729     c = -(ESC_REF + c);
    730     break;
    731 
    732     /* The handling of escape sequences consisting of a string of digits
    733     starting with one that is not zero is not straightforward. By experiment,
    734     the way Perl works seems to be as follows:
    735 
    736     Outside a character class, the digits are read as a decimal number. If the
    737     number is less than 10, or if there are that many previous extracting
    738     left brackets, then it is a back reference. Otherwise, up to three octal
    739     digits are read to form an escaped byte. Thus \123 is likely to be octal
    740     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
    741     value is greater than 377, the least significant 8 bits are taken. Inside a
    742     character class, \ followed by a digit is always an octal number. */
    743 
    744     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
    745     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
    746 
    747     if (!isclass)
    748       {
    749       oldptr = ptr;
    750       c -= CHAR_0;
    751       while ((digitab[ptr[1]] & ctype_digit) != 0)
    752         c = c * 10 + *(++ptr) - CHAR_0;
    753       if (c < 0)    /* Integer overflow */
    754         {
    755         *errorcodeptr = ERR61;
    756         break;
    757         }
    758       if (c < 10 || c <= bracount)
    759         {
    760         c = -(ESC_REF + c);
    761         break;
    762         }
    763       ptr = oldptr;      /* Put the pointer back and fall through */
    764       }
    765 
    766     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
    767     generates a binary zero byte and treats the digit as a following literal.
    768     Thus we have to pull back the pointer by one. */
    769 
    770     if ((c = *ptr) >= CHAR_8)
    771       {
    772       ptr--;
    773       c = 0;
    774       break;
    775       }
    776 
    777     /* \0 always starts an octal number, but we may drop through to here with a
    778     larger first octal digit. The original code used just to take the least
    779     significant 8 bits of octal numbers (I think this is what early Perls used
    780     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
    781     than 3 octal digits. */
    782 
    783     case CHAR_0:
    784     c -= CHAR_0;
    785     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
    786         c = c * 8 + *(++ptr) - CHAR_0;
    787     if (!utf8 && c > 255) *errorcodeptr = ERR51;
    788     break;
    789 
    790     /* \x is complicated. \x{ddd} is a character number which can be greater
    791     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
    792     treated as a data character. */
    793 
    794     case CHAR_x:
    795     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
    796       {
    797       const uschar *pt = ptr + 2;
    798       int count = 0;
    799 
    800       c = 0;
    801       while ((digitab[*pt] & ctype_xdigit) != 0)
    802         {
    803         register int cc = *pt++;
    804         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
    805         count++;
    806 
    807 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
    808         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
    809         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
    810 #else           /* EBCDIC coding */
    811         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
    812         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
    813 #endif
    814         }
    815 
    816       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
    817         {
    818         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
    819         ptr = pt;
    820         break;
    821         }
    822 
    823       /* If the sequence of hex digits does not end with '}', then we don't
    824       recognize this construct; fall through to the normal \x handling. */
    825       }
    826 
    827     /* Read just a single-byte hex-defined char */
    828 
    829     c = 0;
    830     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
    831       {
    832       int cc;                                  /* Some compilers don't like */
    833       cc = *(++ptr);                           /* ++ in initializers */
    834 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
    835       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
    836       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
    837 #else           /* EBCDIC coding */
    838       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
    839       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
    840 #endif
    841       }
    842     break;
    843 
    844     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
    845     An error is given if the byte following \c is not an ASCII character. This
    846     coding is ASCII-specific, but then the whole concept of \cx is
    847     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
    848 
    849     case CHAR_c:
    850     c = *(++ptr);
    851     if (c == 0)
    852       {
    853       *errorcodeptr = ERR2;
    854       break;
    855       }
    856 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
    857     if (c > 127)  /* Excludes all non-ASCII in either mode */
    858       {
    859       *errorcodeptr = ERR68;
    860       break;
    861       }
    862     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
    863     c ^= 0x40;
    864 #else             /* EBCDIC coding */
    865     if (c >= CHAR_a && c <= CHAR_z) c += 64;
    866     c ^= 0xC0;
    867 #endif
    868     break;
    869 
    870     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    871     other alphanumeric following \ is an error if PCRE_EXTRA was set;
    872     otherwise, for Perl compatibility, it is a literal. This code looks a bit
    873     odd, but there used to be some cases other than the default, and there may
    874     be again in future, so I haven't "optimized" it. */
    875 
    876     default:
    877     if ((options & PCRE_EXTRA) != 0) switch(c)
    878       {
    879       default:
    880       *errorcodeptr = ERR3;
    881       break;
    882       }
    883     break;
    884     }
    885   }
    886 
    887 /* Perl supports \N{name} for character names, as well as plain \N for "not
    888 newline". PCRE does not support \N{name}. */
    889 
    890 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
    891   *errorcodeptr = ERR37;
    892 
    893 /* If PCRE_UCP is set, we change the values for \d etc. */
    894 
    895 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
    896   c -= (ESC_DU - ESC_D);
    897 
    898 /* Set the pointer to the final character before returning. */
    899 
    900 *ptrptr = ptr;
    901 return c;
    902 }
    903 
    904 
    905 
    906 #ifdef SUPPORT_UCP
    907 /*************************************************
    908 *               Handle \P and \p                 *
    909 *************************************************/
    910 
    911 /* This function is called after \P or \p has been encountered, provided that
    912 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
    913 pointing at the P or p. On exit, it is pointing at the final character of the
    914 escape sequence.
    915 
    916 Argument:
    917   ptrptr         points to the pattern position pointer
    918   negptr         points to a boolean that is set TRUE for negation else FALSE
    919   dptr           points to an int that is set to the detailed property value
    920   errorcodeptr   points to the error code variable
    921 
    922 Returns:         type value from ucp_type_table, or -1 for an invalid type
    923 */
    924 
    925 static int
    926 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
    927 {
    928 int c, i, bot, top;
    929 const uschar *ptr = *ptrptr;
    930 char name[32];
    931 
    932 c = *(++ptr);
    933 if (c == 0) goto ERROR_RETURN;
    934 
    935 *negptr = FALSE;
    936 
    937 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
    938 negation. */
    939 
    940 if (c == CHAR_LEFT_CURLY_BRACKET)
    941   {
    942   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    943     {
    944     *negptr = TRUE;
    945     ptr++;
    946     }
    947   for (i = 0; i < (int)sizeof(name) - 1; i++)
    948     {
    949     c = *(++ptr);
    950     if (c == 0) goto ERROR_RETURN;
    951     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    952     name[i] = c;
    953     }
    954   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
    955   name[i] = 0;
    956   }
    957 
    958 /* Otherwise there is just one following character */
    959 
    960 else
    961   {
    962   name[0] = c;
    963   name[1] = 0;
    964   }
    965 
    966 *ptrptr = ptr;
    967 
    968 /* Search for a recognized property name using binary chop */
    969 
    970 bot = 0;
    971 top = _pcre_utt_size;
    972 
    973 while (bot < top)
    974   {
    975   i = (bot + top) >> 1;
    976   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
    977   if (c == 0)
    978     {
    979     *dptr = _pcre_utt[i].value;
    980     return _pcre_utt[i].type;
    981     }
    982   if (c > 0) bot = i + 1; else top = i;
    983   }
    984 
    985 *errorcodeptr = ERR47;
    986 *ptrptr = ptr;
    987 return -1;
    988 
    989 ERROR_RETURN:
    990 *errorcodeptr = ERR46;
    991 *ptrptr = ptr;
    992 return -1;
    993 }
    994 #endif
    995 
    996 
    997 
    998 
    999 /*************************************************
   1000 *            Check for counted repeat            *
   1001 *************************************************/
   1002 
   1003 /* This function is called when a '{' is encountered in a place where it might
   1004 start a quantifier. It looks ahead to see if it really is a quantifier or not.
   1005 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
   1006 where the ddds are digits.
   1007 
   1008 Arguments:
   1009   p         pointer to the first char after '{'
   1010 
   1011 Returns:    TRUE or FALSE
   1012 */
   1013 
   1014 static BOOL
   1015 is_counted_repeat(const uschar *p)
   1016 {
   1017 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
   1018 while ((digitab[*p] & ctype_digit) != 0) p++;
   1019 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
   1020 
   1021 if (*p++ != CHAR_COMMA) return FALSE;
   1022 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
   1023 
   1024 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
   1025 while ((digitab[*p] & ctype_digit) != 0) p++;
   1026 
   1027 return (*p == CHAR_RIGHT_CURLY_BRACKET);
   1028 }
   1029 
   1030 
   1031 
   1032 /*************************************************
   1033 *         Read repeat counts                     *
   1034 *************************************************/
   1035 
   1036 /* Read an item of the form {n,m} and return the values. This is called only
   1037 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
   1038 so the syntax is guaranteed to be correct, but we need to check the values.
   1039 
   1040 Arguments:
   1041   p              pointer to first char after '{'
   1042   minp           pointer to int for min
   1043   maxp           pointer to int for max
   1044                  returned as -1 if no max
   1045   errorcodeptr   points to error code variable
   1046 
   1047 Returns:         pointer to '}' on success;
   1048                  current ptr on error, with errorcodeptr set non-zero
   1049 */
   1050 
   1051 static const uschar *
   1052 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
   1053 {
   1054 int min = 0;
   1055 int max = -1;
   1056 
   1057 /* Read the minimum value and do a paranoid check: a negative value indicates
   1058 an integer overflow. */
   1059 
   1060 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
   1061 if (min < 0 || min > 65535)
   1062   {
   1063   *errorcodeptr = ERR5;
   1064   return p;
   1065   }
   1066 
   1067 /* Read the maximum value if there is one, and again do a paranoid on its size.
   1068 Also, max must not be less than min. */
   1069 
   1070 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
   1071   {
   1072   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
   1073     {
   1074     max = 0;
   1075     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
   1076     if (max < 0 || max > 65535)
   1077       {
   1078       *errorcodeptr = ERR5;
   1079       return p;
   1080       }
   1081     if (max < min)
   1082       {
   1083       *errorcodeptr = ERR4;
   1084       return p;
   1085       }
   1086     }
   1087   }
   1088 
   1089 /* Fill in the required variables, and pass back the pointer to the terminating
   1090 '}'. */
   1091 
   1092 *minp = min;
   1093 *maxp = max;
   1094 return p;
   1095 }
   1096 
   1097 
   1098 
   1099 /*************************************************
   1100 *  Subroutine for finding forward reference      *
   1101 *************************************************/
   1102 
   1103 /* This recursive function is called only from find_parens() below. The
   1104 top-level call starts at the beginning of the pattern. All other calls must
   1105 start at a parenthesis. It scans along a pattern's text looking for capturing
   1106 subpatterns, and counting them. If it finds a named pattern that matches the
   1107 name it is given, it returns its number. Alternatively, if the name is NULL, it
   1108 returns when it reaches a given numbered subpattern. Recursion is used to keep
   1109 track of subpatterns that reset the capturing group numbers - the (?| feature.
   1110 
   1111 This function was originally called only from the second pass, in which we know
   1112 that if (?< or (?' or (?P< is encountered, the name will be correctly
   1113 terminated because that is checked in the first pass. There is now one call to
   1114 this function in the first pass, to check for a recursive back reference by
   1115 name (so that we can make the whole group atomic). In this case, we need check
   1116 only up to the current position in the pattern, and that is still OK because
   1117 and previous occurrences will have been checked. To make this work, the test
   1118 for "end of pattern" is a check against cd->end_pattern in the main loop,
   1119 instead of looking for a binary zero. This means that the special first-pass
   1120 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
   1121 processing items within the loop are OK, because afterwards the main loop will
   1122 terminate.)
   1123 
   1124 Arguments:
   1125   ptrptr       address of the current character pointer (updated)
   1126   cd           compile background data
   1127   name         name to seek, or NULL if seeking a numbered subpattern
   1128   lorn         name length, or subpattern number if name is NULL
   1129   xmode        TRUE if we are in /x mode
   1130   utf8         TRUE if we are in UTF-8 mode
   1131   count        pointer to the current capturing subpattern number (updated)
   1132 
   1133 Returns:       the number of the named subpattern, or -1 if not found
   1134 */
   1135 
   1136 static int
   1137 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
   1138   BOOL xmode, BOOL utf8, int *count)
   1139 {
   1140 uschar *ptr = *ptrptr;
   1141 int start_count = *count;
   1142 int hwm_count = start_count;
   1143 BOOL dup_parens = FALSE;
   1144 
   1145 /* If the first character is a parenthesis, check on the type of group we are
   1146 dealing with. The very first call may not start with a parenthesis. */
   1147 
   1148 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
   1149   {
   1150   /* Handle specials such as (*SKIP) or (*UTF8) etc. */
   1151 
   1152   if (ptr[1] == CHAR_ASTERISK) ptr += 2;
   1153 
   1154   /* Handle a normal, unnamed capturing parenthesis. */
   1155 
   1156   else if (ptr[1] != CHAR_QUESTION_MARK)
   1157     {
   1158     *count += 1;
   1159     if (name == NULL && *count == lorn) return *count;
   1160     ptr++;
   1161     }
   1162 
   1163   /* All cases now have (? at the start. Remember when we are in a group
   1164   where the parenthesis numbers are duplicated. */
   1165 
   1166   else if (ptr[2] == CHAR_VERTICAL_LINE)
   1167     {
   1168     ptr += 3;
   1169     dup_parens = TRUE;
   1170     }
   1171 
   1172   /* Handle comments; all characters are allowed until a ket is reached. */
   1173 
   1174   else if (ptr[2] == CHAR_NUMBER_SIGN)
   1175     {
   1176     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
   1177     goto FAIL_EXIT;
   1178     }
   1179 
   1180   /* Handle a condition. If it is an assertion, just carry on so that it
   1181   is processed as normal. If not, skip to the closing parenthesis of the
   1182   condition (there can't be any nested parens). */
   1183 
   1184   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
   1185     {
   1186     ptr += 2;
   1187     if (ptr[1] != CHAR_QUESTION_MARK)
   1188       {
   1189       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
   1190       if (*ptr != 0) ptr++;
   1191       }
   1192     }
   1193 
   1194   /* Start with (? but not a condition. */
   1195 
   1196   else
   1197     {
   1198     ptr += 2;
   1199     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
   1200 
   1201     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
   1202 
   1203     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
   1204         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
   1205       {
   1206       int term;
   1207       const uschar *thisname;
   1208       *count += 1;
   1209       if (name == NULL && *count == lorn) return *count;
   1210       term = *ptr++;
   1211       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
   1212       thisname = ptr;
   1213       while (*ptr != term) ptr++;
   1214       if (name != NULL && lorn == ptr - thisname &&
   1215           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
   1216         return *count;
   1217       term++;
   1218       }
   1219     }
   1220   }
   1221 
   1222 /* Past any initial parenthesis handling, scan for parentheses or vertical
   1223 bars. Stop if we get to cd->end_pattern. Note that this is important for the
   1224 first-pass call when this value is temporarily adjusted to stop at the current
   1225 position. So DO NOT change this to a test for binary zero. */
   1226 
   1227 for (; ptr < cd->end_pattern; ptr++)
   1228   {
   1229   /* Skip over backslashed characters and also entire \Q...\E */
   1230 
   1231   if (*ptr == CHAR_BACKSLASH)
   1232     {
   1233     if (*(++ptr) == 0) goto FAIL_EXIT;
   1234     if (*ptr == CHAR_Q) for (;;)
   1235       {
   1236       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
   1237       if (*ptr == 0) goto FAIL_EXIT;
   1238       if (*(++ptr) == CHAR_E) break;
   1239       }
   1240     continue;
   1241     }
   1242 
   1243   /* Skip over character classes; this logic must be similar to the way they
   1244   are handled for real. If the first character is '^', skip it. Also, if the
   1245   first few characters (either before or after ^) are \Q\E or \E we skip them
   1246   too. This makes for compatibility with Perl. Note the use of STR macros to
   1247   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
   1248 
   1249   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
   1250     {
   1251     BOOL negate_class = FALSE;
   1252     for (;;)
   1253       {
   1254       if (ptr[1] == CHAR_BACKSLASH)
   1255         {
   1256         if (ptr[2] == CHAR_E)
   1257           ptr+= 2;
   1258         else if (strncmp((const char *)ptr+2,
   1259                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
   1260           ptr += 4;
   1261         else
   1262           break;
   1263         }
   1264       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
   1265         {
   1266         negate_class = TRUE;
   1267         ptr++;
   1268         }
   1269       else break;
   1270       }
   1271 
   1272     /* If the next character is ']', it is a data character that must be
   1273     skipped, except in JavaScript compatibility mode. */
   1274 
   1275     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
   1276         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
   1277       ptr++;
   1278 
   1279     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
   1280       {
   1281       if (*ptr == 0) return -1;
   1282       if (*ptr == CHAR_BACKSLASH)
   1283         {
   1284         if (*(++ptr) == 0) goto FAIL_EXIT;
   1285         if (*ptr == CHAR_Q) for (;;)
   1286           {
   1287           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
   1288           if (*ptr == 0) goto FAIL_EXIT;
   1289           if (*(++ptr) == CHAR_E) break;
   1290           }
   1291         continue;
   1292         }
   1293       }
   1294     continue;
   1295     }
   1296 
   1297   /* Skip comments in /x mode */
   1298 
   1299   if (xmode && *ptr == CHAR_NUMBER_SIGN)
   1300     {
   1301     ptr++;
   1302     while (*ptr != 0)
   1303       {
   1304       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
   1305       ptr++;
   1306 #ifdef SUPPORT_UTF8
   1307       if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
   1308 #endif
   1309       }
   1310     if (*ptr == 0) goto FAIL_EXIT;
   1311     continue;
   1312     }
   1313 
   1314   /* Check for the special metacharacters */
   1315 
   1316   if (*ptr == CHAR_LEFT_PARENTHESIS)
   1317     {
   1318     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
   1319     if (rc > 0) return rc;
   1320     if (*ptr == 0) goto FAIL_EXIT;
   1321     }
   1322 
   1323   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
   1324     {
   1325     if (dup_parens && *count < hwm_count) *count = hwm_count;
   1326     goto FAIL_EXIT;
   1327     }
   1328 
   1329   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
   1330     {
   1331     if (*count > hwm_count) hwm_count = *count;
   1332     *count = start_count;
   1333     }
   1334   }
   1335 
   1336 FAIL_EXIT:
   1337 *ptrptr = ptr;
   1338 return -1;
   1339 }
   1340 
   1341 
   1342 
   1343 
   1344 /*************************************************
   1345 *       Find forward referenced subpattern       *
   1346 *************************************************/
   1347 
   1348 /* This function scans along a pattern's text looking for capturing
   1349 subpatterns, and counting them. If it finds a named pattern that matches the
   1350 name it is given, it returns its number. Alternatively, if the name is NULL, it
   1351 returns when it reaches a given numbered subpattern. This is used for forward
   1352 references to subpatterns. We used to be able to start this scan from the
   1353 current compiling point, using the current count value from cd->bracount, and
   1354 do it all in a single loop, but the addition of the possibility of duplicate
   1355 subpattern numbers means that we have to scan from the very start, in order to
   1356 take account of such duplicates, and to use a recursive function to keep track
   1357 of the different types of group.
   1358 
   1359 Arguments:
   1360   cd           compile background data
   1361   name         name to seek, or NULL if seeking a numbered subpattern
   1362   lorn         name length, or subpattern number if name is NULL
   1363   xmode        TRUE if we are in /x mode
   1364   utf8         TRUE if we are in UTF-8 mode
   1365 
   1366 Returns:       the number of the found subpattern, or -1 if not found
   1367 */
   1368 
   1369 static int
   1370 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
   1371   BOOL utf8)
   1372 {
   1373 uschar *ptr = (uschar *)cd->start_pattern;
   1374 int count = 0;
   1375 int rc;
   1376 
   1377 /* If the pattern does not start with an opening parenthesis, the first call
   1378 to find_parens_sub() will scan right to the end (if necessary). However, if it
   1379 does start with a parenthesis, find_parens_sub() will return when it hits the
   1380 matching closing parens. That is why we have to have a loop. */
   1381 
   1382 for (;;)
   1383   {
   1384   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
   1385   if (rc > 0 || *ptr++ == 0) break;
   1386   }
   1387 
   1388 return rc;
   1389 }
   1390 
   1391 
   1392 
   1393 
   1394 /*************************************************
   1395 *      Find first significant op code            *
   1396 *************************************************/
   1397 
   1398 /* This is called by several functions that scan a compiled expression looking
   1399 for a fixed first character, or an anchoring op code etc. It skips over things
   1400 that do not influence this. For some calls, a change of option is important.
   1401 For some calls, it makes sense to skip negative forward and all backward
   1402 assertions, and also the \b assertion; for others it does not.
   1403 
   1404 Arguments:
   1405   code         pointer to the start of the group
   1406   options      pointer to external options
   1407   optbit       the option bit whose changing is significant, or
   1408                  zero if none are
   1409   skipassert   TRUE if certain assertions are to be skipped
   1410 
   1411 Returns:       pointer to the first significant opcode
   1412 */
   1413 
   1414 static const uschar*
   1415 first_significant_code(const uschar *code, int *options, int optbit,
   1416   BOOL skipassert)
   1417 {
   1418 for (;;)
   1419   {
   1420   switch ((int)*code)
   1421     {
   1422     case OP_OPT:
   1423     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
   1424       *options = (int)code[1];
   1425     code += 2;
   1426     break;
   1427 
   1428     case OP_ASSERT_NOT:
   1429     case OP_ASSERTBACK:
   1430     case OP_ASSERTBACK_NOT:
   1431     if (!skipassert) return code;
   1432     do code += GET(code, 1); while (*code == OP_ALT);
   1433     code += _pcre_OP_lengths[*code];
   1434     break;
   1435 
   1436     case OP_WORD_BOUNDARY:
   1437     case OP_NOT_WORD_BOUNDARY:
   1438     if (!skipassert) return code;
   1439     /* Fall through */
   1440 
   1441     case OP_CALLOUT:
   1442     case OP_CREF:
   1443     case OP_NCREF:
   1444     case OP_RREF:
   1445     case OP_NRREF:
   1446     case OP_DEF:
   1447     code += _pcre_OP_lengths[*code];
   1448     break;
   1449 
   1450     default:
   1451     return code;
   1452     }
   1453   }
   1454 /* Control never reaches here */
   1455 }
   1456 
   1457 
   1458 
   1459 
   1460 /*************************************************
   1461 *        Find the fixed length of a branch       *
   1462 *************************************************/
   1463 
   1464 /* Scan a branch and compute the fixed length of subject that will match it,
   1465 if the length is fixed. This is needed for dealing with backward assertions.
   1466 In UTF8 mode, the result is in characters rather than bytes. The branch is
   1467 temporarily terminated with OP_END when this function is called.
   1468 
   1469 This function is called when a backward assertion is encountered, so that if it
   1470 fails, the error message can point to the correct place in the pattern.
   1471 However, we cannot do this when the assertion contains subroutine calls,
   1472 because they can be forward references. We solve this by remembering this case
   1473 and doing the check at the end; a flag specifies which mode we are running in.
   1474 
   1475 Arguments:
   1476   code     points to the start of the pattern (the bracket)
   1477   options  the compiling options
   1478   atend    TRUE if called when the pattern is complete
   1479   cd       the "compile data" structure
   1480 
   1481 Returns:   the fixed length,
   1482              or -1 if there is no fixed length,
   1483              or -2 if \C was encountered
   1484              or -3 if an OP_RECURSE item was encountered and atend is FALSE
   1485 */
   1486 
   1487 static int
   1488 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
   1489 {
   1490 int length = -1;   /* Length agreed by the branches seen so far; -1 = none yet */
   1491 
   1492 register int branchlength = 0;   /* Length accumulated for the current branch */
   1493 register uschar *cc = code + 1 + LINK_SIZE;   /* Step past the opening opcode and its link */
   1494 
   1495 /* Scan along the opcodes for this branch. If we get to the end of the
   1496 branch, check the length against that of the other branches. */
   1497 
   1498 for (;;)
   1499   {
   1500   int d;
   1501   uschar *ce, *cs;
   1502   register int op = *cc;
   1503   switch (op)
   1504     {
            /* Nested groups: recurse to get the group's own fixed length. For
            OP_CBRA the pointer passed down is advanced by 2 so that the nested
            call's initial skip of 1 + LINK_SIZE also passes the 2-byte bracket
            number. Afterwards, hop over each alternative until the item that
            closes the group is reached, and then step past that item. */

   1505     case OP_CBRA:
   1506     case OP_BRA:
   1507     case OP_ONCE:
   1508     case OP_COND:
   1509     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
   1510     if (d < 0) return d;
   1511     branchlength += d;
   1512     do cc += GET(cc, 1); while (*cc == OP_ALT);
   1513     cc += 1 + LINK_SIZE;
   1514     break;
   1515 
   1516     /* Reached end of a branch; if it's a ket it is the end of a nested
   1517     call. If it's ALT it is an alternation in a nested call. If it is
   1518     END it's the end of the outer call. All can be handled by the same code.
            The first branch to finish sets "length"; every later branch must
            have exactly the same length or the result is -1. */
   1519 
   1520     case OP_ALT:
   1521     case OP_KET:
   1522     case OP_KETRMAX:
   1523     case OP_KETRMIN:
   1524     case OP_END:
   1525     if (length < 0) length = branchlength;
   1526       else if (length != branchlength) return -1;
   1527     if (*cc != OP_ALT) return length;
   1528     cc += 1 + LINK_SIZE;
   1529     branchlength = 0;
   1530     break;
   1531 
   1532     /* A true recursion implies not fixed length, but a subroutine call may
   1533     be OK. If the subroutine is a forward reference, we can't deal with
   1534     it until the end of the pattern, so return -3. The "cs + 2" passed to
   1535     the nested call is the same adjustment as is used for OP_CBRA above, so
            this presumably relies on the called group being a capturing bracket
            (NOTE(review): confirm against the code that emits OP_RECURSE). */
   1535 
   1536     case OP_RECURSE:
   1537     if (!atend) return -3;
   1538     cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
   1539     do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
   1540     if (cc > cs && cc < ce) return -1;                /* Recursion */
   1541     d = find_fixedlength(cs + 2, options, atend, cd);
   1542     if (d < 0) return d;
   1543     branchlength += d;
   1544     cc += 1 + LINK_SIZE;
   1545     break;
   1546 
   1547     /* Skip over assertive subpatterns. After the loop cc is at the item
            that ends the assertion; falling through adds that item's length
            from the table, so the whole assertion contributes no characters. */
   1548 
   1549     case OP_ASSERT:
   1550     case OP_ASSERT_NOT:
   1551     case OP_ASSERTBACK:
   1552     case OP_ASSERTBACK_NOT:
   1553     do cc += GET(cc, 1); while (*cc == OP_ALT);
   1554     /* Fall through */
   1555 
   1556     /* Skip over things that don't match chars */
   1557 
   1558     case OP_REVERSE:
   1559     case OP_CREF:
   1560     case OP_NCREF:
   1561     case OP_RREF:
   1562     case OP_NRREF:
   1563     case OP_DEF:
   1564     case OP_OPT:
   1565     case OP_CALLOUT:
   1566     case OP_SOD:
   1567     case OP_SOM:
   1568     case OP_SET_SOM:
   1569     case OP_EOD:
   1570     case OP_EODN:
   1571     case OP_CIRC:
   1572     case OP_DOLL:
   1573     case OP_NOT_WORD_BOUNDARY:
   1574     case OP_WORD_BOUNDARY:
   1575     cc += _pcre_OP_lengths[*cc];
   1576     break;
   1577 
   1578     /* Handle literal characters. The item is the opcode plus one byte; in
            UTF-8 mode a lead byte of 0xc0 or more means extra bytes follow, and
            their number is looked up in _pcre_utf8_table4[]. */
   1579 
   1580     case OP_CHAR:
   1581     case OP_CHARNC:
   1582     case OP_NOT:
   1583     branchlength++;
   1584     cc += 2;
   1585 #ifdef SUPPORT_UTF8
   1586     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
   1587       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
   1588 #endif
   1589     break;
   1590 
   1591     /* Handle exact repetitions. The count is already in characters, but we
   1592     need to skip over a multibyte character in UTF8 mode. The item is the
            opcode, a 2-byte count, and the first byte of the character. */
   1593 
   1594     case OP_EXACT:
   1595     branchlength += GET2(cc,1);
   1596     cc += 4;
   1597 #ifdef SUPPORT_UTF8
   1598     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
   1599       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
   1600 #endif
   1601     break;
   1602 
            /* An exactly repeated character type: opcode, 2-byte count, type
            opcode. A property type (\p or \P) carries two more bytes. */

   1603     case OP_TYPEEXACT:
   1604     branchlength += GET2(cc,1);
   1605     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
   1606     cc += 4;
   1607     break;
   1608 
   1609     /* Handle single-char matchers. A property item is two bytes longer than
            the others, so those are skipped first and the common code below
            then steps over the opcode itself. */
   1610 
   1611     case OP_PROP:
   1612     case OP_NOTPROP:
   1613     cc += 2;
   1614     /* Fall through */
   1615 
   1616     case OP_NOT_DIGIT:
   1617     case OP_DIGIT:
   1618     case OP_NOT_WHITESPACE:
   1619     case OP_WHITESPACE:
   1620     case OP_NOT_WORDCHAR:
   1621     case OP_WORDCHAR:
   1622     case OP_ANY:
   1623     case OP_ALLANY:
   1624     branchlength++;
   1625     cc++;
   1626     break;
   1627 
   1628     /* The single-byte matcher isn't allowed */
   1629 
   1630     case OP_ANYBYTE:
   1631     return -2;
   1632 
   1633     /* Check a class for variable quantification. For XCLASS the total item
            length is read from the compiled code; 33 is subtracted here so that
            the shared "cc += 33" below leaves cc just after the item. (33 is
            presumably the opcode plus a 32-byte bit map - confirm against
            _pcre_OP_lengths[].) */
   1634 
   1635 #ifdef SUPPORT_UTF8
   1636     case OP_XCLASS:
   1637     cc += GET(cc, 1) - 33;
   1638     /* Fall through */
   1639 #endif
   1640 
   1641     case OP_CLASS:
   1642     case OP_NCLASS:
   1643     cc += 33;
   1644 
            /* cc now points at whatever follows the class. A range repeat with
            equal minimum and maximum is a fixed count. Any other item takes the
            default: the class counts as one character and cc is NOT advanced,
            so the following item is examined by the outer switch on the next
            pass. A repeat opcode with no case in the outer switch (OP_CRPLUS,
            for example) therefore ends up in the outer default and yields -1. */

   1645     switch (*cc)
   1646       {
   1647       case OP_CRSTAR:
   1648       case OP_CRMINSTAR:
   1649       case OP_CRQUERY:
   1650       case OP_CRMINQUERY:
   1651       return -1;
   1652 
   1653       case OP_CRRANGE:
   1654       case OP_CRMINRANGE:
   1655       if (GET2(cc,1) != GET2(cc,3)) return -1;
   1656       branchlength += GET2(cc,1);
   1657       cc += 5;
   1658       break;
   1659 
   1660       default:
   1661       branchlength++;
   1662       }
   1663     break;
   1664 
   1665     /* Anything else is variable length */
   1666 
   1667     default:
   1668     return -1;
   1669     }
   1670   }
   1671 /* Control never gets here */
   1672 }
   1673 
   1674 
   1675 
   1676 
   1677 /*************************************************
   1678 *    Scan compiled regex for specific bracket    *
   1679 *************************************************/
   1680 
   1681 /* This little function scans through a compiled pattern until it finds a
   1682 capturing bracket with the given number, or, if the number is negative, an
   1683 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
   1684 so that it can be called from pcre_study() when finding the minimum matching
   1685 length.
   1686 
   1687 Arguments:
   1688   code        points to start of expression
   1689   utf8        TRUE in UTF-8 mode
   1690   number      the required bracket number or negative to find a lookbehind
   1691 
   1692 Returns:      pointer to the opcode for the bracket, or NULL if not found
   1693 */
   1694 
   1695 const uschar *
   1696 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
   1697 {
   1698 for (;;)
   1699   {
   1700   register int c = *code;
   1701   if (c == OP_END) return NULL;
   1702 
   1703   /* XCLASS is used for classes that cannot be represented just by a bit
   1704   map. This includes negated single high-valued characters. The length in
   1705   the table is zero; the actual length is stored in the compiled code. */
   1706 
   1707   if (c == OP_XCLASS) code += GET(code, 1);
   1708 
   1709   /* Handle recursion */
   1710 
   1711   else if (c == OP_REVERSE)
   1712     {
   1713     if (number < 0) return (uschar *)code;
   1714     code += _pcre_OP_lengths[c];
   1715     }
   1716 
   1717   /* Handle capturing bracket */
   1718 
   1719   else if (c == OP_CBRA)
   1720     {
   1721     int n = GET2(code, 1+LINK_SIZE);
   1722     if (n == number) return (uschar *)code;
   1723     code += _pcre_OP_lengths[c];
   1724     }
   1725 
   1726   /* Otherwise, we can get the item's length from the table, except that for
   1727   repeated character types, we have to test for \p and \P, which have an extra
   1728   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
   1729   must add in its length. */
   1730 
   1731   else
   1732     {
   1733     switch(c)
   1734       {
   1735       case OP_TYPESTAR:
   1736       case OP_TYPEMINSTAR:
   1737       case OP_TYPEPLUS:
   1738       case OP_TYPEMINPLUS:
   1739       case OP_TYPEQUERY:
   1740       case OP_TYPEMINQUERY:
   1741       case OP_TYPEPOSSTAR:
   1742       case OP_TYPEPOSPLUS:
   1743       case OP_TYPEPOSQUERY:
   1744       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   1745       break;
   1746 
   1747       case OP_TYPEUPTO:
   1748       case OP_TYPEMINUPTO:
   1749       case OP_TYPEEXACT:
   1750       case OP_TYPEPOSUPTO:
   1751       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
   1752       break;
   1753 
   1754       case OP_MARK:
   1755       case OP_PRUNE_ARG:
   1756       case OP_SKIP_ARG:
   1757       code += code[1];
   1758       break;
   1759 
   1760       case OP_THEN_ARG:
   1761       code += code[1+LINK_SIZE];
   1762       break;
   1763       }
   1764 
   1765     /* Add in the fixed length from the table */
   1766 
   1767     code += _pcre_OP_lengths[c];
   1768 
   1769   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
   1770   a multi-byte character. The length in the table is a minimum, so we have to
   1771   arrange to skip the extra bytes. */
   1772 
   1773 #ifdef SUPPORT_UTF8
   1774     if (utf8) switch(c)
   1775       {
   1776       case OP_CHAR:
   1777       case OP_CHARNC:
   1778       case OP_EXACT:
   1779       case OP_UPTO:
   1780       case OP_MINUPTO:
   1781       case OP_POSUPTO:
   1782       case OP_STAR:
   1783       case OP_MINSTAR:
   1784       case OP_POSSTAR:
   1785       case OP_PLUS:
   1786       case OP_MINPLUS:
   1787       case OP_POSPLUS:
   1788       case OP_QUERY:
   1789       case OP_MINQUERY:
   1790       case OP_POSQUERY:
   1791       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   1792       break;
   1793       }
   1794 #else
   1795     (void)(utf8);  /* Keep compiler happy by referencing function argument */
   1796 #endif
   1797     }
   1798   }
   1799 }
   1800 
   1801 
   1802 
   1803 /*************************************************
   1804 *   Scan compiled regex for recursion reference  *
   1805 *************************************************/
   1806 
   1807 /* This little function scans through a compiled pattern until it finds an
   1808 instance of OP_RECURSE.
   1809 
   1810 Arguments:
   1811   code        points to start of expression
   1812   utf8        TRUE in UTF-8 mode
   1813 
   1814 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
   1815 */
   1816 
   1817 static const uschar *
   1818 find_recurse(const uschar *code, BOOL utf8)
   1819 {
   1820 for (;;)
   1821   {
   1822   register int c = *code;
   1823   if (c == OP_END) return NULL;
   1824   if (c == OP_RECURSE) return code;
   1825 
   1826   /* XCLASS is used for classes that cannot be represented just by a bit
   1827   map. This includes negated single high-valued characters. The length in
   1828   the table is zero; the actual length is stored in the compiled code. */
   1829 
   1830   if (c == OP_XCLASS) code += GET(code, 1);
   1831 
   1832   /* Otherwise, we can get the item's length from the table, except that for
   1833   repeated character types, we have to test for \p and \P, which have an extra
   1834   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
   1835   must add in its length. */
   1836 
   1837   else
   1838     {
   1839     switch(c)
   1840       {
   1841       case OP_TYPESTAR:
   1842       case OP_TYPEMINSTAR:
   1843       case OP_TYPEPLUS:
   1844       case OP_TYPEMINPLUS:
   1845       case OP_TYPEQUERY:
   1846       case OP_TYPEMINQUERY:
   1847       case OP_TYPEPOSSTAR:
   1848       case OP_TYPEPOSPLUS:
   1849       case OP_TYPEPOSQUERY:
   1850       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   1851       break;
   1852 
   1853       case OP_TYPEPOSUPTO:
   1854       case OP_TYPEUPTO:
   1855       case OP_TYPEMINUPTO:
   1856       case OP_TYPEEXACT:
   1857       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
   1858       break;
   1859 
   1860       case OP_MARK:
   1861       case OP_PRUNE_ARG:
   1862       case OP_SKIP_ARG:
   1863       code += code[1];
   1864       break;
   1865 
   1866       case OP_THEN_ARG:
   1867       code += code[1+LINK_SIZE];
   1868       break;
   1869       }
   1870 
   1871     /* Add in the fixed length from the table */
   1872 
   1873     code += _pcre_OP_lengths[c];
   1874 
   1875     /* In UTF-8 mode, opcodes that are followed by a character may be followed
   1876     by a multi-byte character. The length in the table is a minimum, so we have
   1877     to arrange to skip the extra bytes. */
   1878 
   1879 #ifdef SUPPORT_UTF8
   1880     if (utf8) switch(c)
   1881       {
   1882       case OP_CHAR:
   1883       case OP_CHARNC:
   1884       case OP_EXACT:
   1885       case OP_UPTO:
   1886       case OP_MINUPTO:
   1887       case OP_POSUPTO:
   1888       case OP_STAR:
   1889       case OP_MINSTAR:
   1890       case OP_POSSTAR:
   1891       case OP_PLUS:
   1892       case OP_MINPLUS:
   1893       case OP_POSPLUS:
   1894       case OP_QUERY:
   1895       case OP_MINQUERY:
   1896       case OP_POSQUERY:
   1897       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   1898       break;
   1899       }
   1900 #else
   1901     (void)(utf8);  /* Keep compiler happy by referencing function argument */
   1902 #endif
   1903     }
   1904   }
   1905 }
   1906 
   1907 
   1908 
   1909 /*************************************************
   1910 *    Scan compiled branch for non-emptiness      *
   1911 *************************************************/
   1912 
   1913 /* This function scans through a branch of a compiled pattern to see whether it
   1914 can match the empty string or not. It is called from could_be_empty()
   1915 below and from compile_branch() when checking for an unlimited repeat of a
   1916 group that can match nothing. Note that first_significant_code() skips over
   1917 backward and negative forward assertions when its final argument is TRUE. If we
   1918 hit an unclosed bracket, we return "empty" - this means we've struck an inner
   1919 bracket whose current branch will already have been scanned.
   1920 
   1921 Arguments:
   1922   code        points to start of search
   1923   endcode     points to where to stop
   1924   utf8        TRUE if in UTF8 mode
   1925   cd          contains pointers to tables etc.
   1926 
   1927 Returns:      TRUE if what is matched could be empty
   1928 */
   1929 
   1930 static BOOL
   1931 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
   1932   compile_data *cd)
   1933 {
   1934 register int c;
        /* The loop increment steps over the item whose opcode is in c, using the
        length table, and then lets first_significant_code() skip anything that
        is not significant. Consequently every path that moves "code" itself and
        then uses "continue" must leave c describing the item at the new position
        (or, for XCLASS and OP_RECURSE, an opcode whose table length gives the
        right remaining step). */
   1935 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
   1936      code < endcode;
   1937      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
   1938   {
   1939   const uschar *ccode;
   1940 
   1941   c = *code;
   1942 
   1943   /* Skip over forward assertions; the other assertions are skipped by
   1944   first_significant_code() with a TRUE final argument. After the loop, code
          is at the item that ends the assertion; c is refreshed so that the loop
          increment steps over that item. */
   1945 
   1946   if (c == OP_ASSERT)
   1947     {
   1948     do code += GET(code, 1); while (*code == OP_ALT);
   1949     c = *code;
   1950     continue;
   1951     }
   1952 
   1953   /* Groups with zero repeats can of course be empty; skip them. The zero
          opcode itself is stepped over first, then every alternative of the
          group that follows it. */
   1954 
   1955   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
   1956     {
   1957     code += _pcre_OP_lengths[c];
   1958     do code += GET(code, 1); while (*code == OP_ALT);
   1959     c = *code;
   1960     continue;
   1961     }
   1962 
   1963   /* For a recursion/subroutine call, if its end has been reached, which
   1964   implies a subroutine call, we can scan it. A zero link in the called
          group means it is still unclosed, and that is treated as "could be
          empty". Otherwise each alternative of the called group is checked in
          turn, stopping at the first that could be empty. On "continue", c is
          still OP_RECURSE, so the loop increment steps over the call item. */
   1965 
   1966   if (c == OP_RECURSE)
   1967     {
   1968     BOOL empty_branch = FALSE;
   1969     const uschar *scode = cd->start_code + GET(code, 1);
   1970     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
   1971     do
   1972       {
   1973       if (could_be_empty_branch(scode, endcode, utf8, cd))
   1974         {
   1975         empty_branch = TRUE;
   1976         break;
   1977         }
   1978       scode += GET(scode, 1);
   1979       }
   1980     while (*scode == OP_ALT);
   1981     if (!empty_branch) return FALSE;  /* All branches are non-empty */
   1982     continue;
   1983     }
   1984 
   1985   /* For other groups, scan the branches. */
   1986 
   1987   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
   1988     {
   1989     BOOL empty_branch;
   1990     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
   1991 
   1992     /* If a conditional group has only one branch, there is a second, implied,
   1993     empty branch, so just skip over the conditional, because it could be empty.
   1994     Otherwise, scan the individual branches of the group. The loop keeps
            stepping through all the alternatives even after an empty one has
            been found (only the recursive check is skipped), so that code ends
            up at the item that closes the group. */
   1995 
   1996     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
   1997       code += GET(code, 1);
   1998     else
   1999       {
   2000       empty_branch = FALSE;
   2001       do
   2002         {
   2003         if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
   2004           empty_branch = TRUE;
   2005         code += GET(code, 1);
   2006         }
   2007       while (*code == OP_ALT);
   2008       if (!empty_branch) return FALSE;   /* All branches are non-empty */
   2009       }
   2010 
   2011     c = *code;
   2012     continue;
   2013     }
   2014 
   2015   /* Handle the other opcodes */
   2016 
   2017   switch (c)
   2018     {
   2019     /* Check for quantifiers after a class. XCLASS is used for classes that
   2020     cannot be represented just by a bit map. This includes negated single
   2021     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
   2022     actual length is stored in the compiled code, so we must update "code"
   2023     here. For the other class types "code" is left alone (the table length
            takes care of the step) and only ccode is pointed at the item that
            follows the class. */
   2024 
   2025 #ifdef SUPPORT_UTF8
   2026     case OP_XCLASS:
   2027     ccode = code += GET(code, 1);
   2028     goto CHECK_CLASS_REPEAT;
   2029 #endif
   2030 
   2031     case OP_CLASS:
   2032     case OP_NCLASS:
   2033     ccode = code + 33;
   2034 
   2035 #ifdef SUPPORT_UTF8
   2036     CHECK_CLASS_REPEAT:
   2037 #endif
   2038 
            /* ccode points at whatever follows the class; if that is a repeat
            whose minimum is zero, the class does not have to match anything. */

   2039     switch (*ccode)
   2040       {
   2041       case OP_CRSTAR:            /* These could be empty; continue */
   2042       case OP_CRMINSTAR:
   2043       case OP_CRQUERY:
   2044       case OP_CRMINQUERY:
   2045       break;
   2046 
   2047       default:                   /* Non-repeat => class must match */
   2048       case OP_CRPLUS:            /* These repeats aren't empty */
   2049       case OP_CRMINPLUS:
   2050       return FALSE;
   2051 
   2052       case OP_CRRANGE:
   2053       case OP_CRMINRANGE:
   2054       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
   2055       break;
   2056       }
   2057     break;
   2058 
   2059     /* Opcodes that must match a character */
   2060 
   2061     case OP_PROP:
   2062     case OP_NOTPROP:
   2063     case OP_EXTUNI:
   2064     case OP_NOT_DIGIT:
   2065     case OP_DIGIT:
   2066     case OP_NOT_WHITESPACE:
   2067     case OP_WHITESPACE:
   2068     case OP_NOT_WORDCHAR:
   2069     case OP_WORDCHAR:
   2070     case OP_ANY:
   2071     case OP_ALLANY:
   2072     case OP_ANYBYTE:
   2073     case OP_CHAR:
   2074     case OP_CHARNC:
   2075     case OP_NOT:
   2076     case OP_PLUS:
   2077     case OP_MINPLUS:
   2078     case OP_POSPLUS:
   2079     case OP_EXACT:
   2080     case OP_NOTPLUS:
   2081     case OP_NOTMINPLUS:
   2082     case OP_NOTPOSPLUS:
   2083     case OP_NOTEXACT:
   2084     case OP_TYPEPLUS:
   2085     case OP_TYPEMINPLUS:
   2086     case OP_TYPEPOSPLUS:
   2087     case OP_TYPEEXACT:
   2088     return FALSE;
   2089 
   2090     /* These are going to continue, as they may be empty, but we have to
   2091     fudge the length for the \p and \P cases. */
   2092 
   2093     case OP_TYPESTAR:
   2094     case OP_TYPEMINSTAR:
   2095     case OP_TYPEPOSSTAR:
   2096     case OP_TYPEQUERY:
   2097     case OP_TYPEMINQUERY:
   2098     case OP_TYPEPOSQUERY:
   2099     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
   2100     break;
   2101 
   2102     /* Same for these */
   2103 
   2104     case OP_TYPEUPTO:
   2105     case OP_TYPEMINUPTO:
   2106     case OP_TYPEPOSUPTO:
   2107     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
   2108     break;
   2109 
   2110     /* End of branch */
   2111 
   2112     case OP_KET:
   2113     case OP_KETRMAX:
   2114     case OP_KETRMIN:
   2115     case OP_ALT:
   2116     return TRUE;
   2117 
   2118     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
   2119     MINUPTO, and POSUPTO may be followed by a multibyte character. Only the
            extra bytes are added here; the loop increment adds the table length. */
   2120 
   2121 #ifdef SUPPORT_UTF8
   2122     case OP_STAR:
   2123     case OP_MINSTAR:
   2124     case OP_POSSTAR:
   2125     case OP_QUERY:
   2126     case OP_MINQUERY:
   2127     case OP_POSQUERY:
   2128     if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
   2129     break;
   2130 
   2131     case OP_UPTO:
   2132     case OP_MINUPTO:
   2133     case OP_POSUPTO:
   2134     if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
   2135     break;
   2136 #endif
   2137 
   2138     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
   2139     string. */
   2140 
   2141     case OP_MARK:
   2142     case OP_PRUNE_ARG:
   2143     case OP_SKIP_ARG:
   2144     code += code[1];
   2145     break;
   2146 
   2147     case OP_THEN_ARG:
   2148     code += code[1+LINK_SIZE];
   2149     break;
   2150 
   2151     /* None of the remaining opcodes are required to match a character. */
   2152 
   2153     default:
   2154     break;
   2155     }
   2156   }
   2157 
        /* Reached endcode without meeting anything that must match a character. */
   2158 return TRUE;
   2159 }
   2160 
   2161 
   2162 
   2163 /*************************************************
   2164 *    Scan compiled regex for non-emptiness       *
   2165 *************************************************/
   2166 
   2167 /* This function is called to check for left recursive calls. We want to check
   2168 the current branch of the current pattern to see if it could match the empty
   2169 string. If it could, we must look outwards for branches at other levels,
   2170 stopping when we pass beyond the bracket which is the subject of the recursion.
   2171 
   2172 Arguments:
   2173   code        points to start of the recursion
   2174   endcode     points to where to stop (current RECURSE item)
   2175   bcptr       points to the chain of current (unclosed) branch starts
   2176   utf8        TRUE if in UTF-8 mode
   2177   cd          pointers to tables etc
   2178 
   2179 Returns:      TRUE if what is matched could be empty
   2180 */
   2181 
   2182 static BOOL
   2183 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
   2184   BOOL utf8, compile_data *cd)
   2185 {
   2186 while (bcptr != NULL && bcptr->current_branch >= code)
   2187   {
   2188   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
   2189     return FALSE;
   2190   bcptr = bcptr->outer;
   2191   }
   2192 return TRUE;
   2193 }
   2194 
   2195 
   2196 
   2197 /*************************************************
   2198 *           Check for POSIX class syntax         *
   2199 *************************************************/
   2200 
   2201 /* This function is called when the sequence "[:" or "[." or "[=" is
   2202 encountered in a character class. It checks whether this is followed by a
   2203 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
   2204 reach an unescaped ']' without the special preceding character, return FALSE.
   2205 
   2206 Originally, this function only recognized a sequence of letters between the
   2207 terminators, but it seems that Perl recognizes any sequence of characters,
   2208 though of course unknown POSIX names are subsequently rejected. Perl gives an
   2209 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
   2210 didn't consider this to be a POSIX class. Likewise for [:1234:].
   2211 
   2212 The problem in trying to be exactly like Perl is in the handling of escapes. We
   2213 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
   2214 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
   2215 below handles the special case of \], but does not try to do any other escape
   2216 processing. This makes it different from Perl for cases such as [:l\ower:]
   2217 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
   2218 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
   2219 I think.
   2220 
   2221 Arguments:
   2222   ptr      pointer to the initial [
   2223   endptr   where to return the end pointer
   2224 
   2225 Returns:   TRUE or FALSE
   2226 */
   2227 
   2228 static BOOL
   2229 check_posix_syntax(const uschar *ptr, const uschar **endptr)
   2230 {
   2231 int terminator;          /* Don't combine these lines; the Solaris cc */
   2232 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
   2233 for (++ptr; *ptr != 0; ptr++)
   2234   {
   2235   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
   2236     {
   2237     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
   2238     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
   2239       {
   2240       *endptr = ptr;
   2241       return TRUE;
   2242       }
   2243     }
   2244   }
   2245 return FALSE;
   2246 }
   2247 
   2248 
   2249 
   2250 
   2251 /*************************************************
   2252 *          Check POSIX class name                *
   2253 *************************************************/
   2254 
   2255 /* This function is called to check the name given in a POSIX-style class entry
   2256 such as [:alnum:].
   2257 
   2258 Arguments:
   2259   ptr        points to the first letter
   2260   len        the length of the name
   2261 
   2262 Returns:     a value representing the name, or -1 if unknown
   2263 */
   2264 
   2265 static int
   2266 check_posix_name(const uschar *ptr, int len)
   2267 {
   2268 const char *pn = posix_names;
   2269 register int yield = 0;
   2270 while (posix_name_lengths[yield] != 0)
   2271   {
   2272   if (len == posix_name_lengths[yield] &&
   2273     strncmp((const char *)ptr, pn, len) == 0) return yield;
   2274   pn += posix_name_lengths[yield] + 1;
   2275   yield++;
   2276   }
   2277 return -1;
   2278 }
   2279 
   2280 
   2281 /*************************************************
   2282 *    Adjust OP_RECURSE items in repeated group   *
   2283 *************************************************/
   2284 
   2285 /* OP_RECURSE items contain an offset from the start of the regex to the group
   2286 that is referenced. This means that groups can be replicated for fixed
   2287 repetition simply by copying (because the recursion is allowed to refer to
   2288 earlier groups that are outside the current group). However, when a group is
   2289 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
   2290 inserted before it, after it has been compiled. This means that any OP_RECURSE
   2291 items within it that refer to the group itself or any contained groups have to
   2292 have their offsets adjusted. That one of the jobs of this function. Before it
   2293 is called, the partially compiled regex must be temporarily terminated with
   2294 OP_END.
   2295 
   2296 This function has been extended with the possibility of forward references for
   2297 recursions and subroutine calls. It must also check the list of such references
   2298 for the group we are dealing with. If it finds that one of the recursions in
   2299 the current group is on this list, it adjusts the offset in the list, not the
   2300 value in the reference (which is a group number).
   2301 
   2302 Arguments:
   2303   group      points to the start of the group
   2304   adjust     the amount by which the group is to be moved
   2305   utf8       TRUE in UTF-8 mode
   2306   cd         contains pointers to tables etc.
   2307   save_hwm   the hwm forward reference pointer at the start of the group
   2308 
   2309 Returns:     nothing
   2310 */
   2311 
   2312 static void
   2313 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
   2314   uschar *save_hwm)
   2315 {
   2316 uschar *ptr = group;
   2317 
   2318 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
   2319   {
   2320   int offset;
   2321   uschar *hc;
   2322 
   2323   /* See if this recursion is on the forward reference list. If so, adjust the
   2324   reference. */
   2325 
   2326   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
   2327     {
   2328     offset = GET(hc, 0);
   2329     if (cd->start_code + offset == ptr + 1)
   2330       {
   2331       PUT(hc, 0, offset + adjust);
   2332       break;
   2333       }
   2334     }
   2335 
   2336   /* Otherwise, adjust the recursion offset if it's after the start of this
   2337   group. */
   2338 
   2339   if (hc >= cd->hwm)
   2340     {
   2341     offset = GET(ptr, 1);
   2342     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
   2343     }
   2344 
   2345   ptr += 1 + LINK_SIZE;
   2346   }
   2347 }
   2348 
   2349 
   2350 
   2351 /*************************************************
   2352 *        Insert an automatic callout point       *
   2353 *************************************************/
   2354 
   2355 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
   2356 callout points before each pattern item.
   2357 
   2358 Arguments:
   2359   code           current code pointer
   2360   ptr            current pattern pointer
   2361   cd             pointers to tables etc
   2362 
   2363 Returns:         new code pointer
   2364 */
   2365 
   2366 static uschar *
   2367 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
   2368 {
   2369 *code++ = OP_CALLOUT;
   2370 *code++ = 255;
   2371 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
   2372 PUT(code, LINK_SIZE, 0);                       /* Default length */
   2373 return code + 2*LINK_SIZE;
   2374 }
   2375 
   2376 
   2377 
   2378 /*************************************************
   2379 *         Complete a callout item                *
   2380 *************************************************/
   2381 
   2382 /* A callout item contains the length of the next item in the pattern, which
   2383 we can't fill in till after we have reached the relevant point. This is used
   2384 for both automatic and manual callouts.
   2385 
   2386 Arguments:
   2387   previous_callout   points to previous callout item
   2388   ptr                current pattern pointer
   2389   cd                 pointers to tables etc
   2390 
   2391 Returns:             nothing
   2392 */
   2393 
   2394 static void
   2395 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
   2396 {
   2397 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
   2398 PUT(previous_callout, 2 + LINK_SIZE, length);
   2399 }
   2400 
   2401 
   2402 
   2403 #ifdef SUPPORT_UCP
   2404 /*************************************************
   2405 *           Get othercase range                  *
   2406 *************************************************/
   2407 
   2408 /* This function is passed the start and end of a class range, in UTF-8 mode
   2409 with UCP support. It searches up the characters, looking for internal ranges of
   2410 characters in the "other" case. Each call returns the next one, updating the
   2411 start address.
   2412 
   2413 Arguments:
   2414   cptr        points to starting character value; updated
   2415   d           end value
   2416   ocptr       where to put start of othercase range
   2417   odptr       where to put end of othercase range
   2418 
   2419 Yield:        TRUE when range returned; FALSE when no more
   2420 */
   2421 
   2422 static BOOL
   2423 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
   2424   unsigned int *odptr)
   2425 {
   2426 unsigned int c, othercase, next;
   2427 
   2428 for (c = *cptr; c <= d; c++)
   2429   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
   2430 
   2431 if (c > d) return FALSE;
   2432 
   2433 *ocptr = othercase;
   2434 next = othercase + 1;
   2435 
   2436 for (++c; c <= d; c++)
   2437   {
   2438   if (UCD_OTHERCASE(c) != next) break;
   2439   next++;
   2440   }
   2441 
   2442 *odptr = next - 1;
   2443 *cptr = c;
   2444 
   2445 return TRUE;
   2446 }
   2447 
   2448 
   2449 
   2450 /*************************************************
   2451 *        Check a character and a property        *
   2452 *************************************************/
   2453 
   2454 /* This function is called by check_auto_possessive() when a property item
   2455 is adjacent to a fixed character.
   2456 
   2457 Arguments:
   2458   c            the character
   2459   ptype        the property type
   2460   pdata        the data for the type
   2461   negated      TRUE if it's a negated property (\P or \p{^)
   2462 
   2463 Returns:       TRUE if auto-possessifying is OK
   2464 */
   2465 
   2466 static BOOL
   2467 check_char_prop(int c, int ptype, int pdata, BOOL negated)
   2468 {
   2469 const ucd_record *prop = GET_UCD(c);
   2470 switch(ptype)
   2471   {
   2472   case PT_LAMP:
   2473   return (prop->chartype == ucp_Lu ||
   2474           prop->chartype == ucp_Ll ||
   2475           prop->chartype == ucp_Lt) == negated;
   2476 
   2477   case PT_GC:
   2478   return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
   2479 
   2480   case PT_PC:
   2481   return (pdata == prop->chartype) == negated;
   2482 
   2483   case PT_SC:
   2484   return (pdata == prop->script) == negated;
   2485 
   2486   /* These are specials */
   2487 
   2488   case PT_ALNUM:
   2489   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
   2490           _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
   2491 
   2492   case PT_SPACE:    /* Perl space */
   2493   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   2494           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
   2495           == negated;
   2496 
   2497   case PT_PXSPACE:  /* POSIX space */
   2498   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   2499           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
   2500           c == CHAR_FF || c == CHAR_CR)
   2501           == negated;
   2502 
   2503   case PT_WORD:
   2504   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
   2505           _pcre_ucp_gentype[prop->chartype] == ucp_N ||
   2506           c == CHAR_UNDERSCORE) == negated;
   2507   }
   2508 return FALSE;
   2509 }
   2510 #endif  /* SUPPORT_UCP */
   2511 
   2512 
   2513 
   2514 /*************************************************
   2515 *     Check if auto-possessifying is possible    *
   2516 *************************************************/
   2517 
   2518 /* This function is called for unlimited repeats of certain items, to see
   2519 whether the next thing could possibly match the repeated item. If not, it makes
   2520 sense to automatically possessify the repeated item.
   2521 
   2522 Arguments:
   2523   previous      pointer to the repeated opcode
   2524   utf8          TRUE in UTF-8 mode
   2525   ptr           next character in pattern
   2526   options       options bits
   2527   cd            contains pointers to tables etc.
   2528 
   2529 Returns:        TRUE if possessifying is wanted
   2530 */
   2531 
   2532 static BOOL
   2533 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
   2534   int options, compile_data *cd)
   2535 {
   2536 int c, next;
   2537 int op_code = *previous++;
   2538 
   2539 /* Skip whitespace and comments in extended mode */
   2540 
   2541 if ((options & PCRE_EXTENDED) != 0)
   2542   {
   2543   for (;;)
   2544     {
   2545     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
   2546     if (*ptr == CHAR_NUMBER_SIGN)
   2547       {
   2548       ptr++;
   2549       while (*ptr != 0)
   2550         {
   2551         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
   2552         ptr++;
   2553 #ifdef SUPPORT_UTF8
   2554         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
   2555 #endif
   2556         }
   2557       }
   2558     else break;
   2559     }
   2560   }
   2561 
   2562 /* If the next item is one that we can handle, get its value. A non-negative
   2563 value is a character, a negative value is an escape value. */
   2564 
   2565 if (*ptr == CHAR_BACKSLASH)
   2566   {
   2567   int temperrorcode = 0;
   2568   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
   2569   if (temperrorcode != 0) return FALSE;
   2570   ptr++;    /* Point after the escape sequence */
   2571   }
   2572 
   2573 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
   2574   {
   2575 #ifdef SUPPORT_UTF8
   2576   if (utf8) { GETCHARINC(next, ptr); } else
   2577 #endif
   2578   next = *ptr++;
   2579   }
   2580 
   2581 else return FALSE;
   2582 
   2583 /* Skip whitespace and comments in extended mode */
   2584 
   2585 if ((options & PCRE_EXTENDED) != 0)
   2586   {
   2587   for (;;)
   2588     {
   2589     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
   2590     if (*ptr == CHAR_NUMBER_SIGN)
   2591       {
   2592       ptr++;
   2593       while (*ptr != 0)
   2594         {
   2595         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
   2596         ptr++;
   2597 #ifdef SUPPORT_UTF8
   2598         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
   2599 #endif
   2600         }
   2601       }
   2602     else break;
   2603     }
   2604   }
   2605 
   2606 /* If the next thing is itself optional, we have to give up. */
   2607 
   2608 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
   2609   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
   2610     return FALSE;
   2611 
   2612 /* Now compare the next item with the previous opcode. First, handle cases when
   2613 the next item is a character. */
   2614 
   2615 if (next >= 0) switch(op_code)
   2616   {
   2617   case OP_CHAR:
   2618 #ifdef SUPPORT_UTF8
   2619   GETCHARTEST(c, previous);
   2620 #else
   2621   c = *previous;
   2622 #endif
   2623   return c != next;
   2624 
   2625   /* For CHARNC (caseless character) we must check the other case. If we have
   2626   Unicode property support, we can use it to test the other case of
   2627   high-valued characters. */
   2628 
   2629   case OP_CHARNC:
   2630 #ifdef SUPPORT_UTF8
   2631   GETCHARTEST(c, previous);
   2632 #else
   2633   c = *previous;
   2634 #endif
   2635   if (c == next) return FALSE;
   2636 #ifdef SUPPORT_UTF8
   2637   if (utf8)
   2638     {
   2639     unsigned int othercase;
   2640     if (next < 128) othercase = cd->fcc[next]; else
   2641 #ifdef SUPPORT_UCP
   2642     othercase = UCD_OTHERCASE((unsigned int)next);
   2643 #else
   2644     othercase = NOTACHAR;
   2645 #endif
   2646     return (unsigned int)c != othercase;
   2647     }
   2648   else
   2649 #endif  /* SUPPORT_UTF8 */
   2650   return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
   2651 
   2652   /* For OP_NOT, its data is always a single-byte character. */
   2653 
   2654   case OP_NOT:
   2655   if ((c = *previous) == next) return TRUE;
   2656   if ((options & PCRE_CASELESS) == 0) return FALSE;
   2657 #ifdef SUPPORT_UTF8
   2658   if (utf8)
   2659     {
   2660     unsigned int othercase;
   2661     if (next < 128) othercase = cd->fcc[next]; else
   2662 #ifdef SUPPORT_UCP
   2663     othercase = UCD_OTHERCASE(next);
   2664 #else
   2665     othercase = NOTACHAR;
   2666 #endif
   2667     return (unsigned int)c == othercase;
   2668     }
   2669   else
   2670 #endif  /* SUPPORT_UTF8 */
   2671   return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
   2672 
   2673   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
   2674   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
   2675 
   2676   case OP_DIGIT:
   2677   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
   2678 
   2679   case OP_NOT_DIGIT:
   2680   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
   2681 
   2682   case OP_WHITESPACE:
   2683   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
   2684 
   2685   case OP_NOT_WHITESPACE:
   2686   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
   2687 
   2688   case OP_WORDCHAR:
   2689   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
   2690 
   2691   case OP_NOT_WORDCHAR:
   2692   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
   2693 
   2694   case OP_HSPACE:
   2695   case OP_NOT_HSPACE:
   2696   switch(next)
   2697     {
   2698     case 0x09:
   2699     case 0x20:
   2700     case 0xa0:
   2701     case 0x1680:
   2702     case 0x180e:
   2703     case 0x2000:
   2704     case 0x2001:
   2705     case 0x2002:
   2706     case 0x2003:
   2707     case 0x2004:
   2708     case 0x2005:
   2709     case 0x2006:
   2710     case 0x2007:
   2711     case 0x2008:
   2712     case 0x2009:
   2713     case 0x200A:
   2714     case 0x202f:
   2715     case 0x205f:
   2716     case 0x3000:
   2717     return op_code == OP_NOT_HSPACE;
   2718     default:
   2719     return op_code != OP_NOT_HSPACE;
   2720     }
   2721 
   2722   case OP_ANYNL:
   2723   case OP_VSPACE:
   2724   case OP_NOT_VSPACE:
   2725   switch(next)
   2726     {
   2727     case 0x0a:
   2728     case 0x0b:
   2729     case 0x0c:
   2730     case 0x0d:
   2731     case 0x85:
   2732     case 0x2028:
   2733     case 0x2029:
   2734     return op_code == OP_NOT_VSPACE;
   2735     default:
   2736     return op_code != OP_NOT_VSPACE;
   2737     }
   2738 
   2739 #ifdef SUPPORT_UCP
   2740   case OP_PROP:
   2741   return check_char_prop(next, previous[0], previous[1], FALSE);
   2742 
   2743   case OP_NOTPROP:
   2744   return check_char_prop(next, previous[0], previous[1], TRUE);
   2745 #endif
   2746 
   2747   default:
   2748   return FALSE;
   2749   }
   2750 
   2751 
   2752 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
   2753 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
   2754 generated only when PCRE_UCP is *not* set, that is, when only ASCII
   2755 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
   2756 replaced by OP_PROP codes when PCRE_UCP is set. */
   2757 
   2758 switch(op_code)
   2759   {
   2760   case OP_CHAR:
   2761   case OP_CHARNC:
   2762 #ifdef SUPPORT_UTF8
   2763   GETCHARTEST(c, previous);
   2764 #else
   2765   c = *previous;
   2766 #endif
   2767   switch(-next)
   2768     {
   2769     case ESC_d:
   2770     return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
   2771 
   2772     case ESC_D:
   2773     return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
   2774 
   2775     case ESC_s:
   2776     return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
   2777 
   2778     case ESC_S:
   2779     return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
   2780 
   2781     case ESC_w:
   2782     return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
   2783 
   2784     case ESC_W:
   2785     return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
   2786 
   2787     case ESC_h:
   2788     case ESC_H:
   2789     switch(c)
   2790       {
   2791       case 0x09:
   2792       case 0x20:
   2793       case 0xa0:
   2794       case 0x1680:
   2795       case 0x180e:
   2796       case 0x2000:
   2797       case 0x2001:
   2798       case 0x2002:
   2799       case 0x2003:
   2800       case 0x2004:
   2801       case 0x2005:
   2802       case 0x2006:
   2803       case 0x2007:
   2804       case 0x2008:
   2805       case 0x2009:
   2806       case 0x200A:
   2807       case 0x202f:
   2808       case 0x205f:
   2809       case 0x3000:
   2810       return -next != ESC_h;
   2811       default:
   2812       return -next == ESC_h;
   2813       }
   2814 
   2815     case ESC_v:
   2816     case ESC_V:
   2817     switch(c)
   2818       {
   2819       case 0x0a:
   2820       case 0x0b:
   2821       case 0x0c:
   2822       case 0x0d:
   2823       case 0x85:
   2824       case 0x2028:
   2825       case 0x2029:
   2826       return -next != ESC_v;
   2827       default:
   2828       return -next == ESC_v;
   2829       }
   2830 
   2831     /* When PCRE_UCP is set, these values get generated for \d etc. Find
   2832     their substitutions and process them. The result will always be either
   2833     -ESC_p or -ESC_P. Then fall through to process those values. */
   2834 
   2835 #ifdef SUPPORT_UCP
   2836     case ESC_du:
   2837     case ESC_DU:
   2838     case ESC_wu:
   2839     case ESC_WU:
   2840     case ESC_su:
   2841     case ESC_SU:
   2842       {
   2843       int temperrorcode = 0;
   2844       ptr = substitutes[-next - ESC_DU];
   2845       next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
   2846       if (temperrorcode != 0) return FALSE;
   2847       ptr++;    /* For compatibility */
   2848       }
   2849     /* Fall through */
   2850 
   2851     case ESC_p:
   2852     case ESC_P:
   2853       {
   2854       int ptype, pdata, errorcodeptr;
   2855       BOOL negated;
   2856 
   2857       ptr--;      /* Make ptr point at the p or P */
   2858       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
   2859       if (ptype < 0) return FALSE;
   2860       ptr++;      /* Point past the final curly ket */
   2861 
   2862       /* If the property item is optional, we have to give up. (When generated
   2863       from \d etc by PCRE_UCP, this test will have been applied much earlier,
   2864       to the original \d etc. At this point, ptr will point to a zero byte. */
   2865 
   2866       if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
   2867         strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
   2868           return FALSE;
   2869 
   2870       /* Do the property check. */
   2871 
   2872       return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
   2873       }
   2874 #endif
   2875 
   2876     default:
   2877     return FALSE;
   2878     }
   2879 
   2880   /* In principle, support for Unicode properties should be integrated here as
   2881   well. It means re-organizing the above code so as to get hold of the property
   2882   values before switching on the op-code. However, I wonder how many patterns
   2883   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
   2884   these op-codes are never generated.) */
   2885 
   2886   case OP_DIGIT:
   2887   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
   2888          next == -ESC_h || next == -ESC_v || next == -ESC_R;
   2889 
   2890   case OP_NOT_DIGIT:
   2891   return next == -ESC_d;
   2892 
   2893   case OP_WHITESPACE:
   2894   return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
   2895 
   2896   case OP_NOT_WHITESPACE:
   2897   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
   2898 
   2899   case OP_HSPACE:
   2900   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
   2901          next == -ESC_w || next == -ESC_v || next == -ESC_R;
   2902 
   2903   case OP_NOT_HSPACE:
   2904   return next == -ESC_h;
   2905 
   2906   /* Can't have \S in here because VT matches \S (Perl anomaly) */
   2907   case OP_ANYNL:
   2908   case OP_VSPACE:
   2909   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
   2910 
   2911   case OP_NOT_VSPACE:
   2912   return next == -ESC_v || next == -ESC_R;
   2913 
   2914   case OP_WORDCHAR:
   2915   return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
   2916          next == -ESC_v || next == -ESC_R;
   2917 
   2918   case OP_NOT_WORDCHAR:
   2919   return next == -ESC_w || next == -ESC_d;
   2920 
   2921   default:
   2922   return FALSE;
   2923   }
   2924 
   2925 /* Control does not reach here */
   2926 }
   2927 
   2928 
   2929 
   2930 /*************************************************
   2931 *           Compile one branch                   *
   2932 *************************************************/
   2933 
   2934 /* Scan the pattern, compiling it into a vector. If the options are
   2935 changed during the branch, the pointer is used to change the external options
   2936 bits. This function is used during the pre-compile phase when we are trying
   2937 to find out the amount of memory needed, as well as during the real compile
   2938 phase. The value of lengthptr distinguishes the two phases.
   2939 
   2940 Arguments:
   2941   optionsptr     pointer to the option bits
   2942   codeptr        points to the pointer to the current code point
   2943   ptrptr         points to the current pattern pointer
   2944   errorcodeptr   points to error code variable
   2945   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
   2946   reqbyteptr     set to the last literal character required, else < 0
   2947   bcptr          points to current branch chain
   2948   cd             contains pointers to tables etc.
   2949   lengthptr      NULL during the real compile phase
   2950                  points to length accumulator during pre-compile phase
   2951 
   2952 Returns:         TRUE on success
   2953                  FALSE, with *errorcodeptr set non-zero on error
   2954 */
   2955 
   2956 static BOOL
   2957 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
   2958   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
   2959   compile_data *cd, int *lengthptr)
   2960 {
   2961 int repeat_type, op_type;
   2962 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
   2963 int bravalue = 0;
   2964 int greedy_default, greedy_non_default;
   2965 int firstbyte, reqbyte;
   2966 int zeroreqbyte, zerofirstbyte;
   2967 int req_caseopt, reqvary, tempreqvary;
   2968 int options = *optionsptr;
   2969 int after_manual_callout = 0;
   2970 int length_prevgroup = 0;
   2971 register int c;
   2972 register uschar *code = *codeptr;
   2973 uschar *last_code = code;
   2974 uschar *orig_code = code;
   2975 uschar *tempcode;
   2976 BOOL inescq = FALSE;
   2977 BOOL groupsetfirstbyte = FALSE;
   2978 const uschar *ptr = *ptrptr;
   2979 const uschar *tempptr;
   2980 const uschar *nestptr = NULL;
   2981 uschar *previous = NULL;
   2982 uschar *previous_callout = NULL;
   2983 uschar *save_hwm = NULL;
   2984 uschar classbits[32];
   2985 
   2986 #ifdef SUPPORT_UTF8
   2987 BOOL class_utf8;
   2988 BOOL utf8 = (options & PCRE_UTF8) != 0;
   2989 uschar *class_utf8data;
   2990 uschar *class_utf8data_base;
   2991 uschar utf8_char[6];
   2992 #else
   2993 BOOL utf8 = FALSE;
   2994 uschar *utf8_char = NULL;
   2995 #endif
   2996 
   2997 #ifdef PCRE_DEBUG
   2998 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
   2999 #endif
   3000 
   3001 /* Set up the default and non-default settings for greediness */
   3002 
   3003 greedy_default = ((options & PCRE_UNGREEDY) != 0);
   3004 greedy_non_default = greedy_default ^ 1;
   3005 
   3006 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
   3007 matching encountered yet". It gets changed to REQ_NONE if we hit something that
   3008 matches a non-fixed char first char; reqbyte just remains unset if we never
   3009 find one.
   3010 
   3011 When we hit a repeat whose minimum is zero, we may have to adjust these values
   3012 to take the zero repeat into account. This is implemented by setting them to
   3013 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
   3014 item types that can be repeated set these backoff variables appropriately. */
   3015 
   3016 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
   3017 
   3018 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
   3019 according to the current setting of the caseless flag. REQ_CASELESS is a bit
   3020 value > 255. It is added into the firstbyte or reqbyte variables to record the
   3021 case status of the value. This is used only for ASCII characters. */
   3022 
   3023 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
   3024 
   3025 /* Switch on next character until the end of the branch */
   3026 
   3027 for (;; ptr++)
   3028   {
   3029   BOOL negate_class;
   3030   BOOL should_flip_negation;
   3031   BOOL possessive_quantifier;
   3032   BOOL is_quantifier;
   3033   BOOL is_recurse;
   3034   BOOL reset_bracount;
   3035   int class_charcount;
   3036   int class_lastchar;
   3037   int newoptions;
   3038   int recno;
   3039   int refsign;
   3040   int skipbytes;
   3041   int subreqbyte;
   3042   int subfirstbyte;
   3043   int terminator;
   3044   int mclength;
   3045   uschar mcbuffer[8];
   3046 
   3047   /* Get next byte in the pattern */
   3048 
   3049   c = *ptr;
   3050 
   3051   /* If we are at the end of a nested substitution, revert to the outer level
   3052   string. Nesting only happens one level deep. */
   3053 
   3054   if (c == 0 && nestptr != NULL)
   3055     {
   3056     ptr = nestptr;
   3057     nestptr = NULL;
   3058     c = *ptr;
   3059     }
   3060 
   3061   /* If we are in the pre-compile phase, accumulate the length used for the
   3062   previous cycle of this loop. */
   3063 
   3064   if (lengthptr != NULL)
   3065     {
   3066 #ifdef PCRE_DEBUG
   3067     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
   3068 #endif
   3069     if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
   3070       {
   3071       *errorcodeptr = ERR52;
   3072       goto FAILED;
   3073       }
   3074 
   3075     /* There is at least one situation where code goes backwards: this is the
   3076     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
   3077     the class is simply eliminated. However, it is created first, so we have to
   3078     allow memory for it. Therefore, don't ever reduce the length at this point.
   3079     */
   3080 
   3081     if (code < last_code) code = last_code;
   3082 
   3083     /* Paranoid check for integer overflow */
   3084 
   3085     if (OFLOW_MAX - *lengthptr < code - last_code)
   3086       {
   3087       *errorcodeptr = ERR20;
   3088       goto FAILED;
   3089       }
   3090 
   3091     *lengthptr += (int)(code - last_code);
   3092     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
   3093 
   3094     /* If "previous" is set and it is not at the start of the work space, move
   3095     it back to there, in order to avoid filling up the work space. Otherwise,
   3096     if "previous" is NULL, reset the current code pointer to the start. */
   3097 
   3098     if (previous != NULL)
   3099       {
   3100       if (previous > orig_code)
   3101         {
   3102         memmove(orig_code, previous, code - previous);
   3103         code -= previous - orig_code;
   3104         previous = orig_code;
   3105         }
   3106       }
   3107     else code = orig_code;
   3108 
   3109     /* Remember where this code item starts so we can pick up the length
   3110     next time round. */
   3111 
   3112     last_code = code;
   3113     }
   3114 
   3115   /* In the real compile phase, just check the workspace used by the forward
   3116   reference list. */
   3117 
   3118   else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
   3119     {
   3120     *errorcodeptr = ERR52;
   3121     goto FAILED;
   3122     }
   3123 
   3124   /* If in \Q...\E, check for the end; if not, we have a literal */
   3125 
   3126   if (inescq && c != 0)
   3127     {
   3128     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   3129       {
   3130       inescq = FALSE;
   3131       ptr++;
   3132       continue;
   3133       }
   3134     else
   3135       {
   3136       if (previous_callout != NULL)
   3137         {
   3138         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
   3139           complete_callout(previous_callout, ptr, cd);
   3140         previous_callout = NULL;
   3141         }
   3142       if ((options & PCRE_AUTO_CALLOUT) != 0)
   3143         {
   3144         previous_callout = code;
   3145         code = auto_callout(code, ptr, cd);
   3146         }
   3147       goto NORMAL_CHAR;
   3148       }
   3149     }
   3150 
   3151   /* Fill in length of a previous callout, except when the next thing is
   3152   a quantifier. */
   3153 
   3154   is_quantifier =
   3155     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
   3156     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
   3157 
   3158   if (!is_quantifier && previous_callout != NULL &&
   3159        after_manual_callout-- <= 0)
   3160     {
   3161     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
   3162       complete_callout(previous_callout, ptr, cd);
   3163     previous_callout = NULL;
   3164     }
   3165 
   3166   /* In extended mode, skip white space and comments */
   3167 
   3168   if ((options & PCRE_EXTENDED) != 0)
   3169     {
   3170     if ((cd->ctypes[c] & ctype_space) != 0) continue;
   3171     if (c == CHAR_NUMBER_SIGN)
   3172       {
   3173       ptr++;
   3174       while (*ptr != 0)
   3175         {
   3176         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
   3177         ptr++;
   3178 #ifdef SUPPORT_UTF8
   3179         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
   3180 #endif
   3181         }
   3182       if (*ptr != 0) continue;
   3183 
   3184       /* Else fall through to handle end of string */
   3185       c = 0;
   3186       }
   3187     }
   3188 
   3189   /* No auto callout for quantifiers. */
   3190 
   3191   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
   3192     {
   3193     previous_callout = code;
   3194     code = auto_callout(code, ptr, cd);
   3195     }
   3196 
   3197   switch(c)
   3198     {
   3199     /* ===================================================================*/
   3200     case 0:                        /* The branch terminates at string end */
   3201     case CHAR_VERTICAL_LINE:       /* or | or ) */
   3202     case CHAR_RIGHT_PARENTHESIS:
   3203     *firstbyteptr = firstbyte;
   3204     *reqbyteptr = reqbyte;
   3205     *codeptr = code;
   3206     *ptrptr = ptr;
   3207     if (lengthptr != NULL)
   3208       {
   3209       if (OFLOW_MAX - *lengthptr < code - last_code)
   3210         {
   3211         *errorcodeptr = ERR20;
   3212         goto FAILED;
   3213         }
   3214       *lengthptr += (int)(code - last_code);   /* To include callout length */
   3215       DPRINTF((">> end branch\n"));
   3216       }
   3217     return TRUE;
   3218 
   3219 
   3220     /* ===================================================================*/
   3221     /* Handle single-character metacharacters. In multiline mode, ^ disables
   3222     the setting of any following char as a first character. */
   3223 
   3224     case CHAR_CIRCUMFLEX_ACCENT:
   3225     if ((options & PCRE_MULTILINE) != 0)
   3226       {
   3227       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
   3228       }
   3229     previous = NULL;
   3230     *code++ = OP_CIRC;
   3231     break;
   3232 
   3233     case CHAR_DOLLAR_SIGN:
   3234     previous = NULL;
   3235     *code++ = OP_DOLL;
   3236     break;
   3237 
   3238     /* There can never be a first char if '.' is first, whatever happens about
   3239     repeats. The value of reqbyte doesn't change either. */
   3240 
   3241     case CHAR_DOT:
   3242     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
   3243     zerofirstbyte = firstbyte;
   3244     zeroreqbyte = reqbyte;
   3245     previous = code;
   3246     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
   3247     break;
   3248 
   3249 
   3250     /* ===================================================================*/
   3251     /* Character classes. If the included characters are all < 256, we build a
   3252     32-byte bitmap of the permitted characters, except in the special case
   3253     where there is only one such character. For negated classes, we build the
   3254     map as usual, then invert it at the end. However, we use a different opcode
   3255     so that data characters > 255 can be handled correctly.
   3256 
   3257     If the class contains characters outside the 0-255 range, a different
   3258     opcode is compiled. It may optionally have a bit map for characters < 256,
   3259     but those above are are explicitly listed afterwards. A flag byte tells
   3260     whether the bitmap is present, and whether this is a negated class or not.
   3261 
   3262     In JavaScript compatibility mode, an isolated ']' causes an error. In
   3263     default (Perl) mode, it is treated as a data character. */
   3264 
   3265     case CHAR_RIGHT_SQUARE_BRACKET:
   3266     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
   3267       {
   3268       *errorcodeptr = ERR64;
   3269       goto FAILED;
   3270       }
   3271     goto NORMAL_CHAR;
   3272 
   3273     case CHAR_LEFT_SQUARE_BRACKET:
   3274     previous = code;
   3275 
   3276     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
   3277     they are encountered at the top level, so we'll do that too. */
   3278 
   3279     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   3280          ptr[1] == CHAR_EQUALS_SIGN) &&
   3281         check_posix_syntax(ptr, &tempptr))
   3282       {
   3283       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
   3284       goto FAILED;
   3285       }
   3286 
   3287     /* If the first character is '^', set the negation flag and skip it. Also,
   3288     if the first few characters (either before or after ^) are \Q\E or \E we
   3289     skip them too. This makes for compatibility with Perl. */
   3290 
   3291     negate_class = FALSE;
   3292     for (;;)
   3293       {
   3294       c = *(++ptr);
   3295       if (c == CHAR_BACKSLASH)
   3296         {
   3297         if (ptr[1] == CHAR_E)
   3298           ptr++;
   3299         else if (strncmp((const char *)ptr+1,
   3300                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
   3301           ptr += 3;
   3302         else
   3303           break;
   3304         }
   3305       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
   3306         negate_class = TRUE;
   3307       else break;
   3308       }
   3309 
   3310     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
   3311     an initial ']' is taken as a data character -- the code below handles
   3312     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
   3313     [^] must match any character, so generate OP_ALLANY. */
   3314 
   3315     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
   3316         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
   3317       {
   3318       *code++ = negate_class? OP_ALLANY : OP_FAIL;
   3319       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
   3320       zerofirstbyte = firstbyte;
   3321       break;
   3322       }
   3323 
   3324     /* If a class contains a negative special such as \S, we need to flip the
   3325     negation flag at the end, so that support for characters > 255 works
   3326     correctly (they are all included in the class). */
   3327 
   3328     should_flip_negation = FALSE;
   3329 
   3330     /* Keep a count of chars with values < 256 so that we can optimize the case
   3331     of just a single character (as long as it's < 256). However, for higher
   3332     valued UTF-8 characters, we don't yet do any optimization. */
   3333 
   3334     class_charcount = 0;
   3335     class_lastchar = -1;
   3336 
   3337     /* Initialize the 32-char bit map to all zeros. We build the map in a
   3338     temporary bit of memory, in case the class contains only 1 character (less
   3339     than 256), because in that case the compiled code doesn't use the bit map.
   3340     */
   3341 
   3342     memset(classbits, 0, 32 * sizeof(uschar));
   3343 
   3344 #ifdef SUPPORT_UTF8
   3345     class_utf8 = FALSE;                       /* No chars >= 256 */
   3346     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
   3347     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
   3348 #endif
   3349 
   3350     /* Process characters until ] is reached. By writing this as a "do" it
   3351     means that an initial ] is taken as a data character. At the start of the
   3352     loop, c contains the first byte of the character. */
   3353 
   3354     if (c != 0) do
   3355       {
   3356       const uschar *oldptr;
   3357 
   3358 #ifdef SUPPORT_UTF8
   3359       if (utf8 && c > 127)
   3360         {                           /* Braces are required because the */
   3361         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
   3362         }
   3363 
   3364       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
   3365       data and reset the pointer. This is so that very large classes that
   3366       contain a zillion UTF-8 characters no longer overwrite the work space
   3367       (which is on the stack). */
   3368 
   3369       if (lengthptr != NULL)
   3370         {
   3371         *lengthptr += class_utf8data - class_utf8data_base;
   3372         class_utf8data = class_utf8data_base;
   3373         }
   3374 
   3375 #endif
   3376 
   3377       /* Inside \Q...\E everything is literal except \E */
   3378 
   3379       if (inescq)
   3380         {
   3381         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
   3382           {
   3383           inescq = FALSE;                   /* Reset literal state */
   3384           ptr++;                            /* Skip the 'E' */
   3385           continue;                         /* Carry on with next */
   3386           }
   3387         goto CHECK_RANGE;                   /* Could be range if \E follows */
   3388         }
   3389 
   3390       /* Handle POSIX class names. Perl allows a negation extension of the
   3391       form [:^name:]. A square bracket that doesn't match the syntax is
   3392       treated as a literal. We also recognize the POSIX constructions
   3393       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
   3394       5.6 and 5.8 do. */
   3395 
   3396       if (c == CHAR_LEFT_SQUARE_BRACKET &&
   3397           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
   3398            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
   3399         {
   3400         BOOL local_negate = FALSE;
   3401         int posix_class, taboffset, tabopt;
   3402         register const uschar *cbits = cd->cbits;
   3403         uschar pbits[32];
   3404 
   3405         if (ptr[1] != CHAR_COLON)
   3406           {
   3407           *errorcodeptr = ERR31;
   3408           goto FAILED;
   3409           }
   3410 
   3411         ptr += 2;
   3412         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
   3413           {
   3414           local_negate = TRUE;
   3415           should_flip_negation = TRUE;  /* Note negative special */
   3416           ptr++;
   3417           }
   3418 
   3419         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
   3420         if (posix_class < 0)
   3421           {
   3422           *errorcodeptr = ERR30;
   3423           goto FAILED;
   3424           }
   3425 
   3426         /* If matching is caseless, upper and lower are converted to
   3427         alpha. This relies on the fact that the class table starts with
   3428         alpha, lower, upper as the first 3 entries. */
   3429 
   3430         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
   3431           posix_class = 0;
   3432 
   3433         /* When PCRE_UCP is set, some of the POSIX classes are converted to
   3434         different escape sequences that use Unicode properties. */
   3435 
   3436 #ifdef SUPPORT_UCP
   3437         if ((options & PCRE_UCP) != 0)
   3438           {
   3439           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
   3440           if (posix_substitutes[pc] != NULL)
   3441             {
   3442             nestptr = tempptr + 1;
   3443             ptr = posix_substitutes[pc] - 1;
   3444             continue;
   3445             }
   3446           }
   3447 #endif
   3448         /* In the non-UCP case, we build the bit map for the POSIX class in a
   3449         chunk of local store because we may be adding and subtracting from it,
   3450         and we don't want to subtract bits that may be in the main map already.
   3451         At the end we or the result into the bit map that is being built. */
   3452 
   3453         posix_class *= 3;
   3454 
   3455         /* Copy in the first table (always present) */
   3456 
   3457         memcpy(pbits, cbits + posix_class_maps[posix_class],
   3458           32 * sizeof(uschar));
   3459 
   3460         /* If there is a second table, add or remove it as required. */
   3461 
   3462         taboffset = posix_class_maps[posix_class + 1];
   3463         tabopt = posix_class_maps[posix_class + 2];
   3464 
   3465         if (taboffset >= 0)
   3466           {
   3467           if (tabopt >= 0)
   3468             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
   3469           else
   3470             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
   3471           }
   3472 
   3473         /* Now see if we need to remove any special characters. An option
   3474         value of 1 removes vertical space and 2 removes underscore. */
   3475 
   3476         if (tabopt < 0) tabopt = -tabopt;
   3477         if (tabopt == 1) pbits[1] &= ~0x3c;
   3478           else if (tabopt == 2) pbits[11] &= 0x7f;
   3479 
   3480         /* Add the POSIX table or its complement into the main table that is
   3481         being built and we are done. */
   3482 
   3483         if (local_negate)
   3484           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
   3485         else
   3486           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
   3487 
   3488         ptr = tempptr + 1;
   3489         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
   3490         continue;    /* End of POSIX syntax handling */
   3491         }
   3492 
   3493       /* Backslash may introduce a single character, or it may introduce one
   3494       of the specials, which just set a flag. The sequence \b is a special
   3495       case. Inside a class (and only there) it is treated as backspace. We
   3496       assume that other escapes have more than one character in them, so set
   3497       class_charcount bigger than one. Unrecognized escapes fall through and
   3498       are either treated as literal characters (by default), or are faulted if
   3499       PCRE_EXTRA is set. */
   3500 
   3501       if (c == CHAR_BACKSLASH)
   3502         {
   3503         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
   3504         if (*errorcodeptr != 0) goto FAILED;
   3505 
   3506         if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
   3507         else if (-c == ESC_Q)            /* Handle start of quoted string */
   3508           {
   3509           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   3510             {
   3511             ptr += 2; /* avoid empty string */
   3512             }
   3513           else inescq = TRUE;
   3514           continue;
   3515           }
   3516         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
   3517 
   3518         if (c < 0)
   3519           {
   3520           register const uschar *cbits = cd->cbits;
   3521           class_charcount += 2;     /* Greater than 1 is what matters */
   3522 
   3523           switch (-c)
   3524             {
   3525 #ifdef SUPPORT_UCP
   3526             case ESC_du:     /* These are the values given for \d etc */
   3527             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
   3528             case ESC_wu:     /* escape sequence with an appropriate \p */
   3529             case ESC_WU:     /* or \P to test Unicode properties instead */
   3530             case ESC_su:     /* of the default ASCII testing. */
   3531             case ESC_SU:
   3532             nestptr = ptr;
   3533             ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
   3534             class_charcount -= 2;                /* Undo! */
   3535             continue;
   3536 #endif
   3537             case ESC_d:
   3538             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
   3539             continue;
   3540 
   3541             case ESC_D:
   3542             should_flip_negation = TRUE;
   3543             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
   3544             continue;
   3545 
   3546             case ESC_w:
   3547             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
   3548             continue;
   3549 
   3550             case ESC_W:
   3551             should_flip_negation = TRUE;
   3552             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
   3553             continue;
   3554 
   3555             /* Perl 5.004 onwards omits VT from \s, but we must preserve it
   3556             if it was previously set by something earlier in the character
   3557             class. */
   3558 
   3559             case ESC_s:
   3560             classbits[0] |= cbits[cbit_space];
   3561             classbits[1] |= cbits[cbit_space+1] & ~0x08;
   3562             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
   3563             continue;
   3564 
   3565             case ESC_S:
   3566             should_flip_negation = TRUE;
   3567             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
   3568             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
   3569             continue;
   3570 
   3571             case ESC_h:
   3572             SETBIT(classbits, 0x09); /* VT */
   3573             SETBIT(classbits, 0x20); /* SPACE */
   3574             SETBIT(classbits, 0xa0); /* NSBP */
   3575 #ifdef SUPPORT_UTF8
   3576             if (utf8)
   3577               {
   3578               class_utf8 = TRUE;
   3579               *class_utf8data++ = XCL_SINGLE;
   3580               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
   3581               *class_utf8data++ = XCL_SINGLE;
   3582               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
   3583               *class_utf8data++ = XCL_RANGE;
   3584               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
   3585               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
   3586               *class_utf8data++ = XCL_SINGLE;
   3587               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
   3588               *class_utf8data++ = XCL_SINGLE;
   3589               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
   3590               *class_utf8data++ = XCL_SINGLE;
   3591               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
   3592               }
   3593 #endif
   3594             continue;
   3595 
   3596             case ESC_H:
   3597             for (c = 0; c < 32; c++)
   3598               {
   3599               int x = 0xff;
   3600               switch (c)
   3601                 {
   3602                 case 0x09/8: x ^= 1 << (0x09%8); break;
   3603                 case 0x20/8: x ^= 1 << (0x20%8); break;
   3604                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
   3605                 default: break;
   3606                 }
   3607               classbits[c] |= x;
   3608               }
   3609 
   3610 #ifdef SUPPORT_UTF8
   3611             if (utf8)
   3612               {
   3613               class_utf8 = TRUE;
   3614               *class_utf8data++ = XCL_RANGE;
   3615               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
   3616               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
   3617               *class_utf8data++ = XCL_RANGE;
   3618               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
   3619               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
   3620               *class_utf8data++ = XCL_RANGE;
   3621               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
   3622               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
   3623               *class_utf8data++ = XCL_RANGE;
   3624               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
   3625               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
   3626               *class_utf8data++ = XCL_RANGE;
   3627               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
   3628               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
   3629               *class_utf8data++ = XCL_RANGE;
   3630               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
   3631               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
   3632               *class_utf8data++ = XCL_RANGE;
   3633               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
   3634               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
   3635               }
   3636 #endif
   3637             continue;
   3638 
   3639             case ESC_v:
   3640             SETBIT(classbits, 0x0a); /* LF */
   3641             SETBIT(classbits, 0x0b); /* VT */
   3642             SETBIT(classbits, 0x0c); /* FF */
   3643             SETBIT(classbits, 0x0d); /* CR */
   3644             SETBIT(classbits, 0x85); /* NEL */
   3645 #ifdef SUPPORT_UTF8
   3646             if (utf8)
   3647               {
   3648               class_utf8 = TRUE;
   3649               *class_utf8data++ = XCL_RANGE;
   3650               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
   3651               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
   3652               }
   3653 #endif
   3654             continue;
   3655 
   3656             case ESC_V:
   3657             for (c = 0; c < 32; c++)
   3658               {
   3659               int x = 0xff;
   3660               switch (c)
   3661                 {
   3662                 case 0x0a/8: x ^= 1 << (0x0a%8);
   3663                              x ^= 1 << (0x0b%8);
   3664                              x ^= 1 << (0x0c%8);
   3665                              x ^= 1 << (0x0d%8);
   3666                              break;
   3667                 case 0x85/8: x ^= 1 << (0x85%8); break;
   3668                 default: break;
   3669                 }
   3670               classbits[c] |= x;
   3671               }
   3672 
   3673 #ifdef SUPPORT_UTF8
   3674             if (utf8)
   3675               {
   3676               class_utf8 = TRUE;
   3677               *class_utf8data++ = XCL_RANGE;
   3678               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
   3679               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
   3680               *class_utf8data++ = XCL_RANGE;
   3681               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
   3682               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
   3683               }
   3684 #endif
   3685             continue;
   3686 
   3687 #ifdef SUPPORT_UCP
   3688             case ESC_p:
   3689             case ESC_P:
   3690               {
   3691               BOOL negated;
   3692               int pdata;
   3693               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
   3694               if (ptype < 0) goto FAILED;
   3695               class_utf8 = TRUE;
   3696               *class_utf8data++ = ((-c == ESC_p) != negated)?
   3697                 XCL_PROP : XCL_NOTPROP;
   3698               *class_utf8data++ = ptype;
   3699               *class_utf8data++ = pdata;
   3700               class_charcount -= 2;   /* Not a < 256 character */
   3701               continue;
   3702               }
   3703 #endif
   3704             /* Unrecognized escapes are faulted if PCRE is running in its
   3705             strict mode. By default, for compatibility with Perl, they are
   3706             treated as literals. */
   3707 
   3708             default:
   3709             if ((options & PCRE_EXTRA) != 0)
   3710               {
   3711               *errorcodeptr = ERR7;
   3712               goto FAILED;
   3713               }
   3714             class_charcount -= 2;  /* Undo the default count from above */
   3715             c = *ptr;              /* Get the final character and fall through */
   3716             break;
   3717             }
   3718           }
   3719 
   3720         /* Fall through if we have a single character (c >= 0). This may be
   3721         greater than 256 in UTF-8 mode. */
   3722 
   3723         }   /* End of backslash handling */
   3724 
   3725       /* A single character may be followed by '-' to form a range. However,
   3726       Perl does not permit ']' to be the end of the range. A '-' character
   3727       at the end is treated as a literal. Perl ignores orphaned \E sequences
   3728       entirely. The code for handling \Q and \E is messy. */
   3729 
   3730       CHECK_RANGE:
   3731       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
   3732         {
   3733         inescq = FALSE;
   3734         ptr += 2;
   3735         }
   3736 
   3737       oldptr = ptr;
   3738 
   3739       /* Remember \r or \n */
   3740 
   3741       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
   3742 
   3743       /* Check for range */
   3744 
   3745       if (!inescq && ptr[1] == CHAR_MINUS)
   3746         {
   3747         int d;
   3748         ptr += 2;
   3749         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
   3750 
   3751         /* If we hit \Q (not followed by \E) at this point, go into escaped
   3752         mode. */
   3753 
   3754         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
   3755           {
   3756           ptr += 2;
   3757           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
   3758             { ptr += 2; continue; }
   3759           inescq = TRUE;
   3760           break;
   3761           }
   3762 
   3763         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
   3764           {
   3765           ptr = oldptr;
   3766           goto LONE_SINGLE_CHARACTER;
   3767           }
   3768 
   3769 #ifdef SUPPORT_UTF8
   3770         if (utf8)
   3771           {                           /* Braces are required because the */
   3772           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
   3773           }
   3774         else
   3775 #endif
   3776         d = *ptr;  /* Not UTF-8 mode */
   3777 
   3778         /* The second part of a range can be a single-character escape, but
   3779         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
   3780         in such circumstances. */
   3781 
   3782         if (!inescq && d == CHAR_BACKSLASH)
   3783           {
   3784           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
   3785           if (*errorcodeptr != 0) goto FAILED;
   3786 
   3787           /* \b is backspace; any other special means the '-' was literal */
   3788 
   3789           if (d < 0)
   3790             {
   3791             if (d == -ESC_b) d = CHAR_BS; else
   3792               {
   3793               ptr = oldptr;
   3794               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
   3795               }
   3796             }
   3797           }
   3798 
   3799         /* Check that the two values are in the correct order. Optimize
   3800         one-character ranges */
   3801 
   3802         if (d < c)
   3803           {
   3804           *errorcodeptr = ERR8;
   3805           goto FAILED;
   3806           }
   3807 
   3808         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
   3809 
   3810         /* Remember \r or \n */
   3811 
   3812         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
   3813 
   3814         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
   3815         matching, we have to use an XCLASS with extra data items. Caseless
   3816         matching for characters > 127 is available only if UCP support is
   3817         available. */
   3818 
   3819 #ifdef SUPPORT_UTF8
   3820         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
   3821           {
   3822           class_utf8 = TRUE;
   3823 
   3824           /* With UCP support, we can find the other case equivalents of
   3825           the relevant characters. There may be several ranges. Optimize how
   3826           they fit with the basic range. */
   3827 
   3828 #ifdef SUPPORT_UCP
   3829           if ((options & PCRE_CASELESS) != 0)
   3830             {
   3831             unsigned int occ, ocd;
   3832             unsigned int cc = c;
   3833             unsigned int origd = d;
   3834             while (get_othercase_range(&cc, origd, &occ, &ocd))
   3835               {
   3836               if (occ >= (unsigned int)c &&
   3837                   ocd <= (unsigned int)d)
   3838                 continue;                          /* Skip embedded ranges */
   3839 
   3840               if (occ < (unsigned int)c  &&
   3841                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
   3842                 {                                  /* if there is overlap,   */
   3843                 c = occ;                           /* noting that if occ < c */
   3844                 continue;                          /* we can't have ocd > d  */
   3845                 }                                  /* because a subrange is  */
   3846               if (ocd > (unsigned int)d &&
   3847                   occ <= (unsigned int)d + 1)      /* always shorter than    */
   3848                 {                                  /* the basic range.       */
   3849                 d = ocd;
   3850                 continue;
   3851                 }
   3852 
   3853               if (occ == ocd)
   3854                 {
   3855                 *class_utf8data++ = XCL_SINGLE;
   3856                 }
   3857               else
   3858                 {
   3859                 *class_utf8data++ = XCL_RANGE;
   3860                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
   3861                 }
   3862               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
   3863               }
   3864             }
   3865 #endif  /* SUPPORT_UCP */
   3866 
   3867           /* Now record the original range, possibly modified for UCP caseless
   3868           overlapping ranges. */
   3869 
   3870           *class_utf8data++ = XCL_RANGE;
   3871           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
   3872           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
   3873 
   3874           /* With UCP support, we are done. Without UCP support, there is no
   3875           caseless matching for UTF-8 characters > 127; we can use the bit map
   3876           for the smaller ones. */
   3877 
   3878 #ifdef SUPPORT_UCP
   3879           continue;    /* With next character in the class */
   3880 #else
   3881           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
   3882 
   3883           /* Adjust upper limit and fall through to set up the map */
   3884 
   3885           d = 127;
   3886 
   3887 #endif  /* SUPPORT_UCP */
   3888           }
   3889 #endif  /* SUPPORT_UTF8 */
   3890 
   3891         /* We use the bit map for all cases when not in UTF-8 mode; else
   3892         ranges that lie entirely within 0-127 when there is UCP support; else
   3893         for partial ranges without UCP support. */
   3894 
   3895         class_charcount += d - c + 1;
   3896         class_lastchar = d;
   3897 
   3898         /* We can save a bit of time by skipping this in the pre-compile. */
   3899 
   3900         if (lengthptr == NULL) for (; c <= d; c++)
   3901           {
   3902           classbits[c/8] |= (1 << (c&7));
   3903           if ((options & PCRE_CASELESS) != 0)
   3904             {
   3905             int uc = cd->fcc[c];           /* flip case */
   3906             classbits[uc/8] |= (1 << (uc&7));
   3907             }
   3908           }
   3909 
   3910         continue;   /* Go get the next char in the class */
   3911         }
   3912 
   3913       /* Handle a lone single character - we can get here for a normal
   3914       non-escape char, or after \ that introduces a single character or for an
   3915       apparent range that isn't. */
   3916 
   3917       LONE_SINGLE_CHARACTER:
   3918 
   3919       /* Handle a character that cannot go in the bit map */
   3920 
   3921 #ifdef SUPPORT_UTF8
   3922       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
   3923         {
   3924         class_utf8 = TRUE;
   3925         *class_utf8data++ = XCL_SINGLE;
   3926         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
   3927 
   3928 #ifdef SUPPORT_UCP
   3929         if ((options & PCRE_CASELESS) != 0)
   3930           {
   3931           unsigned int othercase;
   3932           if ((othercase = UCD_OTHERCASE(c)) != c)
   3933             {
   3934             *class_utf8data++ = XCL_SINGLE;
   3935             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
   3936             }
   3937           }
   3938 #endif  /* SUPPORT_UCP */
   3939 
   3940         }
   3941       else
   3942 #endif  /* SUPPORT_UTF8 */
   3943 
   3944       /* Handle a single-byte character */
   3945         {
   3946         classbits[c/8] |= (1 << (c&7));
   3947         if ((options & PCRE_CASELESS) != 0)
   3948           {
   3949           c = cd->fcc[c];   /* flip case */
   3950           classbits[c/8] |= (1 << (c&7));
   3951           }
   3952         class_charcount++;
   3953         class_lastchar = c;
   3954         }
   3955       }
   3956 
   3957     /* Loop until ']' reached. This "while" is the end of the "do" far above.
   3958     If we are at the end of an internal nested string, revert to the outer
   3959     string. */
   3960 
   3961     while (((c = *(++ptr)) != 0 ||
   3962            (nestptr != NULL &&
   3963              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
   3964            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
   3965 
   3966     /* Check for missing terminating ']' */
   3967 
   3968     if (c == 0)
   3969       {
   3970       *errorcodeptr = ERR6;
   3971       goto FAILED;
   3972       }
   3973 
   3974     /* If class_charcount is 1, we saw precisely one character whose value is
   3975     less than 256. As long as there were no characters >= 128 and there was no
   3976     use of \p or \P, in other words, no use of any XCLASS features, we can
   3977     optimize.
   3978 
   3979     In UTF-8 mode, we can optimize the negative case only if there were no
   3980     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
   3981     operate on single-bytes only. This is an historical hangover. Maybe one day
   3982     we can tidy these opcodes to handle multi-byte characters.
   3983 
   3984     The optimization throws away the bit map. We turn the item into a
   3985     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
   3986     that OP_NOT does not support multibyte characters. In the positive case, it
   3987     can cause firstbyte to be set. Otherwise, there can be no first char if
   3988     this item is first, whatever repeat count may follow. In the case of
   3989     reqbyte, save the previous value for reinstating. */
   3990 
   3991 #ifdef SUPPORT_UTF8
   3992     if (class_charcount == 1 && !class_utf8 &&
   3993       (!utf8 || !negate_class || class_lastchar < 128))
   3994 #else
   3995     if (class_charcount == 1)
   3996 #endif
   3997       {
   3998       zeroreqbyte = reqbyte;
   3999 
   4000       /* The OP_NOT opcode works on one-byte characters only. */
   4001 
   4002       if (negate_class)
   4003         {
   4004         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
   4005         zerofirstbyte = firstbyte;
   4006         *code++ = OP_NOT;
   4007         *code++ = class_lastchar;
   4008         break;
   4009         }
   4010 
   4011       /* For a single, positive character, get the value into mcbuffer, and
   4012       then we can handle this with the normal one-character code. */
   4013 
   4014 #ifdef SUPPORT_UTF8
   4015       if (utf8 && class_lastchar > 127)
   4016         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
   4017       else
   4018 #endif
   4019         {
   4020         mcbuffer[0] = class_lastchar;
   4021         mclength = 1;
   4022         }
   4023       goto ONE_CHAR;
   4024       }       /* End of 1-char optimization */
   4025 
   4026     /* The general case - not the one-char optimization. If this is the first
   4027     thing in the branch, there can be no first char setting, whatever the
   4028     repeat count. Any reqbyte setting must remain unchanged after any kind of
   4029     repeat. */
   4030 
   4031     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
   4032     zerofirstbyte = firstbyte;
   4033     zeroreqbyte = reqbyte;
   4034 
   4035     /* If there are characters with values > 255, we have to compile an
   4036     extended class, with its own opcode, unless there was a negated special
   4037     such as \S in the class, and PCRE_UCP is not set, because in that case all
   4038     characters > 255 are in the class, so any that were explicitly given as
   4039     well can be ignored. If (when there are explicit characters > 255 that must
   4040     be listed) there are no characters < 256, we can omit the bitmap in the
   4041     actual compiled code. */
   4042 
   4043 #ifdef SUPPORT_UTF8
   4044     if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
   4045       {
   4046       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
   4047       *code++ = OP_XCLASS;
   4048       code += LINK_SIZE;
   4049       *code = negate_class? XCL_NOT : 0;
   4050 
   4051       /* If the map is required, move up the extra data to make room for it;
   4052       otherwise just move the code pointer to the end of the extra data. */
   4053 
   4054       if (class_charcount > 0)
   4055         {
   4056         *code++ |= XCL_MAP;
   4057         memmove(code + 32, code, class_utf8data - code);
   4058         memcpy(code, classbits, 32);
   4059         code = class_utf8data + 32;
   4060         }
   4061       else code = class_utf8data;
   4062 
   4063       /* Now fill in the complete length of the item */
   4064 
   4065       PUT(previous, 1, code - previous);
   4066       break;   /* End of class handling */
   4067       }
   4068 #endif
   4069 
   4070     /* If there are no characters > 255, or they are all to be included or
   4071     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
   4072     whole class was negated and whether there were negative specials such as \S
   4073     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
   4074     negating it if necessary. */
   4075 
   4076     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
   4077     if (negate_class)
   4078       {
   4079       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
   4080         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
   4081       }
   4082     else
   4083       {
   4084       memcpy(code, classbits, 32);
   4085       }
   4086     code += 32;
   4087     break;
   4088 
   4089 
   4090     /* ===================================================================*/
   4091     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
   4092     has been tested above. */
   4093 
   4094     case CHAR_LEFT_CURLY_BRACKET:
   4095     if (!is_quantifier) goto NORMAL_CHAR;
   4096     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
   4097     if (*errorcodeptr != 0) goto FAILED;
   4098     goto REPEAT;
   4099 
   4100     case CHAR_ASTERISK:
   4101     repeat_min = 0;
   4102     repeat_max = -1;
   4103     goto REPEAT;
   4104 
   4105     case CHAR_PLUS:
   4106     repeat_min = 1;
   4107     repeat_max = -1;
   4108     goto REPEAT;
   4109 
   4110     case CHAR_QUESTION_MARK:
   4111     repeat_min = 0;
   4112     repeat_max = 1;
   4113 
   4114     REPEAT:
   4115     if (previous == NULL)
   4116       {
   4117       *errorcodeptr = ERR9;
   4118       goto FAILED;
   4119       }
   4120 
   4121     if (repeat_min == 0)
   4122       {
   4123       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
   4124       reqbyte = zeroreqbyte;        /* Ditto */
   4125       }
   4126 
   4127     /* Remember whether this is a variable length repeat */
   4128 
   4129     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
   4130 
   4131     op_type = 0;                    /* Default single-char op codes */
   4132     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
   4133 
   4134     /* Save start of previous item, in case we have to move it up to make space
   4135     for an inserted OP_ONCE for the additional '+' extension. */
   4136 
   4137     tempcode = previous;
   4138 
   4139     /* If the next character is '+', we have a possessive quantifier. This
   4140     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
   4141     If the next character is '?' this is a minimizing repeat, by default,
   4142     but if PCRE_UNGREEDY is set, it works the other way round. We change the
   4143     repeat type to the non-default. */
   4144 
   4145     if (ptr[1] == CHAR_PLUS)
   4146       {
   4147       repeat_type = 0;                  /* Force greedy */
   4148       possessive_quantifier = TRUE;
   4149       ptr++;
   4150       }
   4151     else if (ptr[1] == CHAR_QUESTION_MARK)
   4152       {
   4153       repeat_type = greedy_non_default;
   4154       ptr++;
   4155       }
   4156     else repeat_type = greedy_default;
   4157 
   4158     /* If previous was a character match, abolish the item and generate a
   4159     repeat item instead. If a char item has a minimum of more than one, ensure
   4160     that it is set in reqbyte - it might not be if a sequence such as x{3} is
   4161     the first thing in a branch because the x will have gone into firstbyte
   4162     instead.  */
   4163 
   4164     if (*previous == OP_CHAR || *previous == OP_CHARNC)
   4165       {
   4166       /* Deal with UTF-8 characters that take up more than one byte. It's
   4167       easier to write this out separately than try to macrify it. Use c to
   4168       hold the length of the character in bytes, plus 0x80 to flag that it's a
   4169       length rather than a small character. */
   4170 
   4171 #ifdef SUPPORT_UTF8
   4172       if (utf8 && (code[-1] & 0x80) != 0)
   4173         {
   4174         uschar *lastchar = code - 1;
   4175         while((*lastchar & 0xc0) == 0x80) lastchar--;
   4176         c = code - lastchar;            /* Length of UTF-8 character */
   4177         memcpy(utf8_char, lastchar, c); /* Save the char */
   4178         c |= 0x80;                      /* Flag c as a length */
   4179         }
   4180       else
   4181 #endif
   4182 
   4183       /* Handle the case of a single byte - either with no UTF8 support, or
   4184       with UTF-8 disabled, or for a UTF-8 character < 128. */
   4185 
   4186         {
   4187         c = code[-1];
   4188         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
   4189         }
   4190 
   4191       /* If the repetition is unlimited, it pays to see if the next thing on
   4192       the line is something that cannot possibly match this character. If so,
   4193       automatically possessifying this item gains some performance in the case
   4194       where the match fails. */
   4195 
   4196       if (!possessive_quantifier &&
   4197           repeat_max < 0 &&
   4198           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
   4199         {
   4200         repeat_type = 0;    /* Force greedy */
   4201         possessive_quantifier = TRUE;
   4202         }
   4203 
   4204       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
   4205       }
   4206 
   4207     /* If previous was a single negated character ([^a] or similar), we use
   4208     one of the special opcodes, replacing it. The code is shared with single-
   4209     character repeats by setting op_type to add a suitable offset into
   4210     repeat_type. We can also test for auto-possessification. OP_NOT is
   4211     currently used only for single-byte chars. */
   4212 
   4213     else if (*previous == OP_NOT)
   4214       {
   4215       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
   4216       c = previous[1];
   4217       if (!possessive_quantifier &&
   4218           repeat_max < 0 &&
   4219           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
   4220         {
   4221         repeat_type = 0;    /* Force greedy */
   4222         possessive_quantifier = TRUE;
   4223         }
   4224       goto OUTPUT_SINGLE_REPEAT;
   4225       }
   4226 
   4227     /* If previous was a character type match (\d or similar), abolish it and
   4228     create a suitable repeat item. The code is shared with single-character
   4229     repeats by setting op_type to add a suitable offset into repeat_type. Note
   4230     that the Unicode property types will be present only when SUPPORT_UCP is
   4231     defined, but we don't wrap the little bits of code here because it just
   4232     makes it horribly messy. */
   4233 
   4234     else if (*previous < OP_EODN)
   4235       {
   4236       uschar *oldcode;
   4237       int prop_type, prop_value;
   4238       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
   4239       c = *previous;
   4240 
   4241       if (!possessive_quantifier &&
   4242           repeat_max < 0 &&
   4243           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
   4244         {
   4245         repeat_type = 0;    /* Force greedy */
   4246         possessive_quantifier = TRUE;
   4247         }
   4248 
   4249       OUTPUT_SINGLE_REPEAT:
   4250       if (*previous == OP_PROP || *previous == OP_NOTPROP)
   4251         {
   4252         prop_type = previous[1];
   4253         prop_value = previous[2];
   4254         }
   4255       else prop_type = prop_value = -1;
   4256 
   4257       oldcode = code;
   4258       code = previous;                  /* Usually overwrite previous item */
   4259 
   4260       /* If the maximum is zero then the minimum must also be zero; Perl allows
   4261       this case, so we do too - by simply omitting the item altogether. */
   4262 
   4263       if (repeat_max == 0) goto END_REPEAT;
   4264 
   4265       /*--------------------------------------------------------------------*/
   4266       /* This code is obsolete from release 8.00; the restriction was finally
   4267       removed: */
   4268 
   4269       /* All real repeats make it impossible to handle partial matching (maybe
   4270       one day we will be able to remove this restriction). */
   4271 
   4272       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
   4273       /*--------------------------------------------------------------------*/
   4274 
   4275       /* Combine the op_type with the repeat_type */
   4276 
   4277       repeat_type += op_type;
   4278 
   4279       /* A minimum of zero is handled either as the special case * or ?, or as
   4280       an UPTO, with the maximum given. */
   4281 
   4282       if (repeat_min == 0)
   4283         {
   4284         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
   4285           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
   4286         else
   4287           {
   4288           *code++ = OP_UPTO + repeat_type;
   4289           PUT2INC(code, 0, repeat_max);
   4290           }
   4291         }
   4292 
   4293       /* A repeat minimum of 1 is optimized into some special cases. If the
   4294       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
   4295       left in place and, if the maximum is greater than 1, we use OP_UPTO with
   4296       one less than the maximum. */
   4297 
   4298       else if (repeat_min == 1)
   4299         {
   4300         if (repeat_max == -1)
   4301           *code++ = OP_PLUS + repeat_type;
   4302         else
   4303           {
   4304           code = oldcode;                 /* leave previous item in place */
   4305           if (repeat_max == 1) goto END_REPEAT;
   4306           *code++ = OP_UPTO + repeat_type;
   4307           PUT2INC(code, 0, repeat_max - 1);
   4308           }
   4309         }
   4310 
   4311       /* The case {n,n} is just an EXACT, while the general case {n,m} is
   4312       handled as an EXACT followed by an UPTO. */
   4313 
   4314       else
   4315         {
   4316         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
   4317         PUT2INC(code, 0, repeat_min);
   4318 
   4319         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
   4320         we have to insert the character for the previous code. For a repeated
   4321         Unicode property match, there are two extra bytes that define the
   4322         required property. In UTF-8 mode, long characters have their length in
   4323         c, with the 0x80 bit as a flag. */
   4324 
   4325         if (repeat_max < 0)
   4326           {
   4327 #ifdef SUPPORT_UTF8
   4328           if (utf8 && c >= 128)
   4329             {
   4330             memcpy(code, utf8_char, c & 7);
   4331             code += c & 7;
   4332             }
   4333           else
   4334 #endif
   4335             {
   4336             *code++ = c;
   4337             if (prop_type >= 0)
   4338               {
   4339               *code++ = prop_type;
   4340               *code++ = prop_value;
   4341               }
   4342             }
   4343           *code++ = OP_STAR + repeat_type;
   4344           }
   4345 
   4346         /* Else insert an UPTO if the max is greater than the min, again
   4347         preceded by the character, for the previously inserted code. If the
   4348         UPTO is just for 1 instance, we can use QUERY instead. */
   4349 
   4350         else if (repeat_max != repeat_min)
   4351           {
   4352 #ifdef SUPPORT_UTF8
   4353           if (utf8 && c >= 128)
   4354             {
   4355             memcpy(code, utf8_char, c & 7);
   4356             code += c & 7;
   4357             }
   4358           else
   4359 #endif
   4360           *code++ = c;
   4361           if (prop_type >= 0)
   4362             {
   4363             *code++ = prop_type;
   4364             *code++ = prop_value;
   4365             }
   4366           repeat_max -= repeat_min;
   4367 
   4368           if (repeat_max == 1)
   4369             {
   4370             *code++ = OP_QUERY + repeat_type;
   4371             }
   4372           else
   4373             {
   4374             *code++ = OP_UPTO + repeat_type;
   4375             PUT2INC(code, 0, repeat_max);
   4376             }
   4377           }
   4378         }
   4379 
   4380       /* The character or character type itself comes last in all cases. */
   4381 
   4382 #ifdef SUPPORT_UTF8
   4383       if (utf8 && c >= 128)
   4384         {
   4385         memcpy(code, utf8_char, c & 7);
   4386         code += c & 7;
   4387         }
   4388       else
   4389 #endif
   4390       *code++ = c;
   4391 
   4392       /* For a repeated Unicode property match, there are two extra bytes that
   4393       define the required property. */
   4394 
   4395 #ifdef SUPPORT_UCP
   4396       if (prop_type >= 0)
   4397         {
   4398         *code++ = prop_type;
   4399         *code++ = prop_value;
   4400         }
   4401 #endif
   4402       }
   4403 
   4404     /* If previous was a character class or a back reference, we put the repeat
   4405     stuff after it, but just skip the item if the repeat was {0,0}. */
   4406 
   4407     else if (*previous == OP_CLASS ||
   4408              *previous == OP_NCLASS ||
   4409 #ifdef SUPPORT_UTF8
   4410              *previous == OP_XCLASS ||
   4411 #endif
   4412              *previous == OP_REF)
   4413       {
   4414       if (repeat_max == 0)
   4415         {
   4416         code = previous;
   4417         goto END_REPEAT;
   4418         }
   4419 
   4420       /*--------------------------------------------------------------------*/
   4421       /* This code is obsolete from release 8.00; the restriction was finally
   4422       removed: */
   4423 
   4424       /* All real repeats make it impossible to handle partial matching (maybe
   4425       one day we will be able to remove this restriction). */
   4426 
   4427       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
   4428       /*--------------------------------------------------------------------*/
   4429 
   4430       if (repeat_min == 0 && repeat_max == -1)
   4431         *code++ = OP_CRSTAR + repeat_type;
   4432       else if (repeat_min == 1 && repeat_max == -1)
   4433         *code++ = OP_CRPLUS + repeat_type;
   4434       else if (repeat_min == 0 && repeat_max == 1)
   4435         *code++ = OP_CRQUERY + repeat_type;
   4436       else
   4437         {
   4438         *code++ = OP_CRRANGE + repeat_type;
   4439         PUT2INC(code, 0, repeat_min);
   4440         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
   4441         PUT2INC(code, 0, repeat_max);
   4442         }
   4443       }
   4444 
   4445     /* If previous was a bracket group, we may have to replicate it in certain
   4446     cases. */
   4447 
   4448     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
   4449              *previous == OP_ONCE || *previous == OP_COND)
   4450       {
   4451       register int i;
   4452       int ketoffset = 0;
   4453       int len = (int)(code - previous);
   4454       uschar *bralink = NULL;
   4455 
   4456       /* Repeating a DEFINE group is pointless */
   4457 
   4458       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
   4459         {
   4460         *errorcodeptr = ERR55;
   4461         goto FAILED;
   4462         }
   4463 
   4464       /* If the maximum repeat count is unlimited, find the end of the bracket
   4465       by scanning through from the start, and compute the offset back to it
   4466       from the current code pointer. There may be an OP_OPT setting following
   4467       the final KET, so we can't find the end just by going back from the code
   4468       pointer. */
   4469 
   4470       if (repeat_max == -1)
   4471         {
   4472         register uschar *ket = previous;
   4473         do ket += GET(ket, 1); while (*ket != OP_KET);
   4474         ketoffset = (int)(code - ket);
   4475         }
   4476 
   4477       /* The case of a zero minimum is special because of the need to stick
   4478       OP_BRAZERO in front of it, and because the group appears once in the
   4479       data, whereas in other cases it appears the minimum number of times. For
   4480       this reason, it is simplest to treat this case separately, as otherwise
   4481       the code gets far too messy. There are several special subcases when the
   4482       minimum is zero. */
   4483 
   4484       if (repeat_min == 0)
   4485         {
   4486         /* If the maximum is also zero, we used to just omit the group from the
   4487         output altogether, like this:
   4488 
   4489         ** if (repeat_max == 0)
   4490         **   {
   4491         **   code = previous;
   4492         **   goto END_REPEAT;
   4493         **   }
   4494 
   4495         However, that fails when a group is referenced as a subroutine from
   4496         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
   4497         so that it is skipped on execution. As we don't have a list of which
   4498         groups are referenced, we cannot do this selectively.
   4499 
   4500         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
   4501         and do no more at this point. However, we do need to adjust any
   4502         OP_RECURSE calls inside the group that refer to the group itself or any
   4503         internal or forward referenced group, because the offset is from the
   4504         start of the whole regex. Temporarily terminate the pattern while doing
   4505         this. */
   4506 
   4507         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
   4508           {
   4509           *code = OP_END;
   4510           adjust_recurse(previous, 1, utf8, cd, save_hwm);
   4511           memmove(previous+1, previous, len);
   4512           code++;
   4513           if (repeat_max == 0)
   4514             {
   4515             *previous++ = OP_SKIPZERO;
   4516             goto END_REPEAT;
   4517             }
   4518           *previous++ = OP_BRAZERO + repeat_type;
   4519           }
   4520 
   4521         /* If the maximum is greater than 1 and limited, we have to replicate
   4522         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
   4523         The first one has to be handled carefully because it's the original
   4524         copy, which has to be moved up. The remainder can be handled by code
   4525         that is common with the non-zero minimum case below. We have to
   4526         adjust the value or repeat_max, since one less copy is required. Once
   4527         again, we may have to adjust any OP_RECURSE calls inside the group. */
   4528 
   4529         else
   4530           {
   4531           int offset;
   4532           *code = OP_END;
   4533           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
   4534           memmove(previous + 2 + LINK_SIZE, previous, len);
   4535           code += 2 + LINK_SIZE;
   4536           *previous++ = OP_BRAZERO + repeat_type;
   4537           *previous++ = OP_BRA;
   4538 
   4539           /* We chain together the bracket offset fields that have to be
   4540           filled in later when the ends of the brackets are reached. */
   4541 
   4542           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
   4543           bralink = previous;
   4544           PUTINC(previous, 0, offset);
   4545           }
   4546 
   4547         repeat_max--;
   4548         }
   4549 
   4550       /* If the minimum is greater than zero, replicate the group as many
   4551       times as necessary, and adjust the maximum to the number of subsequent
   4552       copies that we need. If we set a first char from the group, and didn't
   4553       set a required char, copy the latter from the former. If there are any
   4554       forward reference subroutine calls in the group, there will be entries on
   4555       the workspace list; replicate these with an appropriate increment. */
   4556 
   4557       else
   4558         {
   4559         if (repeat_min > 1)
   4560           {
   4561           /* In the pre-compile phase, we don't actually do the replication. We
   4562           just adjust the length as if we had. Do some paranoid checks for
   4563           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
   4564           integer type when available, otherwise double. */
   4565 
   4566           if (lengthptr != NULL)
   4567             {
   4568             int delta = (repeat_min - 1)*length_prevgroup;
   4569             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
   4570                   (INT64_OR_DOUBLE)length_prevgroup >
   4571                     (INT64_OR_DOUBLE)INT_MAX ||
   4572                 OFLOW_MAX - *lengthptr < delta)
   4573               {
   4574               *errorcodeptr = ERR20;
   4575               goto FAILED;
   4576               }
   4577             *lengthptr += delta;
   4578             }
   4579 
   4580           /* This is compiling for real */
   4581 
   4582           else
   4583             {
   4584             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
   4585             for (i = 1; i < repeat_min; i++)
   4586               {
   4587               uschar *hc;
   4588               uschar *this_hwm = cd->hwm;
   4589               memcpy(code, previous, len);
   4590               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
   4591                 {
   4592                 PUT(cd->hwm, 0, GET(hc, 0) + len);
   4593                 cd->hwm += LINK_SIZE;
   4594                 }
   4595               save_hwm = this_hwm;
   4596               code += len;
   4597               }
   4598             }
   4599           }
   4600 
   4601         if (repeat_max > 0) repeat_max -= repeat_min;
   4602         }
   4603 
   4604       /* This code is common to both the zero and non-zero minimum cases. If
   4605       the maximum is limited, it replicates the group in a nested fashion,
   4606       remembering the bracket starts on a stack. In the case of a zero minimum,
   4607       the first one was set up above. In all cases the repeat_max now specifies
   4608       the number of additional copies needed. Again, we must remember to
   4609       replicate entries on the forward reference list. */
   4610 
   4611       if (repeat_max >= 0)
   4612         {
   4613         /* In the pre-compile phase, we don't actually do the replication. We
   4614         just adjust the length as if we had. For each repetition we must add 1
   4615         to the length for BRAZERO and for all but the last repetition we must
   4616         add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
   4617         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
   4618         a 64-bit integer type when available, otherwise double. */
   4619 
   4620         if (lengthptr != NULL && repeat_max > 0)
   4621           {
   4622           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
   4623                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
   4624           if ((INT64_OR_DOUBLE)repeat_max *
   4625                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
   4626                   > (INT64_OR_DOUBLE)INT_MAX ||
   4627               OFLOW_MAX - *lengthptr < delta)
   4628             {
   4629             *errorcodeptr = ERR20;
   4630             goto FAILED;
   4631             }
   4632           *lengthptr += delta;
   4633           }
   4634 
   4635         /* This is compiling for real */
   4636 
   4637         else for (i = repeat_max - 1; i >= 0; i--)
   4638           {
   4639           uschar *hc;
   4640           uschar *this_hwm = cd->hwm;
   4641 
   4642           *code++ = OP_BRAZERO + repeat_type;
   4643 
   4644           /* All but the final copy start a new nesting, maintaining the
   4645           chain of brackets outstanding. */
   4646 
   4647           if (i != 0)
   4648             {
   4649             int offset;
   4650             *code++ = OP_BRA;
   4651             offset = (bralink == NULL)? 0 : (int)(code - bralink);
   4652             bralink = code;
   4653             PUTINC(code, 0, offset);
   4654             }
   4655 
   4656           memcpy(code, previous, len);
   4657           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
   4658             {
   4659             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
   4660             cd->hwm += LINK_SIZE;
   4661             }
   4662           save_hwm = this_hwm;
   4663           code += len;
   4664           }
   4665 
   4666         /* Now chain through the pending brackets, and fill in their length
   4667         fields (which are holding the chain links pro tem). */
   4668 
   4669         while (bralink != NULL)
   4670           {
   4671           int oldlinkoffset;
   4672           int offset = (int)(code - bralink + 1);
   4673           uschar *bra = code - offset;
   4674           oldlinkoffset = GET(bra, 1);
   4675           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
   4676           *code++ = OP_KET;
   4677           PUTINC(code, 0, offset);
   4678           PUT(bra, 1, offset);
   4679           }
   4680         }
   4681 
   4682       /* If the maximum is unlimited, set a repeater in the final copy. We
   4683       can't just offset backwards from the current code point, because we
   4684       don't know if there's been an options resetting after the ket. The
   4685       correct offset was computed above.
   4686 
   4687       Then, when we are doing the actual compile phase, check to see whether
   4688       this group is a non-atomic one that could match an empty string. If so,
   4689       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
   4690       that runtime checking can be done. [This check is also applied to
   4691       atomic groups at runtime, but in a different way.] */
   4692 
   4693       else
   4694         {
   4695         uschar *ketcode = code - ketoffset;
   4696         uschar *bracode = ketcode - GET(ketcode, 1);
   4697         *ketcode = OP_KETRMAX + repeat_type;
   4698         if (lengthptr == NULL && *bracode != OP_ONCE)
   4699           {
   4700           uschar *scode = bracode;
   4701           do
   4702             {
   4703             if (could_be_empty_branch(scode, ketcode, utf8, cd))
   4704               {
   4705               *bracode += OP_SBRA - OP_BRA;
   4706               break;
   4707               }
   4708             scode += GET(scode, 1);
   4709             }
   4710           while (*scode == OP_ALT);
   4711           }
   4712         }
   4713       }
   4714 
   4715     /* If previous is OP_FAIL, it was generated by an empty class [] in
   4716     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
   4717     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
   4718     error above. We can just ignore the repeat in JS case. */
   4719 
   4720     else if (*previous == OP_FAIL) goto END_REPEAT;
   4721 
   4722     /* Else there's some kind of shambles */
   4723 
   4724     else
   4725       {
   4726       *errorcodeptr = ERR11;
   4727       goto FAILED;
   4728       }
   4729 
   4730     /* If the character following a repeat is '+', or if certain optimization
   4731     tests above succeeded, possessive_quantifier is TRUE. For some of the
   4732     simpler opcodes, there is a special alternative opcode for this. For
   4733     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
   4734     The '+' notation is just syntactic sugar, taken from Sun's Java package,
   4735     but the special opcodes can optimize it a bit. The repeated item starts at
   4736     tempcode, not at previous, which might be the first part of a string whose
   4737     (former) last char we repeated.
   4738 
   4739     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
   4740     an 'upto' may follow. We skip over an 'exact' item, and then test the
   4741     length of what remains before proceeding. */
   4742 
   4743     if (possessive_quantifier)
   4744       {
   4745       int len;
   4746 
   4747       if (*tempcode == OP_TYPEEXACT)
   4748         tempcode += _pcre_OP_lengths[*tempcode] +
   4749           ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
   4750 
   4751       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
   4752         {
   4753         tempcode += _pcre_OP_lengths[*tempcode];
   4754 #ifdef SUPPORT_UTF8
   4755         if (utf8 && tempcode[-1] >= 0xc0)
   4756           tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
   4757 #endif
   4758         }
   4759 
   4760       len = (int)(code - tempcode);
   4761       if (len > 0) switch (*tempcode)
   4762         {
   4763         case OP_STAR:  *tempcode = OP_POSSTAR; break;
   4764         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
   4765         case OP_QUERY: *tempcode = OP_POSQUERY; break;
   4766         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
   4767 
   4768         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
   4769         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
   4770         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
   4771         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
   4772 
   4773         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
   4774         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
   4775         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
   4776         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
   4777 
   4778         /* Because we are moving code along, we must ensure that any
   4779         pending recursive references are updated. */
   4780 
   4781         default:
   4782         *code = OP_END;
   4783         adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
   4784         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
   4785         code += 1 + LINK_SIZE;
   4786         len += 1 + LINK_SIZE;
   4787         tempcode[0] = OP_ONCE;
   4788         *code++ = OP_KET;
   4789         PUTINC(code, 0, len);
   4790         PUT(tempcode, 1, len);
   4791         break;
   4792         }
   4793       }
   4794 
   4795     /* In all cases we no longer have a previous item. We also set the
   4796     "follows varying string" flag for subsequently encountered reqbytes if
   4797     it isn't already set and we have just passed a varying length item. */
   4798 
   4799     END_REPEAT:
   4800     previous = NULL;
   4801     cd->req_varyopt |= reqvary;
   4802     break;
   4803 
   4804 
   4805     /* ===================================================================*/
   4806     /* Start of nested parenthesized sub-expression, or comment or lookahead or
   4807     lookbehind or option setting or condition or all the other extended
   4808     parenthesis forms.  */
   4809 
   4810     case CHAR_LEFT_PARENTHESIS:
   4811     newoptions = options;
   4812     skipbytes = 0;
   4813     bravalue = OP_CBRA;
   4814     save_hwm = cd->hwm;
   4815     reset_bracount = FALSE;
   4816 
   4817     /* First deal with various "verbs" that can be introduced by '*'. */
   4818 
   4819     if (*(++ptr) == CHAR_ASTERISK &&
   4820          ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
   4821       {
   4822       int i, namelen;
   4823       int arglen = 0;
   4824       const char *vn = verbnames;
   4825       const uschar *name = ptr + 1;
   4826       const uschar *arg = NULL;
   4827       previous = NULL;
   4828       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
   4829       namelen = (int)(ptr - name);
   4830 
   4831       if (*ptr == CHAR_COLON)
   4832         {
   4833         arg = ++ptr;
   4834         while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
   4835           || *ptr == '_') ptr++;
   4836         arglen = (int)(ptr - arg);
   4837         }
   4838 
   4839       if (*ptr != CHAR_RIGHT_PARENTHESIS)
   4840         {
   4841         *errorcodeptr = ERR60;
   4842         goto FAILED;
   4843         }
   4844 
   4845       /* Scan the table of verb names */
   4846 
   4847       for (i = 0; i < verbcount; i++)
   4848         {
   4849         if (namelen == verbs[i].len &&
   4850             strncmp((char *)name, vn, namelen) == 0)
   4851           {
   4852           /* Check for open captures before ACCEPT */
   4853 
   4854           if (verbs[i].op == OP_ACCEPT)
   4855             {
   4856             open_capitem *oc;
   4857             cd->had_accept = TRUE;
   4858             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
   4859               {
   4860               *code++ = OP_CLOSE;
   4861               PUT2INC(code, 0, oc->number);
   4862               }
   4863             }
   4864 
   4865           /* Handle the cases with/without an argument */
   4866 
   4867           if (arglen == 0)
   4868             {
   4869             if (verbs[i].op < 0)   /* Argument is mandatory */
   4870               {
   4871               *errorcodeptr = ERR66;
   4872               goto FAILED;
   4873               }
   4874             *code = verbs[i].op;
   4875             if (*code++ == OP_THEN)
   4876               {
   4877               PUT(code, 0, code - bcptr->current_branch - 1);
   4878               code += LINK_SIZE;
   4879               }
   4880             }
   4881 
   4882           else
   4883             {
   4884             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
   4885               {
   4886               *errorcodeptr = ERR59;
   4887               goto FAILED;
   4888               }
   4889             *code = verbs[i].op_arg;
   4890             if (*code++ == OP_THEN_ARG)
   4891               {
   4892               PUT(code, 0, code - bcptr->current_branch - 1);
   4893               code += LINK_SIZE;
   4894               }
   4895             *code++ = arglen;
   4896             memcpy(code, arg, arglen);
   4897             code += arglen;
   4898             *code++ = 0;
   4899             }
   4900 
   4901           break;  /* Found verb, exit loop */
   4902           }
   4903 
   4904         vn += verbs[i].len + 1;
   4905         }
   4906 
   4907       if (i < verbcount) continue;    /* Successfully handled a verb */
   4908       *errorcodeptr = ERR60;          /* Verb not recognized */
   4909       goto FAILED;
   4910       }
   4911 
   4912     /* Deal with the extended parentheses; all are introduced by '?', and the
   4913     appearance of any of them means that this is not a capturing group. */
   4914 
   4915     else if (*ptr == CHAR_QUESTION_MARK)
   4916       {
   4917       int i, set, unset, namelen;
   4918       int *optset;
   4919       const uschar *name;
   4920       uschar *slot;
   4921 
   4922       switch (*(++ptr))
   4923         {
   4924         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
   4925         ptr++;
   4926         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
   4927         if (*ptr == 0)
   4928           {
   4929           *errorcodeptr = ERR18;
   4930           goto FAILED;
   4931           }
   4932         continue;
   4933 
   4934 
   4935         /* ------------------------------------------------------------ */
   4936         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
   4937         reset_bracount = TRUE;
   4938         /* Fall through */
   4939 
   4940         /* ------------------------------------------------------------ */
   4941         case CHAR_COLON:          /* Non-capturing bracket */
   4942         bravalue = OP_BRA;
   4943         ptr++;
   4944         break;
   4945 
   4946 
   4947         /* ------------------------------------------------------------ */
   4948         case CHAR_LEFT_PARENTHESIS:
   4949         bravalue = OP_COND;       /* Conditional group */
   4950 
   4951         /* A condition can be an assertion, a number (referring to a numbered
   4952         group), a name (referring to a named group), or 'R', referring to
   4953         recursion. R<digits> and R&name are also permitted for recursion tests.
   4954 
   4955         There are several syntaxes for testing a named group: (?(name)) is used
   4956         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
   4957 
   4958         There are two unfortunate ambiguities, caused by history. (a) 'R' can
   4959         be the recursive thing or the name 'R' (and similarly for 'R' followed
   4960         by digits), and (b) a number could be a name that consists of digits.
   4961         In both cases, we look for a name first; if not found, we try the other
   4962         cases. */
   4963 
   4964         /* For conditions that are assertions, check the syntax, and then exit
   4965         the switch. This will take control down to where bracketed groups,
   4966         including assertions, are processed. */
   4967 
   4968         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
   4969             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
   4970           break;
   4971 
   4972         /* Most other conditions use OP_CREF (a couple change to OP_RREF
   4973         below), and all need to skip 3 bytes at the start of the group. */
   4974 
   4975         code[1+LINK_SIZE] = OP_CREF;
   4976         skipbytes = 3;
   4977         refsign = -1;
   4978 
   4979         /* Check for a test for recursion in a named group. */
   4980 
   4981         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
   4982           {
   4983           terminator = -1;
   4984           ptr += 2;
   4985           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
   4986           }
   4987 
   4988         /* Check for a test for a named group's having been set, using the Perl
   4989         syntax (?(<name>) or (?('name') */
   4990 
   4991         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
   4992           {
   4993           terminator = CHAR_GREATER_THAN_SIGN;
   4994           ptr++;
   4995           }
   4996         else if (ptr[1] == CHAR_APOSTROPHE)
   4997           {
   4998           terminator = CHAR_APOSTROPHE;
   4999           ptr++;
   5000           }
   5001         else
   5002           {
   5003           terminator = 0;
   5004           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
   5005           }
   5006 
   5007         /* We now expect to read a name; anything else is an error */
   5008 
   5009         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
   5010           {
   5011           ptr += 1;  /* To get the right offset */
   5012           *errorcodeptr = ERR28;
   5013           goto FAILED;
   5014           }
   5015 
   5016         /* Read the name, but also get it as a number if it's all digits */
   5017 
   5018         recno = 0;
   5019         name = ++ptr;
   5020         while ((cd->ctypes[*ptr] & ctype_word) != 0)
   5021           {
   5022           if (recno >= 0)
   5023             recno = ((digitab[*ptr] & ctype_digit) != 0)?
   5024               recno * 10 + *ptr - CHAR_0 : -1;
   5025           ptr++;
   5026           }
   5027         namelen = (int)(ptr - name);
   5028 
   5029         if ((terminator > 0 && *ptr++ != terminator) ||
   5030             *ptr++ != CHAR_RIGHT_PARENTHESIS)
   5031           {
   5032           ptr--;      /* Error offset */
   5033           *errorcodeptr = ERR26;
   5034           goto FAILED;
   5035           }
   5036 
   5037         /* Do no further checking in the pre-compile phase. */
   5038 
   5039         if (lengthptr != NULL) break;
   5040 
   5041         /* In the real compile we do the work of looking for the actual
   5042         reference. If the string started with "+" or "-" we require the rest to
   5043         be digits, in which case recno will be set. */
   5044 
   5045         if (refsign > 0)
   5046           {
   5047           if (recno <= 0)
   5048             {
   5049             *errorcodeptr = ERR58;
   5050             goto FAILED;
   5051             }
   5052           recno = (refsign == CHAR_MINUS)?
   5053             cd->bracount - recno + 1 : recno +cd->bracount;
   5054           if (recno <= 0 || recno > cd->final_bracount)
   5055             {
   5056             *errorcodeptr = ERR15;
   5057             goto FAILED;
   5058             }
   5059           PUT2(code, 2+LINK_SIZE, recno);
   5060           break;
   5061           }
   5062 
   5063         /* Otherwise (did not start with "+" or "-"), start by looking for the
   5064         name. If we find a name, add one to the opcode to change OP_CREF or
   5065         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
   5066         except they record that the reference was originally to a name. The
   5067         information is used to check duplicate names. */
   5068 
   5069         slot = cd->name_table;
   5070         for (i = 0; i < cd->names_found; i++)
   5071           {
   5072           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
   5073           slot += cd->name_entry_size;
   5074           }
   5075 
   5076         /* Found a previous named subpattern */
   5077 
   5078         if (i < cd->names_found)
   5079           {
   5080           recno = GET2(slot, 0);
   5081           PUT2(code, 2+LINK_SIZE, recno);
   5082           code[1+LINK_SIZE]++;
   5083           }
   5084 
   5085         /* Search the pattern for a forward reference */
   5086 
   5087         else if ((i = find_parens(cd, name, namelen,
   5088                         (options & PCRE_EXTENDED) != 0, utf8)) > 0)
   5089           {
   5090           PUT2(code, 2+LINK_SIZE, i);
   5091           code[1+LINK_SIZE]++;
   5092           }
   5093 
   5094         /* If terminator == 0 it means that the name followed directly after
   5095         the opening parenthesis [e.g. (?(abc)...] and in this case there are
   5096         some further alternatives to try. For the cases where terminator != 0
   5097         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
   5098         now checked all the possibilities, so give an error. */
   5099 
   5100         else if (terminator != 0)
   5101           {
   5102           *errorcodeptr = ERR15;
   5103           goto FAILED;
   5104           }
   5105 
   5106         /* Check for (?(R) for recursion. Allow digits after R to specify a
   5107         specific group number. */
   5108 
   5109         else if (*name == CHAR_R)
   5110           {
   5111           recno = 0;
   5112           for (i = 1; i < namelen; i++)
   5113             {
   5114             if ((digitab[name[i]] & ctype_digit) == 0)
   5115               {
   5116               *errorcodeptr = ERR15;
   5117               goto FAILED;
   5118               }
   5119             recno = recno * 10 + name[i] - CHAR_0;
   5120             }
   5121           if (recno == 0) recno = RREF_ANY;
   5122           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
   5123           PUT2(code, 2+LINK_SIZE, recno);
   5124           }
   5125 
   5126         /* Similarly, check for the (?(DEFINE) "condition", which is always
   5127         false. */
   5128 
   5129         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
   5130           {
   5131           code[1+LINK_SIZE] = OP_DEF;
   5132           skipbytes = 1;
   5133           }
   5134 
   5135         /* Check for the "name" actually being a subpattern number. We are
   5136         in the second pass here, so final_bracount is set. */
   5137 
   5138         else if (recno > 0 && recno <= cd->final_bracount)
   5139           {
   5140           PUT2(code, 2+LINK_SIZE, recno);
   5141           }
   5142 
   5143         /* Either an unidentified subpattern, or a reference to (?(0) */
   5144 
   5145         else
   5146           {
   5147           *errorcodeptr = (recno == 0)? ERR35: ERR15;
   5148           goto FAILED;
   5149           }
   5150         break;
   5151 
   5152 
   5153         /* ------------------------------------------------------------ */
   5154         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
   5155         bravalue = OP_ASSERT;
   5156         ptr++;
   5157         break;
   5158 
   5159 
   5160         /* ------------------------------------------------------------ */
   5161         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
   5162         ptr++;
   5163         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
   5164           {
   5165           *code++ = OP_FAIL;
   5166           previous = NULL;
   5167           continue;
   5168           }
   5169         bravalue = OP_ASSERT_NOT;
   5170         break;
   5171 
   5172 
   5173         /* ------------------------------------------------------------ */
   5174         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
   5175         switch (ptr[1])
   5176           {
   5177           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
   5178           bravalue = OP_ASSERTBACK;
   5179           ptr += 2;
   5180           break;
   5181 
   5182           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
   5183           bravalue = OP_ASSERTBACK_NOT;
   5184           ptr += 2;
   5185           break;
   5186 
   5187           default:                /* Could be name define, else bad */
   5188           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
   5189           ptr++;                  /* Correct offset for error */
   5190           *errorcodeptr = ERR24;
   5191           goto FAILED;
   5192           }
   5193         break;
   5194 
   5195 
   5196         /* ------------------------------------------------------------ */
   5197         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
   5198         bravalue = OP_ONCE;
   5199         ptr++;
   5200         break;
   5201 
   5202 
   5203         /* ------------------------------------------------------------ */
   5204         case CHAR_C:                 /* Callout - may be followed by digits; */
   5205         previous_callout = code;  /* Save for later completion */
   5206         after_manual_callout = 1; /* Skip one item before completing */
   5207         *code++ = OP_CALLOUT;
   5208           {
   5209           int n = 0;
   5210           while ((digitab[*(++ptr)] & ctype_digit) != 0)
   5211             n = n * 10 + *ptr - CHAR_0;
   5212           if (*ptr != CHAR_RIGHT_PARENTHESIS)
   5213             {
   5214             *errorcodeptr = ERR39;
   5215             goto FAILED;
   5216             }
   5217           if (n > 255)
   5218             {
   5219             *errorcodeptr = ERR38;
   5220             goto FAILED;
   5221             }
   5222           *code++ = n;
   5223           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
   5224           PUT(code, LINK_SIZE, 0);                          /* Default length */
   5225           code += 2 * LINK_SIZE;
   5226           }
   5227         previous = NULL;
   5228         continue;
   5229 
   5230 
   5231         /* ------------------------------------------------------------ */
   5232         case CHAR_P:              /* Python-style named subpattern handling */
   5233         if (*(++ptr) == CHAR_EQUALS_SIGN ||
   5234             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
   5235           {
   5236           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
   5237           terminator = CHAR_RIGHT_PARENTHESIS;
   5238           goto NAMED_REF_OR_RECURSE;
   5239           }
   5240         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
   5241           {
   5242           *errorcodeptr = ERR41;
   5243           goto FAILED;
   5244           }
   5245         /* Fall through to handle (?P< as (?< is handled */
   5246 
   5247 
   5248         /* ------------------------------------------------------------ */
   5249         DEFINE_NAME:    /* Come here from (?< handling */
   5250         case CHAR_APOSTROPHE:
   5251           {
   5252           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
   5253             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
   5254           name = ++ptr;
   5255 
   5256           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
   5257           namelen = (int)(ptr - name);
   5258 
   5259           /* In the pre-compile phase, just do a syntax check. */
   5260 
   5261           if (lengthptr != NULL)
   5262             {
   5263             if (*ptr != terminator)
   5264               {
   5265               *errorcodeptr = ERR42;
   5266               goto FAILED;
   5267               }
   5268             if (cd->names_found >= MAX_NAME_COUNT)
   5269               {
   5270               *errorcodeptr = ERR49;
   5271               goto FAILED;
   5272               }
   5273             if (namelen + 3 > cd->name_entry_size)
   5274               {
   5275               cd->name_entry_size = namelen + 3;
   5276               if (namelen > MAX_NAME_SIZE)
   5277                 {
   5278                 *errorcodeptr = ERR48;
   5279                 goto FAILED;
   5280                 }
   5281               }
   5282             }
   5283 
   5284           /* In the real compile, create the entry in the table, maintaining
   5285           alphabetical order. Duplicate names for different numbers are
   5286           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
   5287           number are always OK. (An existing number can be re-used if (?|
   5288           appears in the pattern.) In either event, a duplicate name results in
   5289           a duplicate entry in the table, even if the number is the same. This
   5290           is because the number of names, and hence the table size, is computed
   5291           in the pre-compile, and it affects various numbers and pointers which
   5292           would all have to be modified, and the compiled code moved down, if
   5293           duplicates with the same number were omitted from the table. This
   5294           doesn't seem worth the hassle. However, *different* names for the
   5295           same number are not permitted. */
   5296 
   5297           else
   5298             {
   5299             BOOL dupname = FALSE;
   5300             slot = cd->name_table;
   5301 
   5302             for (i = 0; i < cd->names_found; i++)
   5303               {
   5304               int crc = memcmp(name, slot+2, namelen);
   5305               if (crc == 0)
   5306                 {
   5307                 if (slot[2+namelen] == 0)
   5308                   {
   5309                   if (GET2(slot, 0) != cd->bracount + 1 &&
   5310                       (options & PCRE_DUPNAMES) == 0)
   5311                     {
   5312                     *errorcodeptr = ERR43;
   5313                     goto FAILED;
   5314                     }
   5315                   else dupname = TRUE;
   5316                   }
   5317                 else crc = -1;      /* Current name is a substring */
   5318                 }
   5319 
   5320               /* Make space in the table and break the loop for an earlier
   5321               name. For a duplicate or later name, carry on. We do this for
   5322               duplicates so that in the simple case (when (?| is not used) they
   5323               are in order of their numbers. */
   5324 
   5325               if (crc < 0)
   5326                 {
   5327                 memmove(slot + cd->name_entry_size, slot,
   5328                   (cd->names_found - i) * cd->name_entry_size);
   5329                 break;
   5330                 }
   5331 
   5332               /* Continue the loop for a later or duplicate name */
   5333 
   5334               slot += cd->name_entry_size;
   5335               }
   5336 
   5337             /* For non-duplicate names, check for a duplicate number before
   5338             adding the new name. */
   5339 
   5340             if (!dupname)
   5341               {
   5342               uschar *cslot = cd->name_table;
   5343               for (i = 0; i < cd->names_found; i++)
   5344                 {
   5345                 if (cslot != slot)
   5346                   {
   5347                   if (GET2(cslot, 0) == cd->bracount + 1)
   5348                     {
   5349                     *errorcodeptr = ERR65;
   5350                     goto FAILED;
   5351                     }
   5352                   }
   5353                 else i--;
   5354                 cslot += cd->name_entry_size;
   5355                 }
   5356               }
   5357 
   5358             PUT2(slot, 0, cd->bracount + 1);
   5359             memcpy(slot + 2, name, namelen);
   5360             slot[2+namelen] = 0;
   5361             }
   5362           }
   5363 
   5364         /* In both pre-compile and compile, count the number of names we've
   5365         encountered. */
   5366 
   5367         cd->names_found++;
   5368         ptr++;                    /* Move past > or ' */
   5369         goto NUMBERED_GROUP;
   5370 
   5371 
   5372         /* ------------------------------------------------------------ */
   5373         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
   5374         terminator = CHAR_RIGHT_PARENTHESIS;
   5375         is_recurse = TRUE;
   5376         /* Fall through */
   5377 
   5378         /* We come here from the Python syntax above that handles both
   5379         references (?P=name) and recursion (?P>name), as well as falling
   5380         through from the Perl recursion syntax (?&name). We also come here from
   5381         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
   5382         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
   5383 
   5384         NAMED_REF_OR_RECURSE:
   5385         name = ++ptr;
   5386         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
   5387         namelen = (int)(ptr - name);
   5388 
   5389         /* In the pre-compile phase, do a syntax check. We used to just set
   5390         a dummy reference number, because it was not used in the first pass.
   5391         However, with the change of recursive back references to be atomic,
   5392         we have to look for the number so that this state can be identified, as
   5393         otherwise the incorrect length is computed. If it's not a backwards
   5394         reference, the dummy number will do. */