Home | History | Annotate | Download | only in pcre
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language (but see
      7 below for why this module is different).
      8 
      9                        Written by Philip Hazel
     10            Copyright (c) 1997-2010 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 
     42 /* This module contains the external function pcre_dfa_exec(), which is an
     43 alternative matching function that uses a sort of DFA algorithm (not a true
     44 FSM). This is NOT Perl-compatible, but it has advantages in certain
     45 applications. */
     46 
     47 
     48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
     49 the performance of his patterns greatly. I could not use it as it stood, as it
     50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
     51 test 7 to loop, and test 9 to crash with a segfault.
     52 
     53 The issue is the check for duplicate states, which is done by a simple linear
     54 search up the state list. (Grep for "duplicate" below to find the code.) For
     55 many patterns, there will never be many states active at one time, so a simple
     56 linear search is fine. In patterns that have many active states, it might be a
     57 bottleneck. The suggested code used an indexing scheme to remember which states
     58 had previously been used for each character, and avoided the linear search when
     59 it knew there was no chance of a duplicate. This was implemented when adding
     60 states to the state lists.
     61 
     62 I wrote some thread-safe, not-limited code to try something similar at the time
     63 of checking for duplicates (instead of when adding states), using index vectors
     64 on the stack. It did give a 13% improvement with one specially constructed
     65 pattern for certain subject strings, but on other strings and on many of the
     66 simpler patterns in the test suite it did worse. The major problem, I think,
     67 was the extra time to initialize the index. This had to be done for each call
     68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
     69 only once - I suspect this was the cause of the problems with the tests.)
     70 
     71 Overall, I concluded that the gains in some cases did not outweigh the losses
     72 in others, so I abandoned this code. */
     73 
     74 
     75 
     76 #ifdef HAVE_CONFIG_H
     77 #include "config.h"
     78 #endif
     79 
     80 #define NLBLOCK md             /* Block containing newline information */
     81 #define PSSTART start_subject  /* Field containing processed string start */
     82 #define PSEND   end_subject    /* Field containing processed string end */
     83 
     84 #include "pcre_internal.h"
     85 
     86 
     87 /* For use to indent debugging output */
     88 
     89 #define SP "                   "
     90 
     91 
     92 /*************************************************
     93 *      Code parameters and static tables         *
     94 *************************************************/
     95 
     96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
     97 into others, under special conditions. A gap of 20 between the blocks should be
     98 enough. The resulting opcodes don't have to be less than 256 because they are
     99 never stored, so we push them well clear of the normal opcodes. */
    100 
    101 #define OP_PROP_EXTRA       300
    102 #define OP_EXTUNI_EXTRA     320
    103 #define OP_ANYNL_EXTRA      340
    104 #define OP_HSPACE_EXTRA     360
    105 #define OP_VSPACE_EXTRA     380
    106 
    107 
    108 /* This table identifies those opcodes that are followed immediately by a
    109 character that is to be tested in some way. This makes it possible to
    110 centralize the loading of these characters. In the case of Type * etc, the
    111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
    112 small value. Non-zero values in the table are the offsets from the opcode where
    113 the character is to be found. ***NOTE*** If the start of this table is
    114 modified, the three tables that follow must also be modified. */
    115 
    116 static const uschar coptable[] = {
          /* Indexed by opcode value. A non-zero entry is the byte offset from
          the opcode at which the inline character (or, for the Type repeats,
          the one-byte type opcode) is stored; 0 means nothing is loaded. The
          entries of 3 presumably step over a two-byte repeat count that
          precedes the character -- confirm against the opcode layout in
          pcre_internal.h. */
    117   0,                             /* End                                    */
    118   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
    119   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
    120   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
    121   0, 0,                          /* \P, \p                                 */
    122   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
    123   0,                             /* \X                                     */
    124   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
    125   1,                             /* Char                                   */
    126   1,                             /* Charnc                                 */
    127   1,                             /* not                                    */
    128   /* Positive single-char repeats                                          */
    129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
    130   3, 3, 3,                       /* upto, minupto, exact                   */
    131   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
    132   /* Negative single-char repeats - only for chars < 256                   */
    133   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
    134   3, 3, 3,                       /* NOT upto, minupto, exact               */
    135   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
    136   /* Positive type repeats                                                 */
    137   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
    138   3, 3, 3,                       /* Type upto, minupto, exact              */
    139   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
    140   /* Character class & ref repeats                                         */
    141   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
    142   0, 0,                          /* CRRANGE, CRMINRANGE                    */
    143   0,                             /* CLASS                                  */
    144   0,                             /* NCLASS                                 */
    145   0,                             /* XCLASS - variable length               */
    146   0,                             /* REF                                    */
    147   0,                             /* RECURSE                                */
    148   0,                             /* CALLOUT                                */
    149   0,                             /* Alt                                    */
    150   0,                             /* Ket                                    */
    151   0,                             /* KetRmax                                */
    152   0,                             /* KetRmin                                */
    153   0,                             /* Assert                                 */
    154   0,                             /* Assert not                             */
    155   0,                             /* Assert behind                          */
    156   0,                             /* Assert behind not                      */
    157   0,                             /* Reverse                                */
    158   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
    159   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
    160   0, 0,                          /* CREF, NCREF                            */
    161   0, 0,                          /* RREF, NRREF                            */
    162   0,                             /* DEF                                    */
    163   0, 0,                          /* BRAZERO, BRAMINZERO                    */
    164   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
    165   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
    166   0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
    167 };
    168 
    169 /* This table identifies those opcodes that inspect a character. It is used to
    170 remember the fact that a character could have been inspected when the end of
    171 the subject is reached. ***NOTE*** If the start of this table is modified, the
    172 two tables that follow must also be modified. */
    173 
    174 static const uschar poptable[] = {
          /* Indexed by opcode value. A non-zero entry marks an opcode that
          inspects a subject character. The matcher consults the entry only
          when the end of the subject has been reached, to record that matching
          could have continued; that record is used when testing for a partial
          match. */
    175   0,                             /* End                                    */
    176   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
    177   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
    178   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
    179   1, 1,                          /* \P, \p                                 */
    180   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
    181   1,                             /* \X                                     */
    182   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
    183   1,                             /* Char                                   */
    184   1,                             /* Charnc                                 */
    185   1,                             /* not                                    */
    186   /* Positive single-char repeats                                          */
    187   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
    188   1, 1, 1,                       /* upto, minupto, exact                   */
    189   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
    190   /* Negative single-char repeats - only for chars < 256                   */
    191   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
    192   1, 1, 1,                       /* NOT upto, minupto, exact               */
    193   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
    194   /* Positive type repeats                                                 */
    195   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
    196   1, 1, 1,                       /* Type upto, minupto, exact              */
    197   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
    198   /* Character class & ref repeats                                         */
    199   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
    200   1, 1,                          /* CRRANGE, CRMINRANGE                    */
    201   1,                             /* CLASS                                  */
    202   1,                             /* NCLASS                                 */
    203   1,                             /* XCLASS - variable length               */
    204   0,                             /* REF                                    */
    205   0,                             /* RECURSE                                */
    206   0,                             /* CALLOUT                                */
    207   0,                             /* Alt                                    */
    208   0,                             /* Ket                                    */
    209   0,                             /* KetRmax                                */
    210   0,                             /* KetRmin                                */
    211   0,                             /* Assert                                 */
    212   0,                             /* Assert not                             */
    213   0,                             /* Assert behind                          */
    214   0,                             /* Assert behind not                      */
    215   0,                             /* Reverse                                */
    216   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
    217   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
    218   0, 0,                          /* CREF, NCREF                            */
    219   0, 0,                          /* RREF, NRREF                            */
    220   0,                             /* DEF                                    */
    221   0, 0,                          /* BRAZERO, BRAMINZERO                    */
    222   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
    223   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
    224   0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
    225 };
    226 
    227 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
    228 and \w */
    229 
    230 static const uschar toptable1[] = {
          /* Covers only the first 14 opcodes (End through AllAny), in the same
          order as the start of coptable. The rows for \D through \w hold the
          ctype bit of the class concerned, the same value for the negated and
          the plain opcode; every other entry is 0. NOTE(review): presumably
          this is the mask applied to a character's ctypes entry at the point
          of use, which is outside this excerpt -- confirm there. */
    231   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
    232   ctype_digit, ctype_digit,       /* \D, \d */
    233   ctype_space, ctype_space,       /* \S, \s */
    234   ctype_word,  ctype_word,        /* \W, \w */
    235   0, 0                            /* OP_ANY, OP_ALLANY */
    236 };
    237 
    238 static const uschar toptable2[] = {
          /* Covers the same 14 opcodes as toptable1, in the same order. The
          negated opcodes (\D, \S, \W) carry the ctype bit, the plain ones
          (\d, \s, \w) carry 0, and OP_ANY/OP_ALLANY carry 1. NOTE(review):
          presumably this value is XORed with the masked ctypes bits so that
          the negated classes invert the test and Any/AllAny always succeed;
          the use is outside this excerpt -- confirm there. */
    239   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
    240   ctype_digit, 0,                 /* \D, \d */
    241   ctype_space, 0,                 /* \S, \s */
    242   ctype_word,  0,                 /* \W, \w */
    243   1, 1                            /* OP_ANY, OP_ALLANY */
    244 };
    245 
    246 
    /* One entry in a state list: the data for a single active path through the
    match tree. Every member must be an int, because the state lists live inside
    the caller-supplied workspace, which is a plain vector of ints. */

    typedef struct stateblock {
      int offset;                     /* Offset of the opcode from the start of
                                         the compiled code; a negative value is
                                         a held-off state (see data) */
      int count;                      /* Count for repeats */
      int ims;                        /* ims flag bits in force for this state */
      int data;                       /* Extra data for some states; for a
                                         negative offset, the number of
                                         characters still to be skipped */
    } stateblock;

    /* Number of workspace ints occupied by one stateblock. The value is cast to
    int so that arithmetic mixing it with the signed workspace count stays
    signed: with a bare size_t, an expression such as
    wscount % (INTS_PER_STATEBLOCK * 2) converts a negative wscount to a huge
    unsigned value and yields a bogus slot count. The expansion is fully
    parenthesised so that it can be used safely inside any expression. */

    #define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
    260 
    261 
    262 #ifdef PCRE_DEBUG
    263 /*************************************************
    264 *             Print character string             *
    265 *************************************************/
    266 
    /* Debugging aid: write a byte string to a stream in readable form. Each
    byte that isprint() accepts is written unchanged; any other byte is written
    as a backslash, the letter x, and two lower-case hexadecimal digits.

    Arguments:
      p            first byte of the string (it need not be zero-terminated)
      length       number of bytes to write; nothing is written if it is <= 0
      f            stream to write to

    Returns:       nothing
    */

    static void
    pchars(unsigned char *p, int length, FILE *f)
    {
    int remaining;
    for (remaining = length; remaining > 0; remaining--, p++)
      {
      int byte = *p;
      if (!isprint(byte))
        fprintf(f, "\\x%02x", byte);
      else
        fprintf(f, "%c", byte);
      }
    }
    289 #endif
    290 
    291 
    292 
    293 /*************************************************
    294 *    Execute a Regular Expression - DFA engine   *
    295 *************************************************/
    296 
    297 /* This internal function applies a compiled pattern to a subject string,
    298 starting at a given point, using a DFA engine. This function is called from the
    299 external one, possibly multiple times if the pattern is not anchored. The
    300 function calls itself recursively for some kinds of subpattern.
    301 
    302 Arguments:
    303   md                the match_data block with fixed information
    304   this_start_code   the opening bracket of this subexpression's code
    305   current_subject   where we currently are in the subject string
    306   start_offset      start offset in the subject string
    307   offsets           vector to contain the matching string offsets
    308   offsetcount       size of same
    309   workspace         vector of workspace
    310   wscount           size of same
    311   ims               the current ims flags
    312   rlevel            function call recursion level
    313   recursing         regex recursive call level
    314 
    315 Returns:            > 0 => number of match offset pairs placed in offsets
    316                     = 0 => offsets overflowed; longest matches are present
    317                      -1 => failed to match
    318                    < -1 => some kind of unexpected problem
    319 
    320 The following macros are used for adding states to the two state vectors (one
    321 for the current character, one for the following character). */
    322 
    /* Append a state to the active list. (x) is stored as the state's offset
    and (y) as its count; ims is copied from the caller's variable of that name
    and the data field is not written. active_count is incremented whether or
    not there is room: if it had already reached wscount, the enclosing function
    returns PCRE_ERROR_DFA_WSSIZE and nothing is stored. Use as a statement
    followed by a semicolon. */

    #define ADD_ACTIVE(x,y) \
      do \
        { \
        if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
        next_active_state->offset = (x); \
        next_active_state->count  = (y); \
        next_active_state->ims    = ims; \
        next_active_state++; \
        DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
        } \
      while (0)
    333 
    /* Append a state to the active list, including its data field. (x) is
    stored as the offset, (y) as the count and (z) as the data; ims is copied
    from the caller's variable of that name. active_count is incremented
    whether or not there is room: if it had already reached wscount, the
    enclosing function returns PCRE_ERROR_DFA_WSSIZE and nothing is stored. Use
    as a statement followed by a semicolon. */

    #define ADD_ACTIVE_DATA(x,y,z) \
      do \
        { \
        if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
        next_active_state->offset = (x); \
        next_active_state->count  = (y); \
        next_active_state->ims    = ims; \
        next_active_state->data   = (z); \
        next_active_state++; \
        DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
        } \
      while (0)
    345 
    /* Append a state to the new list (the states to be considered for the
    following character). (x) is stored as the state's offset and (y) as its
    count; ims is copied from the caller's variable of that name and the data
    field is not written. new_count is incremented whether or not there is
    room: if it had already reached wscount, the enclosing function returns
    PCRE_ERROR_DFA_WSSIZE and nothing is stored. Use as a statement followed by
    a semicolon. */

    #define ADD_NEW(x,y) \
      do \
        { \
        if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
        next_new_state->offset = (x); \
        next_new_state->count  = (y); \
        next_new_state->ims    = ims; \
        next_new_state++; \
        DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
        } \
      while (0)
    356 
    /* Append a state to the new list (the states to be considered for the
    following character), including its data field. (x) is stored as the
    offset, (y) as the count and (z) as the data; ims is copied from the
    caller's variable of that name. new_count is incremented whether or not
    there is room: if it had already reached wscount, the enclosing function
    returns PCRE_ERROR_DFA_WSSIZE and nothing is stored. Use as a statement
    followed by a semicolon. */

    #define ADD_NEW_DATA(x,y,z) \
      do \
        { \
        if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
        next_new_state->offset = (x); \
        next_new_state->count  = (y); \
        next_new_state->ims    = ims; \
        next_new_state->data   = (z); \
        next_new_state++; \
        DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
        } \
      while (0)
    368 
    369 /* And now, here is the code */
    370 
    371 static int
    372 internal_dfa_exec(
    373   dfa_match_data *md,
    374   const uschar *this_start_code,
    375   const uschar *current_subject,
    376   int start_offset,
    377   int *offsets,
    378   int offsetcount,
    379   int *workspace,
    380   int wscount,
    381   int ims,
    382   int  rlevel,
    383   int  recursing)
    384 {
    385 stateblock *active_states, *new_states, *temp_states;
    386 stateblock *next_active_state, *next_new_state;
    387 
    388 const uschar *ctypes, *lcc, *fcc;
    389 const uschar *ptr;
    390 const uschar *end_code, *first_op;
    391 
    392 int active_count, new_count, match_count;
    393 
    394 /* Some fields in the md block are frequently referenced, so we load them into
    395 independent variables in the hope that this will perform better. */
    396 
    397 const uschar *start_subject = md->start_subject;
    398 const uschar *end_subject = md->end_subject;
    399 const uschar *start_code = md->start_code;
    400 
    401 #ifdef SUPPORT_UTF8
    402 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
    403 #else
    404 BOOL utf8 = FALSE;
    405 #endif
    406 
    407 rlevel++;
    408 offsetcount &= (-2);
    409 
    410 wscount -= 2;
    411 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
    412           (2 * INTS_PER_STATEBLOCK);
    413 
    414 DPRINTF(("\n%.*s---------------------\n"
    415   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
    416   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
    417 
    418 ctypes = md->tables + ctypes_offset;
    419 lcc = md->tables + lcc_offset;
    420 fcc = md->tables + fcc_offset;
    421 
    422 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
    423 
    424 active_states = (stateblock *)(workspace + 2);
    425 next_new_state = new_states = active_states + wscount;
    426 new_count = 0;
    427 
    428 first_op = this_start_code + 1 + LINK_SIZE +
    429   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
    430 
    431 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
    432 the alternative states onto the list, and find out where the end is. This
    433 makes is possible to use this function recursively, when we want to stop at a
    434 matching internal ket rather than at the end.
    435 
    436 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
    437 a backward assertion. In that case, we have to find out the maximum amount to
    438 move back, and set up each alternative appropriately. */
    439 
    440 if (*first_op == OP_REVERSE)
    441   {
    442   int max_back = 0;
    443   int gone_back;
    444 
    445   end_code = this_start_code;
    446   do
    447     {
    448     int back = GET(end_code, 2+LINK_SIZE);
    449     if (back > max_back) max_back = back;
    450     end_code += GET(end_code, 1);
    451     }
    452   while (*end_code == OP_ALT);
    453 
    454   /* If we can't go back the amount required for the longest lookbehind
    455   pattern, go back as far as we can; some alternatives may still be viable. */
    456 
    457 #ifdef SUPPORT_UTF8
    458   /* In character mode we have to step back character by character */
    459 
    460   if (utf8)
    461     {
    462     for (gone_back = 0; gone_back < max_back; gone_back++)
    463       {
    464       if (current_subject <= start_subject) break;
    465       current_subject--;
    466       while (current_subject > start_subject &&
    467              (*current_subject & 0xc0) == 0x80)
    468         current_subject--;
    469       }
    470     }
    471   else
    472 #endif
    473 
    474   /* In byte-mode we can do this quickly. */
    475 
    476     {
    477     gone_back = (current_subject - max_back < start_subject)?
    478       (int)(current_subject - start_subject) : max_back;
    479     current_subject -= gone_back;
    480     }
    481 
    482   /* Save the earliest consulted character */
    483 
    484   if (current_subject < md->start_used_ptr)
    485     md->start_used_ptr = current_subject;
    486 
    487   /* Now we can process the individual branches. */
    488 
    489   end_code = this_start_code;
    490   do
    491     {
    492     int back = GET(end_code, 2+LINK_SIZE);
    493     if (back <= gone_back)
    494       {
    495       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
    496       ADD_NEW_DATA(-bstate, 0, gone_back - back);
    497       }
    498     end_code += GET(end_code, 1);
    499     }
    500   while (*end_code == OP_ALT);
    501  }
    502 
    503 /* This is the code for a "normal" subpattern (not a backward assertion). The
    504 start of a whole pattern is always one of these. If we are at the top level,
    505 we may be asked to restart matching from the same point that we reached for a
    506 previous partial match. We still have to scan through the top-level branches to
    507 find the end state. */
    508 
    509 else
    510   {
    511   end_code = this_start_code;
    512 
    513   /* Restarting */
    514 
    515   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
    516     {
    517     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
    518     new_count = workspace[1];
    519     if (!workspace[0])
    520       memcpy(new_states, active_states, new_count * sizeof(stateblock));
    521     }
    522 
    523   /* Not restarting */
    524 
    525   else
    526     {
    527     int length = 1 + LINK_SIZE +
    528       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
    529     do
    530       {
    531       ADD_NEW((int)(end_code - start_code + length), 0);
    532       end_code += GET(end_code, 1);
    533       length = 1 + LINK_SIZE;
    534       }
    535     while (*end_code == OP_ALT);
    536     }
    537   }
    538 
    539 workspace[0] = 0;    /* Bit indicating which vector is current */
    540 
    541 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
    542 
    543 /* Loop for scanning the subject */
    544 
    545 ptr = current_subject;
    546 for (;;)
    547   {
    548   int i, j;
    549   int clen, dlen;
    550   unsigned int c, d;
    551   int forced_fail = 0;
    552   BOOL could_continue = FALSE;
    553 
    554   /* Make the new state list into the active state list and empty the
    555   new state list. */
    556 
    557   temp_states = active_states;
    558   active_states = new_states;
    559   new_states = temp_states;
    560   active_count = new_count;
    561   new_count = 0;
    562 
    563   workspace[0] ^= 1;              /* Remember for the restarting feature */
    564   workspace[1] = active_count;
    565 
    566 #ifdef PCRE_DEBUG
    567   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
    568   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
    569   printf("\"\n");
    570 
    571   printf("%.*sActive states: ", rlevel*2-2, SP);
    572   for (i = 0; i < active_count; i++)
    573     printf("%d/%d ", active_states[i].offset, active_states[i].count);
    574   printf("\n");
    575 #endif
    576 
    577   /* Set the pointers for adding new states */
    578 
    579   next_active_state = active_states + active_count;
    580   next_new_state = new_states;
    581 
    582   /* Load the current character from the subject outside the loop, as many
    583   different states may want to look at it, and we assume that at least one
    584   will. */
    585 
    586   if (ptr < end_subject)
    587     {
    588     clen = 1;        /* Number of bytes in the character */
    589 #ifdef SUPPORT_UTF8
    590     if (utf8) { GETCHARLEN(c, ptr, clen); } else
    591 #endif  /* SUPPORT_UTF8 */
    592     c = *ptr;
    593     }
    594   else
    595     {
    596     clen = 0;        /* This indicates the end of the subject */
    597     c = NOTACHAR;    /* This value should never actually be used */
    598     }
    599 
    600   /* Scan up the active states and act on each one. The result of an action
    601   may be to add more states to the currently active list (e.g. on hitting a
    602   parenthesis) or it may be to put states on the new list, for considering
    603   when we move the character pointer on. */
    604 
    605   for (i = 0; i < active_count; i++)
    606     {
    607     stateblock *current_state = active_states + i;
    608     const uschar *code;
    609     int state_offset = current_state->offset;
    610     int count, codevalue, rrc;
    611 
    612 #ifdef PCRE_DEBUG
    613     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
    614     if (clen == 0) printf("EOL\n");
    615       else if (c > 32 && c < 127) printf("'%c'\n", c);
    616         else printf("0x%02x\n", c);
    617 #endif
    618 
    619     /* This variable is referred to implicity in the ADD_xxx macros. */
    620 
    621     ims = current_state->ims;
    622 
    623     /* A negative offset is a special case meaning "hold off going to this
    624     (negated) state until the number of characters in the data field have
    625     been skipped". */
    626 
    627     if (state_offset < 0)
    628       {
    629       if (current_state->data > 0)
    630         {
    631         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
    632         ADD_NEW_DATA(state_offset, current_state->count,
    633           current_state->data - 1);
    634         continue;
    635         }
    636       else
    637         {
    638         current_state->offset = state_offset = -state_offset;
    639         }
    640       }
    641 
    642     /* Check for a duplicate state with the same count, and skip if found.
    643     See the note at the head of this module about the possibility of improving
    644     performance here. */
    645 
    646     for (j = 0; j < i; j++)
    647       {
    648       if (active_states[j].offset == state_offset &&
    649           active_states[j].count == current_state->count)
    650         {
    651         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
    652         goto NEXT_ACTIVE_STATE;
    653         }
    654       }
    655 
    656     /* The state offset is the offset to the opcode */
    657 
    658     code = start_code + state_offset;
    659     codevalue = *code;
    660 
    661     /* If this opcode inspects a character, but we are at the end of the
    662     subject, remember the fact for use when testing for a partial match. */
    663 
    664     if (clen == 0 && poptable[codevalue] != 0)
    665       could_continue = TRUE;
    666 
    667     /* If this opcode is followed by an inline character, load it. It is
    668     tempting to test for the presence of a subject character here, but that
    669     is wrong, because sometimes zero repetitions of the subject are
    670     permitted.
    671 
    672     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
    673     argument that is not a data character - but is always one byte long. We
    674     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
    675     this case. To keep the other cases fast, convert these ones to new opcodes.
    676     */
    677 
    678     if (coptable[codevalue] > 0)
    679       {
    680       dlen = 1;
    681 #ifdef SUPPORT_UTF8
    682       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
    683 #endif  /* SUPPORT_UTF8 */
    684       d = code[coptable[codevalue]];
    685       if (codevalue >= OP_TYPESTAR)
    686         {
    687         switch(d)
    688           {
    689           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
    690           case OP_NOTPROP:
    691           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
    692           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
    693           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
    694           case OP_NOT_HSPACE:
    695           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
    696           case OP_NOT_VSPACE:
    697           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
    698           default: break;
    699           }
    700         }
    701       }
    702     else
    703       {
    704       dlen = 0;         /* Not strictly necessary, but compilers moan */
    705       d = NOTACHAR;     /* if these variables are not set. */
    706       }
    707 
    708 
    709     /* Now process the individual opcodes */
    710 
    711     switch (codevalue)
    712       {
    713 /* ========================================================================== */
    714       /* These cases are never obeyed. This is a fudge that causes a compile-
    715       time error if the vectors coptable or poptable, which are indexed by
    716       opcode, are not the correct length. It seems to be the only way to do
    717       such a check at compile time, as the sizeof() operator does not work
    718       in the C preprocessor. */
    719 
    720       case OP_TABLE_LENGTH:
    721       case OP_TABLE_LENGTH +
    722         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
    723          (sizeof(poptable) == OP_TABLE_LENGTH)):
    724       break;
    725 
    726 /* ========================================================================== */
    727       /* Reached a closing bracket. If not at the end of the pattern, carry
    728       on with the next opcode. Otherwise, unless we have an empty string and
    729       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
    730       start of the subject, save the match data, shifting up all previous
    731       matches so we always have the longest first. */
    732 
    733       case OP_KET:
    734       case OP_KETRMIN:
    735       case OP_KETRMAX:
    736       if (code != end_code)
    737         {
    738         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
    739         if (codevalue != OP_KET)
    740           {
    741           ADD_ACTIVE(state_offset - GET(code, 1), 0);
    742           }
    743         }
    744       else
    745         {
    746         if (ptr > current_subject ||
    747             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
    748               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
    749                 current_subject > start_subject + md->start_offset)))
    750           {
    751           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
    752             else if (match_count > 0 && ++match_count * 2 >= offsetcount)
    753               match_count = 0;
    754           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
    755           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
    756           if (offsetcount >= 2)
    757             {
    758             offsets[0] = (int)(current_subject - start_subject);
    759             offsets[1] = (int)(ptr - start_subject);
    760             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
    761               offsets[1] - offsets[0], current_subject));
    762             }
    763           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
    764             {
    765             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
    766               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
    767               match_count, rlevel*2-2, SP));
    768             return match_count;
    769             }
    770           }
    771         }
    772       break;
    773 
    774 /* ========================================================================== */
    775       /* These opcodes add to the current list of states without looking
    776       at the current character. */
    777 
    778       /*-----------------------------------------------------------------*/
    779       case OP_ALT:
    780       do { code += GET(code, 1); } while (*code == OP_ALT);
    781       ADD_ACTIVE((int)(code - start_code), 0);
    782       break;
    783 
    784       /*-----------------------------------------------------------------*/
    785       case OP_BRA:
    786       case OP_SBRA:
    787       do
    788         {
    789         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
    790         code += GET(code, 1);
    791         }
    792       while (*code == OP_ALT);
    793       break;
    794 
    795       /*-----------------------------------------------------------------*/
    796       case OP_CBRA:
    797       case OP_SCBRA:
    798       ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);
    799       code += GET(code, 1);
    800       while (*code == OP_ALT)
    801         {
    802         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
    803         code += GET(code, 1);
    804         }
    805       break;
    806 
    807       /*-----------------------------------------------------------------*/
    808       case OP_BRAZERO:
    809       case OP_BRAMINZERO:
    810       ADD_ACTIVE(state_offset + 1, 0);
    811       code += 1 + GET(code, 2);
    812       while (*code == OP_ALT) code += GET(code, 1);
    813       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
    814       break;
    815 
    816       /*-----------------------------------------------------------------*/
    817       case OP_SKIPZERO:
    818       code += 1 + GET(code, 2);
    819       while (*code == OP_ALT) code += GET(code, 1);
    820       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
    821       break;
    822 
    823       /*-----------------------------------------------------------------*/
    824       case OP_CIRC:
    825       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
    826           ((ims & PCRE_MULTILINE) != 0 &&
    827             ptr != end_subject &&
    828             WAS_NEWLINE(ptr)))
    829         { ADD_ACTIVE(state_offset + 1, 0); }
    830       break;
    831 
    832       /*-----------------------------------------------------------------*/
    833       case OP_EOD:
    834       if (ptr >= end_subject)
    835         {
    836         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
    837           could_continue = TRUE;
    838         else { ADD_ACTIVE(state_offset + 1, 0); }
    839         }
    840       break;
    841 
    842       /*-----------------------------------------------------------------*/
    843       case OP_OPT:
    844       ims = code[1];
    845       ADD_ACTIVE(state_offset + 2, 0);
    846       break;
    847 
    848       /*-----------------------------------------------------------------*/
    849       case OP_SOD:
    850       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
    851       break;
    852 
    853       /*-----------------------------------------------------------------*/
    854       case OP_SOM:
    855       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
    856       break;
    857 
    858 
    859 /* ========================================================================== */
    860       /* These opcodes inspect the next subject character, and sometimes
    861       the previous one as well, but do not have an argument. The variable
    862       clen contains the length of the current character and is zero if we are
    863       at the end of the subject. */
    864 
    865       /*-----------------------------------------------------------------*/
    866       case OP_ANY:
    867       if (clen > 0 && !IS_NEWLINE(ptr))
    868         { ADD_NEW(state_offset + 1, 0); }
    869       break;
    870 
    871       /*-----------------------------------------------------------------*/
    872       case OP_ALLANY:
    873       if (clen > 0)
    874         { ADD_NEW(state_offset + 1, 0); }
    875       break;
    876 
    877       /*-----------------------------------------------------------------*/
    878       case OP_EODN:
    879       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
    880         could_continue = TRUE;
    881       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
    882         { ADD_ACTIVE(state_offset + 1, 0); }
    883       break;
    884 
    885       /*-----------------------------------------------------------------*/
    886       case OP_DOLL:
    887       if ((md->moptions & PCRE_NOTEOL) == 0)
    888         {
    889         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
    890           could_continue = TRUE;
    891         else if (clen == 0 ||
    892             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
    893                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
    894             ))
    895           { ADD_ACTIVE(state_offset + 1, 0); }
    896         }
    897       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
    898         { ADD_ACTIVE(state_offset + 1, 0); }
    899       break;
    900 
    901       /*-----------------------------------------------------------------*/
    902 
    903       case OP_DIGIT:
    904       case OP_WHITESPACE:
    905       case OP_WORDCHAR:
    906       if (clen > 0 && c < 256 &&
    907             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
    908         { ADD_NEW(state_offset + 1, 0); }
    909       break;
    910 
    911       /*-----------------------------------------------------------------*/
    912       case OP_NOT_DIGIT:
    913       case OP_NOT_WHITESPACE:
    914       case OP_NOT_WORDCHAR:
    915       if (clen > 0 && (c >= 256 ||
    916             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
    917         { ADD_NEW(state_offset + 1, 0); }
    918       break;
    919 
    920       /*-----------------------------------------------------------------*/
    921       case OP_WORD_BOUNDARY:
    922       case OP_NOT_WORD_BOUNDARY:
    923         {
    924         int left_word, right_word;
    925 
    926         if (ptr > start_subject)
    927           {
    928           const uschar *temp = ptr - 1;
    929           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
    930 #ifdef SUPPORT_UTF8
    931           if (utf8) BACKCHAR(temp);
    932 #endif
    933           GETCHARTEST(d, temp);
    934 #ifdef SUPPORT_UCP
    935           if ((md->poptions & PCRE_UCP) != 0)
    936             {
    937             if (d == '_') left_word = TRUE; else
    938               {
    939               int cat = UCD_CATEGORY(d);
    940               left_word = (cat == ucp_L || cat == ucp_N);
    941               }
    942             }
    943           else
    944 #endif
    945           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
    946           }
    947         else left_word = FALSE;
    948 
    949         if (clen > 0)
    950           {
    951 #ifdef SUPPORT_UCP
    952           if ((md->poptions & PCRE_UCP) != 0)
    953             {
    954             if (c == '_') right_word = TRUE; else
    955               {
    956               int cat = UCD_CATEGORY(c);
    957               right_word = (cat == ucp_L || cat == ucp_N);
    958               }
    959             }
    960           else
    961 #endif
    962           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
    963           }
    964         else right_word = FALSE;
    965 
    966         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
    967           { ADD_ACTIVE(state_offset + 1, 0); }
    968         }
    969       break;
    970 
    971 
    972       /*-----------------------------------------------------------------*/
    973       /* Check the next character by Unicode property. We will get here only
    974       if the support is in the binary; otherwise a compile-time error occurs.
    975       */
    976 
    977 #ifdef SUPPORT_UCP
    978       case OP_PROP:
    979       case OP_NOTPROP:
    980       if (clen > 0)
    981         {
    982         BOOL OK;
    983         const ucd_record * prop = GET_UCD(c);
    984         switch(code[1])
    985           {
    986           case PT_ANY:
    987           OK = TRUE;
    988           break;
    989 
    990           case PT_LAMP:
    991           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
    992                prop->chartype == ucp_Lt;
    993           break;
    994 
    995           case PT_GC:
    996           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
    997           break;
    998 
    999           case PT_PC:
   1000           OK = prop->chartype == code[2];
   1001           break;
   1002 
   1003           case PT_SC:
   1004           OK = prop->script == code[2];
   1005           break;
   1006 
   1007           /* These are specials for combination cases. */
   1008 
   1009           case PT_ALNUM:
   1010           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1011                _pcre_ucp_gentype[prop->chartype] == ucp_N;
   1012           break;
   1013 
   1014           case PT_SPACE:    /* Perl space */
   1015           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1016                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
   1017           break;
   1018 
   1019           case PT_PXSPACE:  /* POSIX space */
   1020           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1021                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
   1022                c == CHAR_FF || c == CHAR_CR;
   1023           break;
   1024 
   1025           case PT_WORD:
   1026           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1027                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
   1028                c == CHAR_UNDERSCORE;
   1029           break;
   1030 
   1031           /* Should never occur, but keep compilers from grumbling. */
   1032 
   1033           default:
   1034           OK = codevalue != OP_PROP;
   1035           break;
   1036           }
   1037 
   1038         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
   1039         }
   1040       break;
   1041 #endif
   1042 
   1043 
   1044 
   1045 /* ========================================================================== */
   1046       /* These opcodes likewise inspect the subject character, but have an
   1047       argument that is not a data character. It is one of these opcodes:
   1048       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
   1049       OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. It is loaded into d. */
   1050 
   1051       case OP_TYPEPLUS:
   1052       case OP_TYPEMINPLUS:
   1053       case OP_TYPEPOSPLUS:
   1054       count = current_state->count;  /* Already matched */
   1055       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
   1056       if (clen > 0)
   1057         {
   1058         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
   1059             (c < 256 &&
   1060               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
   1061               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
   1062           {
   1063           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
   1064             {
   1065             active_count--;            /* Remove non-match possibility */
   1066             next_active_state--;
   1067             }
   1068           count++;
   1069           ADD_NEW(state_offset, count);
   1070           }
   1071         }
   1072       break;
   1073 
   1074       /*-----------------------------------------------------------------*/
   1075       case OP_TYPEQUERY:
   1076       case OP_TYPEMINQUERY:
   1077       case OP_TYPEPOSQUERY:
   1078       ADD_ACTIVE(state_offset + 2, 0);
   1079       if (clen > 0)
   1080         {
   1081         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
   1082             (c < 256 &&
   1083               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
   1084               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
   1085           {
   1086           if (codevalue == OP_TYPEPOSQUERY)
   1087             {
   1088             active_count--;            /* Remove non-match possibility */
   1089             next_active_state--;
   1090             }
   1091           ADD_NEW(state_offset + 2, 0);
   1092           }
   1093         }
   1094       break;
   1095 
   1096       /*-----------------------------------------------------------------*/
   1097       case OP_TYPESTAR:
   1098       case OP_TYPEMINSTAR:
   1099       case OP_TYPEPOSSTAR:
   1100       ADD_ACTIVE(state_offset + 2, 0);
   1101       if (clen > 0)
   1102         {
   1103         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
   1104             (c < 256 &&
   1105               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
   1106               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
   1107           {
   1108           if (codevalue == OP_TYPEPOSSTAR)
   1109             {
   1110             active_count--;            /* Remove non-match possibility */
   1111             next_active_state--;
   1112             }
   1113           ADD_NEW(state_offset, 0);
   1114           }
   1115         }
   1116       break;
   1117 
   1118       /*-----------------------------------------------------------------*/
   1119       case OP_TYPEEXACT:
   1120       count = current_state->count;  /* Number already matched */
   1121       if (clen > 0)
   1122         {
   1123         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
   1124             (c < 256 &&
   1125               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
   1126               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
   1127           {
   1128           if (++count >= GET2(code, 1))
   1129             { ADD_NEW(state_offset + 4, 0); }
   1130           else
   1131             { ADD_NEW(state_offset, count); }
   1132           }
   1133         }
   1134       break;
   1135 
   1136       /*-----------------------------------------------------------------*/
   1137       case OP_TYPEUPTO:
   1138       case OP_TYPEMINUPTO:
   1139       case OP_TYPEPOSUPTO:
   1140       ADD_ACTIVE(state_offset + 4, 0);
   1141       count = current_state->count;  /* Number already matched */
   1142       if (clen > 0)
   1143         {
   1144         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
   1145             (c < 256 &&
   1146               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
   1147               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
   1148           {
   1149           if (codevalue == OP_TYPEPOSUPTO)
   1150             {
   1151             active_count--;           /* Remove non-match possibility */
   1152             next_active_state--;
   1153             }
   1154           if (++count >= GET2(code, 1))
   1155             { ADD_NEW(state_offset + 4, 0); }
   1156           else
   1157             { ADD_NEW(state_offset, count); }
   1158           }
   1159         }
   1160       break;
   1161 
   1162 /* ========================================================================== */
   1163       /* These are virtual opcodes that are used when something like
   1164       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
   1165       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
   1166       code above fast for the other cases. The argument is in the d variable. */
   1167 
   1168 #ifdef SUPPORT_UCP
   1169       case OP_PROP_EXTRA + OP_TYPEPLUS:
   1170       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
   1171       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
   1172       count = current_state->count;           /* Already matched */
   1173       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
   1174       if (clen > 0)
   1175         {
   1176         BOOL OK;
   1177         const ucd_record * prop = GET_UCD(c);
   1178         switch(code[2])
   1179           {
   1180           case PT_ANY:
   1181           OK = TRUE;
   1182           break;
   1183 
   1184           case PT_LAMP:
   1185           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
   1186             prop->chartype == ucp_Lt;
   1187           break;
   1188 
   1189           case PT_GC:
   1190           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
   1191           break;
   1192 
   1193           case PT_PC:
   1194           OK = prop->chartype == code[3];
   1195           break;
   1196 
   1197           case PT_SC:
   1198           OK = prop->script == code[3];
   1199           break;
   1200 
   1201           /* These are specials for combination cases. */
   1202 
   1203           case PT_ALNUM:
   1204           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1205                _pcre_ucp_gentype[prop->chartype] == ucp_N;
   1206           break;
   1207 
   1208           case PT_SPACE:    /* Perl space */
   1209           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1210                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
   1211           break;
   1212 
   1213           case PT_PXSPACE:  /* POSIX space */
   1214           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1215                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
   1216                c == CHAR_FF || c == CHAR_CR;
   1217           break;
   1218 
   1219           case PT_WORD:
   1220           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1221                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
   1222                c == CHAR_UNDERSCORE;
   1223           break;
   1224 
   1225           /* Should never occur, but keep compilers from grumbling. */
   1226 
   1227           default:
   1228           OK = codevalue != OP_PROP;
   1229           break;
   1230           }
   1231 
   1232         if (OK == (d == OP_PROP))
   1233           {
   1234           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
   1235             {
   1236             active_count--;           /* Remove non-match possibility */
   1237             next_active_state--;
   1238             }
   1239           count++;
   1240           ADD_NEW(state_offset, count);
   1241           }
   1242         }
   1243       break;
   1244 
   1245       /*-----------------------------------------------------------------*/
   1246       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
   1247       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
   1248       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
   1249       count = current_state->count;  /* Already matched */
   1250       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
   1251       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
   1252         {
   1253         const uschar *nptr = ptr + clen;
   1254         int ncount = 0;
   1255         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
   1256           {
   1257           active_count--;           /* Remove non-match possibility */
   1258           next_active_state--;
   1259           }
   1260         while (nptr < end_subject)
   1261           {
   1262           int nd;
   1263           int ndlen = 1;
   1264           GETCHARLEN(nd, nptr, ndlen);
   1265           if (UCD_CATEGORY(nd) != ucp_M) break;
   1266           ncount++;
   1267           nptr += ndlen;
   1268           }
   1269         count++;
   1270         ADD_NEW_DATA(-state_offset, count, ncount);
   1271         }
   1272       break;
   1273 #endif
   1274 
   1275       /*-----------------------------------------------------------------*/
   1276       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
   1277       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
   1278       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
   1279       count = current_state->count;  /* Already matched */
   1280       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
   1281       if (clen > 0)
   1282         {
   1283         int ncount = 0;
   1284         switch (c)
   1285           {
   1286           case 0x000b:
   1287           case 0x000c:
   1288           case 0x0085:
   1289           case 0x2028:
   1290           case 0x2029:
   1291           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
   1292           goto ANYNL01;
   1293 
   1294           case 0x000d:
   1295           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
   1296           /* Fall through */
   1297 
   1298           ANYNL01:
   1299           case 0x000a:
   1300           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
   1301             {
   1302             active_count--;           /* Remove non-match possibility */
   1303             next_active_state--;
   1304             }
   1305           count++;
   1306           ADD_NEW_DATA(-state_offset, count, ncount);
   1307           break;
   1308 
   1309           default:
   1310           break;
   1311           }
   1312         }
   1313       break;
   1314 
   1315       /*-----------------------------------------------------------------*/
   1316       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
   1317       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
   1318       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
   1319       count = current_state->count;  /* Already matched */
   1320       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
   1321       if (clen > 0)
   1322         {
   1323         BOOL OK;
   1324         switch (c)
   1325           {
   1326           case 0x000a:
   1327           case 0x000b:
   1328           case 0x000c:
   1329           case 0x000d:
   1330           case 0x0085:
   1331           case 0x2028:
   1332           case 0x2029:
   1333           OK = TRUE;
   1334           break;
   1335 
   1336           default:
   1337           OK = FALSE;
   1338           break;
   1339           }
   1340 
   1341         if (OK == (d == OP_VSPACE))
   1342           {
   1343           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
   1344             {
   1345             active_count--;           /* Remove non-match possibility */
   1346             next_active_state--;
   1347             }
   1348           count++;
   1349           ADD_NEW_DATA(-state_offset, count, 0);
   1350           }
   1351         }
   1352       break;
   1353 
   1354       /*-----------------------------------------------------------------*/
   1355       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
   1356       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
   1357       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
   1358       count = current_state->count;  /* Already matched */
   1359       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
   1360       if (clen > 0)
   1361         {
   1362         BOOL OK;
   1363         switch (c)
   1364           {
   1365           case 0x09:      /* HT */
   1366           case 0x20:      /* SPACE */
   1367           case 0xa0:      /* NBSP */
   1368           case 0x1680:    /* OGHAM SPACE MARK */
   1369           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
   1370           case 0x2000:    /* EN QUAD */
   1371           case 0x2001:    /* EM QUAD */
   1372           case 0x2002:    /* EN SPACE */
   1373           case 0x2003:    /* EM SPACE */
   1374           case 0x2004:    /* THREE-PER-EM SPACE */
   1375           case 0x2005:    /* FOUR-PER-EM SPACE */
   1376           case 0x2006:    /* SIX-PER-EM SPACE */
   1377           case 0x2007:    /* FIGURE SPACE */
   1378           case 0x2008:    /* PUNCTUATION SPACE */
   1379           case 0x2009:    /* THIN SPACE */
   1380           case 0x200A:    /* HAIR SPACE */
   1381           case 0x202f:    /* NARROW NO-BREAK SPACE */
   1382           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
   1383           case 0x3000:    /* IDEOGRAPHIC SPACE */
   1384           OK = TRUE;
   1385           break;
   1386 
   1387           default:
   1388           OK = FALSE;
   1389           break;
   1390           }
   1391 
   1392         if (OK == (d == OP_HSPACE))
   1393           {
   1394           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
   1395             {
   1396             active_count--;           /* Remove non-match possibility */
   1397             next_active_state--;
   1398             }
   1399           count++;
   1400           ADD_NEW_DATA(-state_offset, count, 0);
   1401           }
   1402         }
   1403       break;
   1404 
   1405       /*-----------------------------------------------------------------*/
   1406 #ifdef SUPPORT_UCP
   1407       case OP_PROP_EXTRA + OP_TYPEQUERY:
   1408       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
   1409       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
   1410       count = 4;
   1411       goto QS1;
   1412 
   1413       case OP_PROP_EXTRA + OP_TYPESTAR:
   1414       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
   1415       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
   1416       count = 0;
   1417 
   1418       QS1:
   1419 
   1420       ADD_ACTIVE(state_offset + 4, 0);
   1421       if (clen > 0)
   1422         {
   1423         BOOL OK;
   1424         const ucd_record * prop = GET_UCD(c);
   1425         switch(code[2])
   1426           {
   1427           case PT_ANY:
   1428           OK = TRUE;
   1429           break;
   1430 
   1431           case PT_LAMP:
   1432           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
   1433             prop->chartype == ucp_Lt;
   1434           break;
   1435 
   1436           case PT_GC:
   1437           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
   1438           break;
   1439 
   1440           case PT_PC:
   1441           OK = prop->chartype == code[3];
   1442           break;
   1443 
   1444           case PT_SC:
   1445           OK = prop->script == code[3];
   1446           break;
   1447 
   1448           /* These are specials for combination cases. */
   1449 
   1450           case PT_ALNUM:
   1451           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1452                _pcre_ucp_gentype[prop->chartype] == ucp_N;
   1453           break;
   1454 
   1455           case PT_SPACE:    /* Perl space */
   1456           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1457                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
   1458           break;
   1459 
   1460           case PT_PXSPACE:  /* POSIX space */
   1461           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1462                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
   1463                c == CHAR_FF || c == CHAR_CR;
   1464           break;
   1465 
   1466           case PT_WORD:
   1467           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1468                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
   1469                c == CHAR_UNDERSCORE;
   1470           break;
   1471 
   1472           /* Should never occur, but keep compilers from grumbling. */
   1473 
   1474           default:
   1475           OK = codevalue != OP_PROP;
   1476           break;
   1477           }
   1478 
   1479         if (OK == (d == OP_PROP))
   1480           {
   1481           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
   1482               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
   1483             {
   1484             active_count--;           /* Remove non-match possibility */
   1485             next_active_state--;
   1486             }
   1487           ADD_NEW(state_offset + count, 0);
   1488           }
   1489         }
   1490       break;
   1491 
   1492       /*-----------------------------------------------------------------*/
   1493       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
   1494       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
   1495       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
   1496       count = 2;
   1497       goto QS2;
   1498 
   1499       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
   1500       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
   1501       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
   1502       count = 0;
   1503 
   1504       QS2:
   1505 
   1506       ADD_ACTIVE(state_offset + 2, 0);
   1507       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
   1508         {
   1509         const uschar *nptr = ptr + clen;
   1510         int ncount = 0;
   1511         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
   1512             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
   1513           {
   1514           active_count--;           /* Remove non-match possibility */
   1515           next_active_state--;
   1516           }
   1517         while (nptr < end_subject)
   1518           {
   1519           int nd;
   1520           int ndlen = 1;
   1521           GETCHARLEN(nd, nptr, ndlen);
   1522           if (UCD_CATEGORY(nd) != ucp_M) break;
   1523           ncount++;
   1524           nptr += ndlen;
   1525           }
   1526         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
   1527         }
   1528       break;
   1529 #endif
   1530 
   1531       /*-----------------------------------------------------------------*/
   1532       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
   1533       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
   1534       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
   1535       count = 2;
   1536       goto QS3;
   1537 
   1538       case OP_ANYNL_EXTRA + OP_TYPESTAR:
   1539       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
   1540       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
   1541       count = 0;
   1542 
   1543       QS3:
   1544       ADD_ACTIVE(state_offset + 2, 0);
   1545       if (clen > 0)
   1546         {
   1547         int ncount = 0;
   1548         switch (c)
   1549           {
   1550           case 0x000b:
   1551           case 0x000c:
   1552           case 0x0085:
   1553           case 0x2028:
   1554           case 0x2029:
   1555           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
   1556           goto ANYNL02;
   1557 
   1558           case 0x000d:
   1559           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
   1560           /* Fall through */
   1561 
   1562           ANYNL02:
   1563           case 0x000a:
   1564           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
   1565               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
   1566             {
   1567             active_count--;           /* Remove non-match possibility */
   1568             next_active_state--;
   1569             }
   1570           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
   1571           break;
   1572 
   1573           default:
   1574           break;
   1575           }
   1576         }
   1577       break;
   1578 
   1579       /*-----------------------------------------------------------------*/
   1580       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
   1581       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
   1582       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
   1583       count = 2;
   1584       goto QS4;
   1585 
   1586       case OP_VSPACE_EXTRA + OP_TYPESTAR:
   1587       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
   1588       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
   1589       count = 0;
   1590 
   1591       QS4:
   1592       ADD_ACTIVE(state_offset + 2, 0);
   1593       if (clen > 0)
   1594         {
   1595         BOOL OK;
   1596         switch (c)
   1597           {
   1598           case 0x000a:
   1599           case 0x000b:
   1600           case 0x000c:
   1601           case 0x000d:
   1602           case 0x0085:
   1603           case 0x2028:
   1604           case 0x2029:
   1605           OK = TRUE;
   1606           break;
   1607 
   1608           default:
   1609           OK = FALSE;
   1610           break;
   1611           }
   1612         if (OK == (d == OP_VSPACE))
   1613           {
   1614           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
   1615               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
   1616             {
   1617             active_count--;           /* Remove non-match possibility */
   1618             next_active_state--;
   1619             }
   1620           ADD_NEW_DATA(-(state_offset + count), 0, 0);
   1621           }
   1622         }
   1623       break;
   1624 
   1625       /*-----------------------------------------------------------------*/
   1626       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
   1627       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
   1628       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
   1629       count = 2;
   1630       goto QS5;
   1631 
   1632       case OP_HSPACE_EXTRA + OP_TYPESTAR:
   1633       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
   1634       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
   1635       count = 0;
   1636 
   1637       QS5:
   1638       ADD_ACTIVE(state_offset + 2, 0);
   1639       if (clen > 0)
   1640         {
   1641         BOOL OK;
   1642         switch (c)
   1643           {
   1644           case 0x09:      /* HT */
   1645           case 0x20:      /* SPACE */
   1646           case 0xa0:      /* NBSP */
   1647           case 0x1680:    /* OGHAM SPACE MARK */
   1648           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
   1649           case 0x2000:    /* EN QUAD */
   1650           case 0x2001:    /* EM QUAD */
   1651           case 0x2002:    /* EN SPACE */
   1652           case 0x2003:    /* EM SPACE */
   1653           case 0x2004:    /* THREE-PER-EM SPACE */
   1654           case 0x2005:    /* FOUR-PER-EM SPACE */
   1655           case 0x2006:    /* SIX-PER-EM SPACE */
   1656           case 0x2007:    /* FIGURE SPACE */
   1657           case 0x2008:    /* PUNCTUATION SPACE */
   1658           case 0x2009:    /* THIN SPACE */
   1659           case 0x200A:    /* HAIR SPACE */
   1660           case 0x202f:    /* NARROW NO-BREAK SPACE */
   1661           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
   1662           case 0x3000:    /* IDEOGRAPHIC SPACE */
   1663           OK = TRUE;
   1664           break;
   1665 
   1666           default:
   1667           OK = FALSE;
   1668           break;
   1669           }
   1670 
   1671         if (OK == (d == OP_HSPACE))
   1672           {
   1673           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
   1674               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
   1675             {
   1676             active_count--;           /* Remove non-match possibility */
   1677             next_active_state--;
   1678             }
   1679           ADD_NEW_DATA(-(state_offset + count), 0, 0);
   1680           }
   1681         }
   1682       break;
   1683 
   1684       /*-----------------------------------------------------------------*/
   1685 #ifdef SUPPORT_UCP
   1686       case OP_PROP_EXTRA + OP_TYPEEXACT:
   1687       case OP_PROP_EXTRA + OP_TYPEUPTO:
   1688       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
   1689       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
   1690       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
   1691         { ADD_ACTIVE(state_offset + 6, 0); }
   1692       count = current_state->count;  /* Number already matched */
   1693       if (clen > 0)
   1694         {
   1695         BOOL OK;
   1696         const ucd_record * prop = GET_UCD(c);
   1697         switch(code[4])
   1698           {
   1699           case PT_ANY:
   1700           OK = TRUE;
   1701           break;
   1702 
   1703           case PT_LAMP:
   1704           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
   1705             prop->chartype == ucp_Lt;
   1706           break;
   1707 
   1708           case PT_GC:
   1709           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
   1710           break;
   1711 
   1712           case PT_PC:
   1713           OK = prop->chartype == code[5];
   1714           break;
   1715 
   1716           case PT_SC:
   1717           OK = prop->script == code[5];
   1718           break;
   1719 
   1720           /* These are specials for combination cases. */
   1721 
   1722           case PT_ALNUM:
   1723           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1724                _pcre_ucp_gentype[prop->chartype] == ucp_N;
   1725           break;
   1726 
   1727           case PT_SPACE:    /* Perl space */
   1728           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1729                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
   1730           break;
   1731 
   1732           case PT_PXSPACE:  /* POSIX space */
   1733           OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
   1734                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
   1735                c == CHAR_FF || c == CHAR_CR;
   1736           break;
   1737 
   1738           case PT_WORD:
   1739           OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
   1740                _pcre_ucp_gentype[prop->chartype] == ucp_N ||
   1741                c == CHAR_UNDERSCORE;
   1742           break;
   1743 
   1744           /* Should never occur, but keep compilers from grumbling. */
   1745 
   1746           default:
   1747           OK = codevalue != OP_PROP;
   1748           break;
   1749           }
   1750 
   1751         if (OK == (d == OP_PROP))
   1752           {
   1753           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
   1754             {
   1755             active_count--;           /* Remove non-match possibility */
   1756             next_active_state--;
   1757             }
   1758           if (++count >= GET2(code, 1))
   1759             { ADD_NEW(state_offset + 6, 0); }
   1760           else
   1761             { ADD_NEW(state_offset, count); }
   1762           }
   1763         }
   1764       break;
   1765 
   1766       /*-----------------------------------------------------------------*/
   1767       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
   1768       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
   1769       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
   1770       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
   1771       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
   1772         { ADD_ACTIVE(state_offset + 4, 0); }
   1773       count = current_state->count;  /* Number already matched */
   1774       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
   1775         {
   1776         const uschar *nptr = ptr + clen;
   1777         int ncount = 0;
   1778         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
   1779           {
   1780           active_count--;           /* Remove non-match possibility */
   1781           next_active_state--;
   1782           }
   1783         while (nptr < end_subject)
   1784           {
   1785           int nd;
   1786           int ndlen = 1;
   1787           GETCHARLEN(nd, nptr, ndlen);
   1788           if (UCD_CATEGORY(nd) != ucp_M) break;
   1789           ncount++;
   1790           nptr += ndlen;
   1791           }
   1792         if (++count >= GET2(code, 1))
   1793           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
   1794         else
   1795           { ADD_NEW_DATA(-state_offset, count, ncount); }
   1796         }
   1797       break;
   1798 #endif
   1799 
   1800       /*-----------------------------------------------------------------*/
   1801       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
   1802       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
   1803       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
   1804       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
   1805       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
   1806         { ADD_ACTIVE(state_offset + 4, 0); }
   1807       count = current_state->count;  /* Number already matched */
   1808       if (clen > 0)
   1809         {
   1810         int ncount = 0;
   1811         switch (c)
   1812           {
   1813           case 0x000b:
   1814           case 0x000c:
   1815           case 0x0085:
   1816           case 0x2028:
   1817           case 0x2029:
   1818           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
   1819           goto ANYNL03;
   1820 
   1821           case 0x000d:
   1822           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
   1823           /* Fall through */
   1824 
   1825           ANYNL03:
   1826           case 0x000a:
   1827           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
   1828             {
   1829             active_count--;           /* Remove non-match possibility */
   1830             next_active_state--;
   1831             }
   1832           if (++count >= GET2(code, 1))
   1833             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
   1834           else
   1835             { ADD_NEW_DATA(-state_offset, count, ncount); }
   1836           break;
   1837 
   1838           default:
   1839           break;
   1840           }
   1841         }
   1842       break;
   1843 
   1844       /*-----------------------------------------------------------------*/
   1845       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
   1846       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
   1847       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
   1848       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
   1849       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
   1850         { ADD_ACTIVE(state_offset + 4, 0); }
   1851       count = current_state->count;  /* Number already matched */
   1852       if (clen > 0)
   1853         {
   1854         BOOL OK;
   1855         switch (c)
   1856           {
   1857           case 0x000a:
   1858           case 0x000b:
   1859           case 0x000c:
   1860           case 0x000d:
   1861           case 0x0085:
   1862           case 0x2028:
   1863           case 0x2029:
   1864           OK = TRUE;
   1865           break;
   1866 
   1867           default:
   1868           OK = FALSE;
   1869           }
   1870 
   1871         if (OK == (d == OP_VSPACE))
   1872           {
   1873           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
   1874             {
   1875             active_count--;           /* Remove non-match possibility */
   1876             next_active_state--;
   1877             }
   1878           if (++count >= GET2(code, 1))
   1879             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
   1880           else
   1881             { ADD_NEW_DATA(-state_offset, count, 0); }
   1882           }
   1883         }
   1884       break;
   1885 
   1886       /*-----------------------------------------------------------------*/
   1887       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
   1888       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
   1889       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
   1890       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
   1891       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
   1892         { ADD_ACTIVE(state_offset + 4, 0); }
   1893       count = current_state->count;  /* Number already matched */
   1894       if (clen > 0)
   1895         {
   1896         BOOL OK;
   1897         switch (c)
   1898           {
   1899           case 0x09:      /* HT */
   1900           case 0x20:      /* SPACE */
   1901           case 0xa0:      /* NBSP */
   1902           case 0x1680:    /* OGHAM SPACE MARK */
   1903           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
   1904           case 0x2000:    /* EN QUAD */
   1905           case 0x2001:    /* EM QUAD */
   1906           case 0x2002:    /* EN SPACE */
   1907           case 0x2003:    /* EM SPACE */
   1908           case 0x2004:    /* THREE-PER-EM SPACE */
   1909           case 0x2005:    /* FOUR-PER-EM SPACE */
   1910           case 0x2006:    /* SIX-PER-EM SPACE */
   1911           case 0x2007:    /* FIGURE SPACE */
   1912           case 0x2008:    /* PUNCTUATION SPACE */
   1913           case 0x2009:    /* THIN SPACE */
   1914           case 0x200A:    /* HAIR SPACE */
   1915           case 0x202f:    /* NARROW NO-BREAK SPACE */
   1916           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
   1917           case 0x3000:    /* IDEOGRAPHIC SPACE */
   1918           OK = TRUE;
   1919           break;
   1920 
   1921           default:
   1922           OK = FALSE;
   1923           break;
   1924           }
   1925 
   1926         if (OK == (d == OP_HSPACE))
   1927           {
   1928           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
   1929             {
   1930             active_count--;           /* Remove non-match possibility */
   1931             next_active_state--;
   1932             }
   1933           if (++count >= GET2(code, 1))
   1934             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
   1935           else
   1936             { ADD_NEW_DATA(-state_offset, count, 0); }
   1937           }
   1938         }
   1939       break;
   1940 
   1941 /* ========================================================================== */
   1942       /* These opcodes are followed by a character that is usually compared
   1943       to the current subject character; it is loaded into d. We still get
   1944       here even if there is no subject character, because in some cases zero
   1945       repetitions are permitted. */
   1946 
   1947       /*-----------------------------------------------------------------*/
   1948       case OP_CHAR:
   1949       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
   1950       break;
   1951 
   1952       /*-----------------------------------------------------------------*/
   1953       case OP_CHARNC:
   1954       if (clen == 0) break;
   1955 
   1956 #ifdef SUPPORT_UTF8
   1957       if (utf8)
   1958         {
   1959         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
   1960           {
   1961           unsigned int othercase;
   1962           if (c < 128) othercase = fcc[c]; else
   1963 
   1964           /* If we have Unicode property support, we can use it to test the
   1965           other case of the character. */
   1966 
   1967 #ifdef SUPPORT_UCP
   1968           othercase = UCD_OTHERCASE(c);
   1969 #else
   1970           othercase = NOTACHAR;
   1971 #endif
   1972 
   1973           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
   1974           }
   1975         }
   1976       else
   1977 #endif  /* SUPPORT_UTF8 */
   1978 
   1979       /* Non-UTF-8 mode */
   1980         {
   1981         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
   1982         }
   1983       break;
   1984 
   1985 
   1986 #ifdef SUPPORT_UCP
   1987       /*-----------------------------------------------------------------*/
   1988       /* This is a tricky one because it can match more than one character.
   1989       Find out how many characters to skip, and then set up a negative state
   1990       to wait for them to pass before continuing. */
   1991 
   1992       case OP_EXTUNI:
   1993       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
   1994         {
   1995         const uschar *nptr = ptr + clen;
   1996         int ncount = 0;
   1997         while (nptr < end_subject)
   1998           {
   1999           int nclen = 1;
   2000           GETCHARLEN(c, nptr, nclen);
   2001           if (UCD_CATEGORY(c) != ucp_M) break;
   2002           ncount++;
   2003           nptr += nclen;
   2004           }
   2005         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
   2006         }
   2007       break;
   2008 #endif
   2009 
   2010       /*-----------------------------------------------------------------*/
   2011       /* This is tricky, like EXTUNI, because it too can match more than one
   2012       character (when CR is followed by LF). In this case, set up a negative
   2013       state to wait for one character to pass before continuing. */
   2014 
   2015       case OP_ANYNL:
   2016       if (clen > 0) switch(c)
   2017         {
   2018         case 0x000b:
   2019         case 0x000c:
   2020         case 0x0085:
   2021         case 0x2028:
   2022         case 0x2029:
   2023         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
   2024 
   2025         case 0x000a:
   2026         ADD_NEW(state_offset + 1, 0);
   2027         break;
   2028 
   2029         case 0x000d:
   2030         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
   2031           {
   2032           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
   2033           }
   2034         else
   2035           {
   2036           ADD_NEW(state_offset + 1, 0);
   2037           }
   2038         break;
   2039         }
   2040       break;
   2041 
   2042       /*-----------------------------------------------------------------*/
   2043       case OP_NOT_VSPACE:
   2044       if (clen > 0) switch(c)
   2045         {
   2046         case 0x000a:
   2047         case 0x000b:
   2048         case 0x000c:
   2049         case 0x000d:
   2050         case 0x0085:
   2051         case 0x2028:
   2052         case 0x2029:
   2053         break;
   2054 
   2055         default:
   2056         ADD_NEW(state_offset + 1, 0);
   2057         break;
   2058         }
   2059       break;
   2060 
   2061       /*-----------------------------------------------------------------*/
   2062       case OP_VSPACE:
   2063       if (clen > 0) switch(c)
   2064         {
   2065         case 0x000a:
   2066         case 0x000b:
   2067         case 0x000c:
   2068         case 0x000d:
   2069         case 0x0085:
   2070         case 0x2028:
   2071         case 0x2029:
   2072         ADD_NEW(state_offset + 1, 0);
   2073         break;
   2074 
   2075         default: break;
   2076         }
   2077       break;
   2078 
   2079       /*-----------------------------------------------------------------*/
   2080       case OP_NOT_HSPACE:
   2081       if (clen > 0) switch(c)
   2082         {
   2083         case 0x09:      /* HT */
   2084         case 0x20:      /* SPACE */
   2085         case 0xa0:      /* NBSP */
   2086         case 0x1680:    /* OGHAM SPACE MARK */
   2087         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
   2088         case 0x2000:    /* EN QUAD */
   2089         case 0x2001:    /* EM QUAD */
   2090         case 0x2002:    /* EN SPACE */
   2091         case 0x2003:    /* EM SPACE */
   2092         case 0x2004:    /* THREE-PER-EM SPACE */
   2093         case 0x2005:    /* FOUR-PER-EM SPACE */
   2094         case 0x2006:    /* SIX-PER-EM SPACE */
   2095         case 0x2007:    /* FIGURE SPACE */
   2096         case 0x2008:    /* PUNCTUATION SPACE */
   2097         case 0x2009:    /* THIN SPACE */
   2098         case 0x200A:    /* HAIR SPACE */
   2099         case 0x202f:    /* NARROW NO-BREAK SPACE */
   2100         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
   2101         case 0x3000:    /* IDEOGRAPHIC SPACE */
   2102         break;
   2103 
   2104         default:
   2105         ADD_NEW(state_offset + 1, 0);
   2106         break;
   2107         }
   2108       break;
   2109 
   2110       /*-----------------------------------------------------------------*/
   2111       case OP_HSPACE:
   2112       if (clen > 0) switch(c)
   2113         {
   2114         case 0x09:      /* HT */
   2115         case 0x20:      /* SPACE */
   2116         case 0xa0:      /* NBSP */
   2117         case 0x1680:    /* OGHAM SPACE MARK */
   2118         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
   2119         case 0x2000:    /* EN QUAD */
   2120         case 0x2001:    /* EM QUAD */
   2121         case 0x2002:    /* EN SPACE */
   2122         case 0x2003:    /* EM SPACE */
   2123         case 0x2004:    /* THREE-PER-EM SPACE */
   2124         case 0x2005:    /* FOUR-PER-EM SPACE */
   2125         case 0x2006:    /* SIX-PER-EM SPACE */
   2126         case 0x2007:    /* FIGURE SPACE */
   2127         case 0x2008:    /* PUNCTUATION SPACE */
   2128         case 0x2009:    /* THIN SPACE */
   2129         case 0x200A:    /* HAIR SPACE */
   2130         case 0x202f:    /* NARROW NO-BREAK SPACE */
   2131         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
   2132         case 0x3000:    /* IDEOGRAPHIC SPACE */
   2133         ADD_NEW(state_offset + 1, 0);
   2134         break;
   2135         }
   2136       break;
   2137 
   2138       /*-----------------------------------------------------------------*/
   2139       /* Match a negated single character. This is only used for one-byte
   2140       characters, that is, we know that d < 256. The character we are
   2141       checking (c) can be multibyte. */
   2142 
   2143       case OP_NOT:
   2144       if (clen > 0)
   2145         {
   2146         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
   2147         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
   2148         }
   2149       break;
   2150 
   2151       /*-----------------------------------------------------------------*/
   2152       case OP_PLUS:
   2153       case OP_MINPLUS:
   2154       case OP_POSPLUS:
   2155       case OP_NOTPLUS:
   2156       case OP_NOTMINPLUS:
   2157       case OP_NOTPOSPLUS:
   2158       count = current_state->count;  /* Already matched */
   2159       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
   2160       if (clen > 0)
   2161         {
   2162         unsigned int otherd = NOTACHAR;
   2163         if ((ims & PCRE_CASELESS) != 0)
   2164           {
   2165 #ifdef SUPPORT_UTF8
   2166           if (utf8 && d >= 128)
   2167             {
   2168 #ifdef SUPPORT_UCP
   2169             otherd = UCD_OTHERCASE(d);
   2170 #endif  /* SUPPORT_UCP */
   2171             }
   2172           else
   2173 #endif  /* SUPPORT_UTF8 */
   2174           otherd = fcc[d];
   2175           }
   2176         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
   2177           {
   2178           if (count > 0 &&
   2179               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
   2180             {
   2181             active_count--;             /* Remove non-match possibility */
   2182             next_active_state--;
   2183             }
   2184           count++;
   2185           ADD_NEW(state_offset, count);
   2186           }
   2187         }
   2188       break;
   2189 
   2190       /*-----------------------------------------------------------------*/
   2191       case OP_QUERY:
   2192       case OP_MINQUERY:
   2193       case OP_POSQUERY:
   2194       case OP_NOTQUERY:
   2195       case OP_NOTMINQUERY:
   2196       case OP_NOTPOSQUERY:
   2197       ADD_ACTIVE(state_offset + dlen + 1, 0);
   2198       if (clen > 0)
   2199         {
   2200         unsigned int otherd = NOTACHAR;
   2201         if ((ims & PCRE_CASELESS) != 0)
   2202           {
   2203 #ifdef SUPPORT_UTF8
   2204           if (utf8 && d >= 128)
   2205             {
   2206 #ifdef SUPPORT_UCP
   2207             otherd = UCD_OTHERCASE(d);
   2208 #endif  /* SUPPORT_UCP */
   2209             }
   2210           else
   2211 #endif  /* SUPPORT_UTF8 */
   2212           otherd = fcc[d];
   2213           }
   2214         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
   2215           {
   2216           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
   2217             {
   2218             active_count--;            /* Remove non-match possibility */
   2219             next_active_state--;
   2220             }
   2221           ADD_NEW(state_offset + dlen + 1, 0);
   2222           }
   2223         }
   2224       break;
   2225 
   2226       /*-----------------------------------------------------------------*/
   2227       case OP_STAR:
   2228       case OP_MINSTAR:
   2229       case OP_POSSTAR:
   2230       case OP_NOTSTAR:
   2231       case OP_NOTMINSTAR:
   2232       case OP_NOTPOSSTAR:
   2233       ADD_ACTIVE(state_offset + dlen + 1, 0);
   2234       if (clen > 0)
   2235         {
   2236         unsigned int otherd = NOTACHAR;
   2237         if ((ims & PCRE_CASELESS) != 0)
   2238           {
   2239 #ifdef SUPPORT_UTF8
   2240           if (utf8 && d >= 128)
   2241             {
   2242 #ifdef SUPPORT_UCP
   2243             otherd = UCD_OTHERCASE(d);
   2244 #endif  /* SUPPORT_UCP */
   2245             }
   2246           else
   2247 #endif  /* SUPPORT_UTF8 */
   2248           otherd = fcc[d];
   2249           }
   2250         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
   2251           {
   2252           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
   2253             {
   2254             active_count--;            /* Remove non-match possibility */
   2255             next_active_state--;
   2256             }
   2257           ADD_NEW(state_offset, 0);
   2258           }
   2259         }
   2260       break;
   2261 
   2262       /*-----------------------------------------------------------------*/
   2263       case OP_EXACT:
   2264       case OP_NOTEXACT:
   2265       count = current_state->count;  /* Number already matched */
   2266       if (clen > 0)
   2267         {
   2268         unsigned int otherd = NOTACHAR;
   2269         if ((ims & PCRE_CASELESS) != 0)
   2270           {
   2271 #ifdef SUPPORT_UTF8
   2272           if (utf8 && d >= 128)
   2273             {
   2274 #ifdef SUPPORT_UCP
   2275             otherd = UCD_OTHERCASE(d);
   2276 #endif  /* SUPPORT_UCP */
   2277             }
   2278           else
   2279 #endif  /* SUPPORT_UTF8 */
   2280           otherd = fcc[d];
   2281           }
   2282         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
   2283           {
   2284           if (++count >= GET2(code, 1))
   2285             { ADD_NEW(state_offset + dlen + 3, 0); }
   2286           else
   2287             { ADD_NEW(state_offset, count); }
   2288           }
   2289         }
   2290       break;
   2291 
   2292       /*-----------------------------------------------------------------*/
   2293       case OP_UPTO:
   2294       case OP_MINUPTO:
   2295       case OP_POSUPTO:
   2296       case OP_NOTUPTO:
   2297       case OP_NOTMINUPTO:
   2298       case OP_NOTPOSUPTO:
   2299       ADD_ACTIVE(state_offset + dlen + 3, 0);
   2300       count = current_state->count;  /* Number already matched */
   2301       if (clen > 0)
   2302         {
   2303         unsigned int otherd = NOTACHAR;
   2304         if ((ims & PCRE_CASELESS) != 0)
   2305           {
   2306 #ifdef SUPPORT_UTF8
   2307           if (utf8 && d >= 128)
   2308             {
   2309 #ifdef SUPPORT_UCP
   2310             otherd = UCD_OTHERCASE(d);
   2311 #endif  /* SUPPORT_UCP */
   2312             }
   2313           else
   2314 #endif  /* SUPPORT_UTF8 */
   2315           otherd = fcc[d];
   2316           }
   2317         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
   2318           {
   2319           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
   2320             {
   2321             active_count--;             /* Remove non-match possibility */
   2322             next_active_state--;
   2323             }
   2324           if (++count >= GET2(code, 1))
   2325             { ADD_NEW(state_offset + dlen + 3, 0); }
   2326           else
   2327             { ADD_NEW(state_offset, count); }
   2328           }
   2329         }
   2330       break;
   2331 
   2332 
   2333 /* ========================================================================== */
   2334       /* These are the class-handling opcodes */
   2335 
   2336       case OP_CLASS:
   2337       case OP_NCLASS:
   2338       case OP_XCLASS:
   2339         {
   2340         BOOL isinclass = FALSE;
   2341         int next_state_offset;
   2342         const uschar *ecode;
   2343 
   2344         /* For a simple class, there is always just a 32-byte table, and we
   2345         can set isinclass from it. */
   2346 
   2347         if (codevalue != OP_XCLASS)
   2348           {
   2349           ecode = code + 33;
   2350           if (clen > 0)
   2351             {
   2352             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
   2353               ((code[1 + c/8] & (1 << (c&7))) != 0);
   2354             }
   2355           }
   2356 
   2357         /* An extended class may have a table or a list of single characters,
   2358         ranges, or both, and it may be positive or negative. There's a
   2359         function that sorts all this out. */
   2360 
   2361         else
   2362          {
   2363          ecode = code + GET(code, 1);
   2364          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
   2365          }
   2366 
   2367         /* At this point, isinclass is set for all kinds of class, and ecode
   2368         points to the byte after the end of the class. If there is a
   2369         quantifier, this is where it will be. */
   2370 
   2371         next_state_offset = (int)(ecode - start_code);
   2372 
   2373         switch (*ecode)
   2374           {
   2375           case OP_CRSTAR:
   2376           case OP_CRMINSTAR:
   2377           ADD_ACTIVE(next_state_offset + 1, 0);
   2378           if (isinclass) { ADD_NEW(state_offset, 0); }
   2379           break;
   2380 
   2381           case OP_CRPLUS:
   2382           case OP_CRMINPLUS:
   2383           count = current_state->count;  /* Already matched */
   2384           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
   2385           if (isinclass) { count++; ADD_NEW(state_offset, count); }
   2386           break;
   2387 
   2388           case OP_CRQUERY:
   2389           case OP_CRMINQUERY:
   2390           ADD_ACTIVE(next_state_offset + 1, 0);
   2391           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
   2392           break;
   2393 
   2394           case OP_CRRANGE:
   2395           case OP_CRMINRANGE:
   2396           count = current_state->count;  /* Already matched */
   2397           if (count >= GET2(ecode, 1))
   2398             { ADD_ACTIVE(next_state_offset + 5, 0); }
   2399           if (isinclass)
   2400             {
   2401             int max = GET2(ecode, 3);
   2402             if (++count >= max && max != 0)   /* Max 0 => no limit */
   2403               { ADD_NEW(next_state_offset + 5, 0); }
   2404             else
   2405               { ADD_NEW(state_offset, count); }
   2406             }
   2407           break;
   2408 
   2409           default:
   2410           if (isinclass) { ADD_NEW(next_state_offset, 0); }
   2411           break;
   2412           }
   2413         }
   2414       break;
   2415 
   2416 /* ========================================================================== */
   2417       /* These are the opcodes for fancy brackets of various kinds. We have
   2418       to use recursion in order to handle them. The "always failing" assertion
   2419       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
   2420       though the other "backtracking verbs" are not supported. */
   2421 
   2422       case OP_FAIL:
   2423       forced_fail++;    /* Count FAILs for multiple states */
   2424       break;
   2425 
   2426       case OP_ASSERT:
   2427       case OP_ASSERT_NOT:
   2428       case OP_ASSERTBACK:
   2429       case OP_ASSERTBACK_NOT:
   2430         {
   2431         int rc;
   2432         int local_offsets[2];
   2433         int local_workspace[1000];
   2434         const uschar *endasscode = code + GET(code, 1);
   2435 
   2436         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
   2437 
   2438         rc = internal_dfa_exec(
   2439           md,                                   /* static match data */
   2440           code,                                 /* this subexpression's code */
   2441           ptr,                                  /* where we currently are */
   2442           (int)(ptr - start_subject),           /* start offset */
   2443           local_offsets,                        /* offset vector */
   2444           sizeof(local_offsets)/sizeof(int),    /* size of same */
   2445           local_workspace,                      /* workspace vector */
   2446           sizeof(local_workspace)/sizeof(int),  /* size of same */
   2447           ims,                                  /* the current ims flags */
   2448           rlevel,                               /* function recursion level */
   2449           recursing);                           /* pass on regex recursion */
   2450 
   2451         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
   2452         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
   2453             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
   2454         }
   2455       break;
   2456 
   2457       /*-----------------------------------------------------------------*/
   2458       case OP_COND:
   2459       case OP_SCOND:
   2460         {
   2461         int local_offsets[1000];
   2462         int local_workspace[1000];
   2463         int codelink = GET(code, 1);
   2464         int condcode;
   2465 
   2466         /* Because of the way auto-callout works during compile, a callout item
   2467         is inserted between OP_COND and an assertion condition. This does not
   2468         happen for the other conditions. */
   2469 
   2470         if (code[LINK_SIZE+1] == OP_CALLOUT)
   2471           {
   2472           rrc = 0;
   2473           if (pcre_callout != NULL)
   2474             {
   2475             pcre_callout_block cb;
   2476             cb.version          = 1;   /* Version 1 of the callout block */
   2477             cb.callout_number   = code[LINK_SIZE+2];
   2478             cb.offset_vector    = offsets;
   2479             cb.subject          = (PCRE_SPTR)start_subject;
   2480             cb.subject_length   = (int)(end_subject - start_subject);
   2481             cb.start_match      = (int)(current_subject - start_subject);
   2482             cb.current_position = (int)(ptr - start_subject);
   2483             cb.pattern_position = GET(code, LINK_SIZE + 3);
   2484             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
   2485             cb.capture_top      = 1;
   2486             cb.capture_last     = -1;
   2487             cb.callout_data     = md->callout_data;
   2488             if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
   2489             }
   2490           if (rrc > 0) break;                      /* Fail this thread */
   2491           code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
   2492           }
   2493 
   2494         condcode = code[LINK_SIZE+1];
   2495 
   2496         /* Back reference conditions are not supported */
   2497 
   2498         if (condcode == OP_CREF || condcode == OP_NCREF)
   2499           return PCRE_ERROR_DFA_UCOND;
   2500 
   2501         /* The DEFINE condition is always false */
   2502 
   2503         if (condcode == OP_DEF)
   2504           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
   2505 
   2506         /* The only supported version of OP_RREF is for the value RREF_ANY,
   2507         which means "test if in any recursion". We can't test for specifically
   2508         recursed groups. */
   2509 
   2510         else if (condcode == OP_RREF || condcode == OP_NRREF)
   2511           {
   2512           int value = GET2(code, LINK_SIZE+2);
   2513           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
   2514           if (recursing > 0)
   2515             { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
   2516           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
   2517           }
   2518 
   2519         /* Otherwise, the condition is an assertion */
   2520 
   2521         else
   2522           {
   2523           int rc;
   2524           const uschar *asscode = code + LINK_SIZE + 1;
   2525           const uschar *endasscode = asscode + GET(asscode, 1);
   2526 
   2527           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
   2528 
   2529           rc = internal_dfa_exec(
   2530             md,                                   /* fixed match data */
   2531             asscode,                              /* this subexpression's code */
   2532             ptr,                                  /* where we currently are */
   2533             (int)(ptr - start_subject),           /* start offset */
   2534             local_offsets,                        /* offset vector */
   2535             sizeof(local_offsets)/sizeof(int),    /* size of same */
   2536             local_workspace,                      /* workspace vector */
   2537             sizeof(local_workspace)/sizeof(int),  /* size of same */
   2538             ims,                                  /* the current ims flags */
   2539             rlevel,                               /* function recursion level */
   2540             recursing);                           /* pass on regex recursion */
   2541 
   2542           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
   2543           if ((rc >= 0) ==
   2544                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
   2545             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
   2546           else
   2547             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
   2548           }
   2549         }
   2550       break;
   2551 
   2552       /*-----------------------------------------------------------------*/
   2553       case OP_RECURSE:
   2554         {
   2555         int local_offsets[1000];
   2556         int local_workspace[1000];
   2557         int rc;
   2558 
   2559         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
   2560           recursing + 1));
   2561 
   2562         rc = internal_dfa_exec(
   2563           md,                                   /* fixed match data */
   2564           start_code + GET(code, 1),            /* this subexpression's code */
   2565           ptr,                                  /* where we currently are */
   2566           (int)(ptr - start_subject),           /* start offset */
   2567           local_offsets,                        /* offset vector */
   2568           sizeof(local_offsets)/sizeof(int),    /* size of same */
   2569           local_workspace,                      /* workspace vector */
   2570           sizeof(local_workspace)/sizeof(int),  /* size of same */
   2571           ims,                                  /* the current ims flags */
   2572           rlevel,                               /* function recursion level */
   2573           recursing + 1);                       /* regex recurse level */
   2574 
   2575         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
   2576           recursing + 1, rc));
   2577 
   2578         /* Ran out of internal offsets */
   2579 
   2580         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
   2581 
   2582         /* For each successful matched substring, set up the next state with a
   2583         count of characters to skip before trying it. Note that the count is in
   2584         characters, not bytes. */
   2585 
   2586         if (rc > 0)
   2587           {
   2588           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
   2589             {
   2590             const uschar *p = start_subject + local_offsets[rc];
   2591             const uschar *pp = start_subject + local_offsets[rc+1];
   2592             int charcount = local_offsets[rc+1] - local_offsets[rc];
   2593             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
   2594             if (charcount > 0)
   2595               {
   2596               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
   2597               }
   2598             else
   2599               {
   2600               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
   2601               }
   2602             }
   2603           }
   2604         else if (rc != PCRE_ERROR_NOMATCH) return rc;
   2605         }
   2606       break;
   2607 
   2608       /*-----------------------------------------------------------------*/
   2609       case OP_ONCE:
   2610         {
   2611         int local_offsets[2];
   2612         int local_workspace[1000];
   2613 
   2614         int rc = internal_dfa_exec(
   2615           md,                                   /* fixed match data */
   2616           code,                                 /* this subexpression's code */
   2617           ptr,                                  /* where we currently are */
   2618           (int)(ptr - start_subject),           /* start offset */
   2619           local_offsets,                        /* offset vector */
   2620           sizeof(local_offsets)/sizeof(int),    /* size of same */
   2621           local_workspace,                      /* workspace vector */
   2622           sizeof(local_workspace)/sizeof(int),  /* size of same */
   2623           ims,                                  /* the current ims flags */
   2624           rlevel,                               /* function recursion level */
   2625           recursing);                           /* pass on regex recursion */
   2626 
   2627         if (rc >= 0)
   2628           {
   2629           const uschar *end_subpattern = code;
   2630           int charcount = local_offsets[1] - local_offsets[0];
   2631           int next_state_offset, repeat_state_offset;
   2632 
   2633           do { end_subpattern += GET(end_subpattern, 1); }
   2634             while (*end_subpattern == OP_ALT);
   2635           next_state_offset =
   2636             (int)(end_subpattern - start_code + LINK_SIZE + 1);
   2637 
   2638           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
   2639           arrange for the repeat state also to be added to the relevant list.
   2640           Calculate the offset, or set -1 for no repeat. */
   2641 
   2642           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
   2643                                  *end_subpattern == OP_KETRMIN)?
   2644             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
   2645 
   2646           /* If we have matched an empty string, add the next state at the
   2647           current character pointer. This is important so that the duplicate
   2648           checking kicks in, which is what breaks infinite loops that match an
   2649           empty string. */
   2650 
   2651           if (charcount == 0)
   2652             {
   2653             ADD_ACTIVE(next_state_offset, 0);
   2654             }
   2655 
   2656           /* Optimization: if there are no more active states, and there
   2657           are no new states yet set up, then skip over the subject string
   2658           right here, to save looping. Otherwise, set up the new state to swing
   2659           into action when the end of the substring is reached. */
   2660 
   2661           else if (i + 1 >= active_count && new_count == 0)
   2662             {
   2663             ptr += charcount;
   2664             clen = 0;
   2665             ADD_NEW(next_state_offset, 0);
   2666 
   2667             /* If we are adding a repeat state at the new character position,
   2668             we must fudge things so that it is the only current state.
   2669             Otherwise, it might be a duplicate of one we processed before, and
   2670             that would cause it to be skipped. */
   2671 
   2672             if (repeat_state_offset >= 0)
   2673               {
   2674               next_active_state = active_states;
   2675               active_count = 0;
   2676               i = -1;
   2677               ADD_ACTIVE(repeat_state_offset, 0);
   2678               }
   2679             }
   2680           else
   2681             {
   2682             const uschar *p = start_subject + local_offsets[0];
   2683             const uschar *pp = start_subject + local_offsets[1];
   2684             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
   2685             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
   2686             if (repeat_state_offset >= 0)
   2687               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
   2688             }
   2689 
   2690           }
   2691         else if (rc != PCRE_ERROR_NOMATCH) return rc;
   2692         }
   2693       break;
   2694 
   2695 
   2696 /* ========================================================================== */
   2697       /* Handle callouts */
   2698 
   2699       case OP_CALLOUT:
   2700       rrc = 0;
   2701       if (pcre_callout != NULL)
   2702         {
   2703         pcre_callout_block cb;
   2704         cb.version          = 1;   /* Version 1 of the callout block */
   2705         cb.callout_number   = code[1];
   2706         cb.offset_vector    = offsets;
   2707         cb.subject          = (PCRE_SPTR)start_subject;
   2708         cb.subject_length   = (int)(end_subject - start_subject);
   2709         cb.start_match      = (int)(current_subject - start_subject);
   2710         cb.current_position = (int)(ptr - start_subject);
   2711         cb.pattern_position = GET(code, 2);
   2712         cb.next_item_length = GET(code, 2 + LINK_SIZE);
   2713         cb.capture_top      = 1;
   2714         cb.capture_last     = -1;
   2715         cb.callout_data     = md->callout_data;
   2716         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
   2717         }
   2718       if (rrc == 0)
   2719         { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
   2720       break;
   2721 
   2722 
   2723 /* ========================================================================== */
   2724       default:        /* Unsupported opcode */
   2725       return PCRE_ERROR_DFA_UITEM;
   2726       }
   2727 
   2728     NEXT_ACTIVE_STATE: continue;
   2729 
   2730     }      /* End of loop scanning active states */
   2731 
   2732   /* We have finished the processing at the current subject character. If no
   2733   new states have been set for the next character, we have found all the
   2734   matches that we are going to find. If we are at the top level and partial
   2735   matching has been requested, check for appropriate conditions.
   2736 
   2737   The "forced_fail" variable counts the number of (*F) encountered for the
   2738   character. If it is equal to the original active_count (saved in
   2739   workspace[1]) it means that (*F) was found on every active state. In this
   2740   case we don't want to give a partial match.
   2741 
   2742   The "could_continue" variable is true if a state could have continued but
   2743   for the fact that the end of the subject was reached. */
   2744 
   2745   if (new_count <= 0)
   2746     {
   2747     if (rlevel == 1 &&                               /* Top level, and */
   2748         could_continue &&                            /* Some could go on */
   2749         forced_fail != workspace[1] &&               /* Not all forced fail & */
   2750         (                                            /* either... */
   2751         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
   2752         ||                                           /* or... */
   2753         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
   2754          match_count < 0)                            /* no matches */
   2755         ) &&                                         /* And... */
   2756         ptr >= end_subject &&                  /* Reached end of subject */
   2757         ptr > md->start_used_ptr)              /* Inspected non-empty string */
   2758       {
   2759       if (offsetcount >= 2)
   2760         {
   2761         offsets[0] = (int)(md->start_used_ptr - start_subject);
   2762         offsets[1] = (int)(end_subject - start_subject);
   2763         }
   2764       match_count = PCRE_ERROR_PARTIAL;
   2765       }
   2766 
   2767     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
   2768       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
   2769       rlevel*2-2, SP));
   2770     break;        /* In effect, "return", but see the comment below */
   2771     }
   2772 
   2773   /* One or more states are active for the next character. */
   2774 
   2775   ptr += clen;    /* Advance to next subject character */
   2776   }               /* Loop to move along the subject string */
   2777 
   2778 /* Control gets here from "break" a few lines above. We do it this way because
   2779 if we use "return" above, we have compiler trouble. Some compilers warn if
   2780 there's nothing here because they think the function doesn't return a value. On
   2781 the other hand, if we put a dummy statement here, some more clever compilers
   2782 complain that it can't be reached. Sigh. */
   2783 
   2784 return match_count;
   2785 }
   2786 
   2787 
   2788 
   2789 
   2790 /*************************************************
   2791 *    Execute a Regular Expression - DFA engine   *
   2792 *************************************************/
   2793 
   2794 /* This external function applies a compiled regular expression to a subject
   2795 string using a DFA engine. If the pattern is not anchored, the internal
   2796 matching function may be called repeatedly, retrying after a failed match.
   2797 
   2798 Arguments:
   2799   argument_re     points to the compiled expression
   2800   extra_data      points to extra data or is NULL
   2801   subject         points to the subject string
   2802   length          length of subject string (may contain binary zeros)
   2803   start_offset    where to start in the subject string
   2804   options         option bits
   2805   offsets         vector of match offsets
   2806   offsetcount     size of same
   2807   workspace       workspace vector
   2808   wscount         size of same
   2809 
   2810 Returns:          > 0 => number of match offset pairs placed in offsets
   2811                   = 0 => offsets overflowed; longest matches are present
   2812                    -1 => failed to match
   2813                  < -1 => some kind of unexpected problem
   2814 */
   2815 
   2816 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
   2817 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
   2818   const char *subject, int length, int start_offset, int options, int *offsets,
   2819   int offsetcount, int *workspace, int wscount)
   2820 {
   2821 real_pcre *re = (real_pcre *)argument_re;
   2822 dfa_match_data match_block;
   2823 dfa_match_data *md = &match_block;
   2824 BOOL utf8, anchored, startline, firstline;
   2825 const uschar *current_subject, *end_subject, *lcc;
   2826 
   2827 pcre_study_data internal_study;
   2828 const pcre_study_data *study = NULL;
   2829 real_pcre internal_re;
   2830 
   2831 const uschar *req_byte_ptr;
   2832 const uschar *start_bits = NULL;
   2833 BOOL first_byte_caseless = FALSE;
   2834 BOOL req_byte_caseless = FALSE;
   2835 int first_byte = -1;
   2836 int req_byte = -1;
   2837 int req_byte2 = -1;
   2838 int newline;
   2839 
   2840 /* Plausibility checks */
   2841 
   2842 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
   2843 if (re == NULL || subject == NULL || workspace == NULL ||
   2844    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
   2845 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
   2846 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
   2847 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
   2848 
   2849 /* We need to find the pointer to any study data before we test for byte
   2850 flipping, so we scan the extra_data block first. This may set two fields in the
   2851 match block, so we must initialize them beforehand. However, the other fields
   2852 in the match block must not be set until after the byte flipping. */
   2853 
   2854 md->tables = re->tables;
   2855 md->callout_data = NULL;
   2856 
   2857 if (extra_data != NULL)
   2858   {
   2859   unsigned int flags = extra_data->flags;
   2860   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
   2861     study = (const pcre_study_data *)extra_data->study_data;
   2862   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
   2863   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
   2864     return PCRE_ERROR_DFA_UMLIMIT;
   2865   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
   2866     md->callout_data = extra_data->callout_data;
   2867   if ((flags & PCRE_EXTRA_TABLES) != 0)
   2868     md->tables = extra_data->tables;
   2869   }
   2870 
   2871 /* Check that the first field in the block is the magic number. If it is not,
   2872 test for a regex that was compiled on a host of opposite endianness. If this is
   2873 the case, flipped values are put in internal_re and internal_study if there was
   2874 study data too. */
   2875 
   2876 if (re->magic_number != MAGIC_NUMBER)
   2877   {
   2878   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
   2879   if (re == NULL) return PCRE_ERROR_BADMAGIC;
   2880   if (study != NULL) study = &internal_study;
   2881   }
   2882 
   2883 /* Set some local values */
   2884 
   2885 current_subject = (const unsigned char *)subject + start_offset;
   2886 end_subject = (const unsigned char *)subject + length;
   2887 req_byte_ptr = current_subject - 1;
   2888 
   2889 #ifdef SUPPORT_UTF8
   2890 utf8 = (re->options & PCRE_UTF8) != 0;
   2891 #else
   2892 utf8 = FALSE;
   2893 #endif
   2894 
   2895 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
   2896   (re->options & PCRE_ANCHORED) != 0;
   2897 
   2898 /* The remaining fixed data for passing around. */
   2899 
   2900 md->start_code = (const uschar *)argument_re +
   2901     re->name_table_offset + re->name_count * re->name_entry_size;
   2902 md->start_subject = (const unsigned char *)subject;
   2903 md->end_subject = end_subject;
   2904 md->start_offset = start_offset;
   2905 md->moptions = options;
   2906 md->poptions = re->options;
   2907 
   2908 /* If the BSR option is not set at match time, copy what was set
   2909 at compile time. */
   2910 
   2911 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
   2912   {
   2913   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
   2914     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
   2915 #ifdef BSR_ANYCRLF
   2916   else md->moptions |= PCRE_BSR_ANYCRLF;
   2917 #endif
   2918   }
   2919 
   2920 /* Handle different types of newline. The three bits give eight cases. If
   2921 nothing is set at run time, whatever was used at compile time applies. */
   2922 
   2923 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
   2924          PCRE_NEWLINE_BITS)
   2925   {
   2926   case 0: newline = NEWLINE; break;   /* Compile-time default */
   2927   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
   2928   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
   2929   case PCRE_NEWLINE_CR+
   2930        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
   2931   case PCRE_NEWLINE_ANY: newline = -1; break;
   2932   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
   2933   default: return PCRE_ERROR_BADNEWLINE;
   2934   }
   2935 
   2936 if (newline == -2)
   2937   {
   2938   md->nltype = NLTYPE_ANYCRLF;
   2939   }
   2940 else if (newline < 0)
   2941   {
   2942   md->nltype = NLTYPE_ANY;
   2943   }
   2944 else
   2945   {
   2946   md->nltype = NLTYPE_FIXED;
   2947   if (newline > 255)
   2948     {
   2949     md->nllen = 2;
   2950     md->nl[0] = (newline >> 8) & 255;
   2951     md->nl[1] = newline & 255;
   2952     }
   2953   else
   2954     {
   2955     md->nllen = 1;
   2956     md->nl[0] = newline;
   2957     }
   2958   }
   2959 
   2960 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
   2961 back the character offset. */
   2962 
   2963 #ifdef SUPPORT_UTF8
   2964 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
   2965   {
   2966   int tb;
   2967   if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
   2968     return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
   2969       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
   2970   if (start_offset > 0 && start_offset < length)
   2971     {
   2972     tb = ((USPTR)subject)[start_offset] & 0xc0;
   2973     if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
   2974     }
   2975   }
   2976 #endif
   2977 
   2978 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
   2979 is a feature that makes it possible to save compiled regexes and re-use them
   2980 in other programs later. */
   2981 
   2982 if (md->tables == NULL) md->tables = _pcre_default_tables;
   2983 
   2984 /* The lower casing table and the "must be at the start of a line" flag are
   2985 used in a loop when finding where to start. */
   2986 
   2987 lcc = md->tables + lcc_offset;
   2988 startline = (re->flags & PCRE_STARTLINE) != 0;
   2989 firstline = (re->options & PCRE_FIRSTLINE) != 0;
   2990 
   2991 /* Set up the first character to match, if available. The first_byte value is
   2992 never set for an anchored regular expression, but the anchoring may be forced
   2993 at run time, so we have to test for anchoring. The first char may be unset for
   2994 an unanchored pattern, of course. If there's no first char and the pattern was
   2995 studied, there may be a bitmap of possible first characters. */
   2996 
   2997 if (!anchored)
   2998   {
   2999   if ((re->flags & PCRE_FIRSTSET) != 0)
   3000     {
   3001     first_byte = re->first_byte & 255;
   3002     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
   3003       first_byte = lcc[first_byte];
   3004     }
   3005   else
   3006     {
   3007     if (!startline && study != NULL &&
   3008          (study->flags & PCRE_STUDY_MAPPED) != 0)
   3009       start_bits = study->start_bits;
   3010     }
   3011   }
   3012 
   3013 /* For anchored or unanchored matches, there may be a "last known required
   3014 character" set. */
   3015 
   3016 if ((re->flags & PCRE_REQCHSET) != 0)
   3017   {
   3018   req_byte = re->req_byte & 255;
   3019   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
   3020   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
   3021   }
   3022 
   3023 /* Call the main matching function, looping for a non-anchored regex after a
   3024 failed match. If not restarting, perform certain optimizations at the start of
   3025 a match. */
   3026 
   3027 for (;;)
   3028   {
   3029   int rc;
   3030 
   3031   if ((options & PCRE_DFA_RESTART) == 0)
   3032     {
   3033     const uschar *save_end_subject = end_subject;
   3034 
   3035     /* If firstline is TRUE, the start of the match is constrained to the first
   3036     line of a multiline string. Implement this by temporarily adjusting
   3037     end_subject so that we stop scanning at a newline. If the match fails at
   3038     the newline, later code breaks this loop. */
   3039 
   3040     if (firstline)
   3041       {
   3042       USPTR t = current_subject;
   3043 #ifdef SUPPORT_UTF8
   3044       if (utf8)
   3045         {
   3046         while (t < md->end_subject && !IS_NEWLINE(t))
   3047           {
   3048           t++;
   3049           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
   3050           }
   3051         }
   3052       else
   3053 #endif
   3054       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
   3055       end_subject = t;
   3056       }
   3057 
   3058     /* There are some optimizations that avoid running the match if a known
   3059     starting point is not found. However, there is an option that disables
   3060     these, for testing and for ensuring that all callouts do actually occur.
   3061     The option can be set in the regex by (*NO_START_OPT) or passed in
   3062     match-time options. */
   3063 
   3064     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
   3065       {
   3066       /* Advance to a known first byte. */
   3067 
   3068       if (first_byte >= 0)
   3069         {
   3070         if (first_byte_caseless)
   3071           while (current_subject < end_subject &&
   3072                  lcc[*current_subject] != first_byte)
   3073             current_subject++;
   3074         else
   3075           while (current_subject < end_subject &&
   3076                  *current_subject != first_byte)
   3077             current_subject++;
   3078         }
   3079 
   3080       /* Or to just after a linebreak for a multiline match if possible */
   3081 
   3082       else if (startline)
   3083         {
   3084         if (current_subject > md->start_subject + start_offset)
   3085           {
   3086 #ifdef SUPPORT_UTF8
   3087           if (utf8)
   3088             {
   3089             while (current_subject < end_subject &&
   3090                    !WAS_NEWLINE(current_subject))
   3091               {
   3092               current_subject++;
   3093               while(current_subject < end_subject &&
   3094                     (*current_subject & 0xc0) == 0x80)
   3095                 current_subject++;
   3096               }
   3097             }
   3098           else
   3099 #endif
   3100           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
   3101             current_subject++;
   3102 
   3103           /* If we have just passed a CR and the newline option is ANY or
   3104           ANYCRLF, and we are now at a LF, advance the match position by one
   3105           more character. */
   3106 
   3107           if (current_subject[-1] == CHAR_CR &&
   3108                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
   3109                current_subject < end_subject &&
   3110                *current_subject == CHAR_NL)
   3111             current_subject++;
   3112           }
   3113         }
   3114 
   3115       /* Or to a non-unique first char after study */
   3116 
   3117       else if (start_bits != NULL)
   3118         {
   3119         while (current_subject < end_subject)
   3120           {
   3121           register unsigned int c = *current_subject;
   3122           if ((start_bits[c/8] & (1 << (c&7))) == 0)
   3123             {
   3124             current_subject++;
   3125 #ifdef SUPPORT_UTF8
   3126             if (utf8)
   3127               while(current_subject < end_subject &&
   3128                     (*current_subject & 0xc0) == 0x80) current_subject++;
   3129 #endif
   3130             }
   3131           else break;
   3132           }
   3133         }
   3134       }
   3135 
   3136     /* Restore fudged end_subject */
   3137 
   3138     end_subject = save_end_subject;
   3139 
   3140     /* The following two optimizations are disabled for partial matching or if
   3141     disabling is explicitly requested (and of course, by the test above, this
   3142     code is not obeyed when restarting after a partial match). */
   3143 
   3144     if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
   3145         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
   3146       {
   3147       /* If the pattern was studied, a minimum subject length may be set. This
   3148       is a lower bound; no actual string of that length may actually match the
   3149       pattern. Although the value is, strictly, in characters, we treat it as
   3150       bytes to avoid spending too much time in this optimization. */
   3151 
   3152       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
   3153           (pcre_uint32)(end_subject - current_subject) < study->minlength)
   3154         return PCRE_ERROR_NOMATCH;
   3155 
   3156       /* If req_byte is set, we know that that character must appear in the
   3157       subject for the match to succeed. If the first character is set, req_byte
   3158       must be later in the subject; otherwise the test starts at the match
   3159       point. This optimization can save a huge amount of work in patterns with
   3160       nested unlimited repeats that aren't going to match. Writing separate
   3161       code for cased/caseless versions makes it go faster, as does using an
   3162       autoincrement and backing off on a match.
   3163 
   3164       HOWEVER: when the subject string is very, very long, searching to its end
   3165       can take a long time, and give bad performance on quite ordinary
   3166       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
   3167       string... so we don't do this when the string is sufficiently long. */
   3168 
   3169       if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
   3170         {
   3171         register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
   3172 
   3173         /* We don't need to repeat the search if we haven't yet reached the
   3174         place we found it at last time. */
   3175 
   3176         if (p > req_byte_ptr)
   3177           {
   3178           if (req_byte_caseless)
   3179             {
   3180             while (p < end_subject)
   3181               {
   3182               register int pp = *p++;
   3183               if (pp == req_byte || pp == req_byte2) { p--; break; }
   3184               }
   3185             }
   3186           else
   3187             {
   3188             while (p < end_subject)
   3189               {
   3190               if (*p++ == req_byte) { p--; break; }
   3191               }
   3192             }
   3193 
   3194           /* If we can't find the required character, break the matching loop,
   3195           which will cause a return of PCRE_ERROR_NOMATCH. */
   3196 
   3197           if (p >= end_subject) break;
   3198 
   3199           /* If we have found the required character, save the point where we
   3200           found it, so that we don't search again next time round the loop if
   3201           the start hasn't passed this character yet. */
   3202 
   3203           req_byte_ptr = p;
   3204           }
   3205         }
   3206       }
   3207     }   /* End of optimizations that are done when not restarting */
   3208 
   3209   /* OK, now we can do the business */
   3210 
   3211   md->start_used_ptr = current_subject;
   3212 
   3213   rc = internal_dfa_exec(
   3214     md,                                /* fixed match data */
   3215     md->start_code,                    /* this subexpression's code */
   3216     current_subject,                   /* where we currently are */
   3217     start_offset,                      /* start offset in subject */
   3218     offsets,                           /* offset vector */
   3219     offsetcount,                       /* size of same */
   3220     workspace,                         /* workspace vector */
   3221     wscount,                           /* size of same */
   3222     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
   3223     0,                                 /* function recurse level */
   3224     0);                                /* regex recurse level */
   3225 
   3226   /* Anything other than "no match" means we are done, always; otherwise, carry
   3227   on only if not anchored. */
   3228 
   3229   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
   3230 
   3231   /* Advance to the next subject character unless we are at the end of a line
   3232   and firstline is set. */
   3233 
   3234   if (firstline && IS_NEWLINE(current_subject)) break;
   3235   current_subject++;
   3236   if (utf8)
   3237     {
   3238     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
   3239       current_subject++;
   3240     }
   3241   if (current_subject > end_subject) break;
   3242 
   3243   /* If we have just passed a CR and we are now at a LF, and the pattern does
   3244   not contain any explicit matches for \r or \n, and the newline option is CRLF
   3245   or ANY or ANYCRLF, advance the match position by one more character. */
   3246 
   3247   if (current_subject[-1] == CHAR_CR &&
   3248       current_subject < end_subject &&
   3249       *current_subject == CHAR_NL &&
   3250       (re->flags & PCRE_HASCRORLF) == 0 &&
   3251         (md->nltype == NLTYPE_ANY ||
   3252          md->nltype == NLTYPE_ANYCRLF ||
   3253          md->nllen == 2))
   3254     current_subject++;
   3255 
   3256   }   /* "Bumpalong" loop */
   3257 
   3258 return PCRE_ERROR_NOMATCH;
   3259 }
   3260 
   3261 /* End of pcre_dfa_exec.c */
   3262