Home | History | Annotate | Download | only in src
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9      Original API code Copyright (c) 1997-2012 University of Cambridge
     10           New API code Copyright (c) 2016-2018 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 /* This module contains functions for scanning a compiled pattern and
     42 collecting data (e.g. minimum matching length). */
     43 
     44 
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 
     49 #include "pcre2_internal.h"
     50 
     51 /* The maximum remembered capturing brackets minimum. */
     52 
     53 #define MAX_CACHE_BACKREF 128
     54 
     55 /* Set a bit in the starting code unit bit map. */
     56 
     57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
     58 
     59 /* Returns from set_start_bits() */
     60 
     61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
     62 
     63 
     64 /*************************************************
     65 *   Find the minimum subject length for a group  *
     66 *************************************************/
     67 
     68 /* Scan a parenthesized group and compute the minimum length of subject that
     69 is needed to match it. This is a lower bound; it does not mean there is a
     70 string of that length that matches. In UTF mode, the result is in characters
     71 rather than code units. The field in a compiled pattern for storing the minimum
     72 length is 16-bits long (on the grounds that anything longer than that is
     73 pathological), so we give up when we reach that amount. This also means that
     74 integer overflow for really crazy patterns cannot happen.
     75 
     76 Backreference minimum lengths are cached to speed up multiple references. This
     77 function is called only when the highest back reference in the pattern is less
     78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
     79 caching vector. The zeroth element contains the number of the highest set
     80 value.
     81 
     82 Arguments:
     83   re              compiled pattern block
     84   code            pointer to start of group (the bracket)
     85   startcode       pointer to start of the whole pattern's code
     86   utf             UTF flag
     87   recurses        chain of recurse_check to catch mutual recursion
     88   countptr        pointer to call count (to catch over complexity)
     89   backref_cache   vector for caching back references.
     90 
     91 Returns:   the minimum length
     92            -1 \C in UTF-8 mode
     93               or (*ACCEPT)
     94               or pattern too complicated
     95               or back reference to duplicate name/number
     96            -2 internal error (missing capturing bracket)
     97            -3 internal error (opcode not listed)
     98 */
     99 
    100 static int
    101 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
    102   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
    103   int *backref_cache)
    104 {
    105 int length = -1;
    106 int prev_cap_recno = -1;
    107 int prev_cap_d = 0;
    108 int prev_recurse_recno = -1;
    109 int prev_recurse_d = 0;
    110 uint32_t once_fudge = 0;
    111 BOOL had_recurse = FALSE;
    112 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
    113 recurse_check this_recurse;
    114 int branchlength = 0;
    115 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
    116 
    117 /* If this is a "could be empty" group, its minimum length is 0. */
    118 
    119 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
    120 
    121 /* Skip over capturing bracket number */
    122 
    123 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
    124 
    125 /* A large and/or complex regex can take too long to process. */
    126 
    127 if ((*countptr)++ > 1000) return -1;
    128 
    129 /* Scan along the opcodes for this branch. If we get to the end of the branch,
    130 check the length against that of the other branches. If the accumulated length
    131 passes 16-bits, stop. */
    132 
    133 for (;;)
    134   {
    135   int d, min, recno;
    136   PCRE2_UCHAR *cs, *ce;
    137   PCRE2_UCHAR op = *cc;
    138 
    139   if (branchlength >= UINT16_MAX) return UINT16_MAX;
    140 
    141   switch (op)
    142     {
    143     case OP_COND:
    144     case OP_SCOND:
    145 
    146     /* If there is only one branch in a condition, the implied branch has zero
    147     length, so we don't add anything. This covers the DEFINE "condition"
    148     automatically. If there are two branches we can treat it the same as any
    149     other non-capturing subpattern. */
    150 
    151     cs = cc + GET(cc, 1);
    152     if (*cs != OP_ALT)
    153       {
    154       cc = cs + 1 + LINK_SIZE;
    155       break;
    156       }
    157     goto PROCESS_NON_CAPTURE;
    158 
    159     case OP_BRA:
    160     /* There's a special case of OP_BRA, when it is wrapped round a repeated
    161     OP_RECURSE. We'd like to process the latter at this level so that
    162     remembering the value works for repeated cases. So we do nothing, but
    163     set a fudge value to skip over the OP_KET after the recurse. */
    164 
    165     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
    166       {
    167       once_fudge = 1 + LINK_SIZE;
    168       cc += 1 + LINK_SIZE;
    169       break;
    170       }
    171     /* Fall through */
    172 
    173     case OP_ONCE:
    174     case OP_SBRA:
    175     case OP_BRAPOS:
    176     case OP_SBRAPOS:
    177     PROCESS_NON_CAPTURE:
    178     d = find_minlength(re, cc, startcode, utf, recurses, countptr,
    179       backref_cache);
    180     if (d < 0) return d;
    181     branchlength += d;
    182     do cc += GET(cc, 1); while (*cc == OP_ALT);
    183     cc += 1 + LINK_SIZE;
    184     break;
    185 
    186     /* To save time for repeated capturing subpatterns, we remember the
    187     length of the previous one. Unfortunately we can't do the same for
    188     the unnumbered ones above. Nor can we do this if (?| is present in the
    189     pattern because captures with the same number are not then identical. */
    190 
    191     case OP_CBRA:
    192     case OP_SCBRA:
    193     case OP_CBRAPOS:
    194     case OP_SCBRAPOS:
    195     recno = (int)GET2(cc, 1+LINK_SIZE);
    196     if (dupcapused || recno != prev_cap_recno)
    197       {
    198       prev_cap_recno = recno;
    199       prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
    200         backref_cache);
    201       if (prev_cap_d < 0) return prev_cap_d;
    202       }
    203     branchlength += prev_cap_d;
    204     do cc += GET(cc, 1); while (*cc == OP_ALT);
    205     cc += 1 + LINK_SIZE;
    206     break;
    207 
    208     /* ACCEPT makes things far too complicated; we have to give up. */
    209 
    210     case OP_ACCEPT:
    211     case OP_ASSERT_ACCEPT:
    212     return -1;
    213 
    214     /* Reached end of a branch; if it's a ket it is the end of a nested
    215     call. If it's ALT it is an alternation in a nested call. If it is END it's
    216     the end of the outer call. All can be handled by the same code. If an
    217     ACCEPT was previously encountered, use the length that was in force at that
    218     time, and pass back the shortest ACCEPT length. */
    219 
    220     case OP_ALT:
    221     case OP_KET:
    222     case OP_KETRMAX:
    223     case OP_KETRMIN:
    224     case OP_KETRPOS:
    225     case OP_END:
    226     if (length < 0 || (!had_recurse && branchlength < length))
    227       length = branchlength;
    228     if (op != OP_ALT) return length;
    229     cc += 1 + LINK_SIZE;
    230     branchlength = 0;
    231     had_recurse = FALSE;
    232     break;
    233 
    234     /* Skip over assertive subpatterns */
    235 
    236     case OP_ASSERT:
    237     case OP_ASSERT_NOT:
    238     case OP_ASSERTBACK:
    239     case OP_ASSERTBACK_NOT:
    240     do cc += GET(cc, 1); while (*cc == OP_ALT);
    241     /* Fall through */
    242 
    243     /* Skip over things that don't match chars */
    244 
    245     case OP_REVERSE:
    246     case OP_CREF:
    247     case OP_DNCREF:
    248     case OP_RREF:
    249     case OP_DNRREF:
    250     case OP_FALSE:
    251     case OP_TRUE:
    252     case OP_CALLOUT:
    253     case OP_SOD:
    254     case OP_SOM:
    255     case OP_EOD:
    256     case OP_EODN:
    257     case OP_CIRC:
    258     case OP_CIRCM:
    259     case OP_DOLL:
    260     case OP_DOLLM:
    261     case OP_NOT_WORD_BOUNDARY:
    262     case OP_WORD_BOUNDARY:
    263     cc += PRIV(OP_lengths)[*cc];
    264     break;
    265 
    266     case OP_CALLOUT_STR:
    267     cc += GET(cc, 1 + 2*LINK_SIZE);
    268     break;
    269 
    270     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
    271 
    272     case OP_BRAZERO:
    273     case OP_BRAMINZERO:
    274     case OP_BRAPOSZERO:
    275     case OP_SKIPZERO:
    276     cc += PRIV(OP_lengths)[*cc];
    277     do cc += GET(cc, 1); while (*cc == OP_ALT);
    278     cc += 1 + LINK_SIZE;
    279     break;
    280 
    281     /* Handle literal characters and + repetitions */
    282 
    283     case OP_CHAR:
    284     case OP_CHARI:
    285     case OP_NOT:
    286     case OP_NOTI:
    287     case OP_PLUS:
    288     case OP_PLUSI:
    289     case OP_MINPLUS:
    290     case OP_MINPLUSI:
    291     case OP_POSPLUS:
    292     case OP_POSPLUSI:
    293     case OP_NOTPLUS:
    294     case OP_NOTPLUSI:
    295     case OP_NOTMINPLUS:
    296     case OP_NOTMINPLUSI:
    297     case OP_NOTPOSPLUS:
    298     case OP_NOTPOSPLUSI:
    299     branchlength++;
    300     cc += 2;
    301 #ifdef SUPPORT_UNICODE
    302     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    303 #endif
    304     break;
    305 
    306     case OP_TYPEPLUS:
    307     case OP_TYPEMINPLUS:
    308     case OP_TYPEPOSPLUS:
    309     branchlength++;
    310     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
    311     break;
    312 
    313     /* Handle exact repetitions. The count is already in characters, but we
    314     may need to skip over a multibyte character in UTF mode.  */
    315 
    316     case OP_EXACT:
    317     case OP_EXACTI:
    318     case OP_NOTEXACT:
    319     case OP_NOTEXACTI:
    320     branchlength += GET2(cc,1);
    321     cc += 2 + IMM2_SIZE;
    322 #ifdef SUPPORT_UNICODE
    323     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    324 #endif
    325     break;
    326 
    327     case OP_TYPEEXACT:
    328     branchlength += GET2(cc,1);
    329     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
    330       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
    331     break;
    332 
    333     /* Handle single-char non-literal matchers */
    334 
    335     case OP_PROP:
    336     case OP_NOTPROP:
    337     cc += 2;
    338     /* Fall through */
    339 
    340     case OP_NOT_DIGIT:
    341     case OP_DIGIT:
    342     case OP_NOT_WHITESPACE:
    343     case OP_WHITESPACE:
    344     case OP_NOT_WORDCHAR:
    345     case OP_WORDCHAR:
    346     case OP_ANY:
    347     case OP_ALLANY:
    348     case OP_EXTUNI:
    349     case OP_HSPACE:
    350     case OP_NOT_HSPACE:
    351     case OP_VSPACE:
    352     case OP_NOT_VSPACE:
    353     branchlength++;
    354     cc++;
    355     break;
    356 
    357     /* "Any newline" might match two characters, but it also might match just
    358     one. */
    359 
    360     case OP_ANYNL:
    361     branchlength += 1;
    362     cc++;
    363     break;
    364 
    365     /* The single-byte matcher means we can't proceed in UTF mode. (In
    366     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
    367     appear, but leave the code, just in case.) */
    368 
    369     case OP_ANYBYTE:
    370 #ifdef SUPPORT_UNICODE
    371     if (utf) return -1;
    372 #endif
    373     branchlength++;
    374     cc++;
    375     break;
    376 
    377     /* For repeated character types, we have to test for \p and \P, which have
    378     an extra two bytes of parameters. */
    379 
    380     case OP_TYPESTAR:
    381     case OP_TYPEMINSTAR:
    382     case OP_TYPEQUERY:
    383     case OP_TYPEMINQUERY:
    384     case OP_TYPEPOSSTAR:
    385     case OP_TYPEPOSQUERY:
    386     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
    387     cc += PRIV(OP_lengths)[op];
    388     break;
    389 
    390     case OP_TYPEUPTO:
    391     case OP_TYPEMINUPTO:
    392     case OP_TYPEPOSUPTO:
    393     if (cc[1 + IMM2_SIZE] == OP_PROP
    394       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
    395     cc += PRIV(OP_lengths)[op];
    396     break;
    397 
    398     /* Check a class for variable quantification */
    399 
    400     case OP_CLASS:
    401     case OP_NCLASS:
    402 #ifdef SUPPORT_WIDE_CHARS
    403     case OP_XCLASS:
    404     /* The original code caused an unsigned overflow in 64 bit systems,
    405     so now we use a conditional statement. */
    406     if (op == OP_XCLASS)
    407       cc += GET(cc, 1);
    408     else
    409       cc += PRIV(OP_lengths)[OP_CLASS];
    410 #else
    411     cc += PRIV(OP_lengths)[OP_CLASS];
    412 #endif
    413 
    414     switch (*cc)
    415       {
    416       case OP_CRPLUS:
    417       case OP_CRMINPLUS:
    418       case OP_CRPOSPLUS:
    419       branchlength++;
    420       /* Fall through */
    421 
    422       case OP_CRSTAR:
    423       case OP_CRMINSTAR:
    424       case OP_CRQUERY:
    425       case OP_CRMINQUERY:
    426       case OP_CRPOSSTAR:
    427       case OP_CRPOSQUERY:
    428       cc++;
    429       break;
    430 
    431       case OP_CRRANGE:
    432       case OP_CRMINRANGE:
    433       case OP_CRPOSRANGE:
    434       branchlength += GET2(cc,1);
    435       cc += 1 + 2 * IMM2_SIZE;
    436       break;
    437 
    438       default:
    439       branchlength++;
    440       break;
    441       }
    442     break;
    443 
    444     /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
    445     way: we find the minimum length for the subpattern. A recursion
    446     (backreference or subroutine) causes an a flag to be set that causes the
    447     length of this branch to be ignored. The logic is that a recursion can only
    448     make sense if there is another alternative that stops the recursing. That
    449     will provide the minimum length (when no recursion happens).
    450 
    451     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
    452     matches an empty string (by default it causes a matching failure), so in
    453     that case we must set the minimum length to zero. */
    454 
    455     /* Duplicate named pattern back reference. We cannot reliably find a length
    456     for this if duplicate numbers are present in the pattern. */
    457 
    458     case OP_DNREF:
    459     case OP_DNREFI:
    460     if (dupcapused) return -1;
    461     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
    462       {
    463       int count = GET2(cc, 1+IMM2_SIZE);
    464       PCRE2_UCHAR *slot =
    465         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
    466           GET2(cc, 1) * re->name_entry_size;
    467 
    468       d = INT_MAX;
    469 
    470       /* Scan all groups with the same name; find the shortest. */
    471 
    472       while (count-- > 0)
    473         {
    474         int dd, i;
    475         recno = GET2(slot, 0);
    476 
    477         if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
    478           dd = backref_cache[recno];
    479         else
    480           {
    481           ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
    482           if (cs == NULL) return -2;
    483           do ce += GET(ce, 1); while (*ce == OP_ALT);
    484           if (cc > cs && cc < ce)    /* Simple recursion */
    485             {
    486             dd = 0;
    487             had_recurse = TRUE;
    488             }
    489           else
    490             {
    491             recurse_check *r = recurses;
    492             for (r = recurses; r != NULL; r = r->prev)
    493               if (r->group == cs) break;
    494             if (r != NULL)           /* Mutual recursion */
    495               {
    496               dd = 0;
    497               had_recurse = TRUE;
    498               }
    499             else
    500               {
    501               this_recurse.prev = recurses;
    502               this_recurse.group = cs;
    503               dd = find_minlength(re, cs, startcode, utf, &this_recurse,
    504                 countptr, backref_cache);
    505               if (dd < 0) return dd;
    506               }
    507             }
    508 
    509           backref_cache[recno] = dd;
    510           for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
    511           backref_cache[0] = recno;
    512           }
    513 
    514         if (dd < d) d = dd;
    515         if (d <= 0) break;    /* No point looking at any more */
    516         slot += re->name_entry_size;
    517         }
    518       }
    519     else d = 0;
    520     cc += 1 + 2*IMM2_SIZE;
    521     goto REPEAT_BACK_REFERENCE;
    522 
    523     /* Single back reference. We cannot find a length for this if duplicate
    524     numbers are present in the pattern. */
    525 
    526     case OP_REF:
    527     case OP_REFI:
    528     if (dupcapused) return -1;
    529     recno = GET2(cc, 1);
    530     if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
    531       d = backref_cache[recno];
    532     else
    533       {
    534       int i;
    535       if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
    536         {
    537         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
    538         if (cs == NULL) return -2;
    539         do ce += GET(ce, 1); while (*ce == OP_ALT);
    540         if (cc > cs && cc < ce)    /* Simple recursion */
    541           {
    542           d = 0;
    543           had_recurse = TRUE;
    544           }
    545         else
    546           {
    547           recurse_check *r = recurses;
    548           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    549           if (r != NULL)           /* Mutual recursion */
    550             {
    551             d = 0;
    552             had_recurse = TRUE;
    553             }
    554           else
    555             {
    556             this_recurse.prev = recurses;
    557             this_recurse.group = cs;
    558             d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
    559               backref_cache);
    560             if (d < 0) return d;
    561             }
    562           }
    563         }
    564       else d = 0;
    565 
    566       backref_cache[recno] = d;
    567       for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
    568       backref_cache[0] = recno;
    569       }
    570 
    571     cc += 1 + IMM2_SIZE;
    572 
    573     /* Handle repeated back references */
    574 
    575     REPEAT_BACK_REFERENCE:
    576     switch (*cc)
    577       {
    578       case OP_CRSTAR:
    579       case OP_CRMINSTAR:
    580       case OP_CRQUERY:
    581       case OP_CRMINQUERY:
    582       case OP_CRPOSSTAR:
    583       case OP_CRPOSQUERY:
    584       min = 0;
    585       cc++;
    586       break;
    587 
    588       case OP_CRPLUS:
    589       case OP_CRMINPLUS:
    590       case OP_CRPOSPLUS:
    591       min = 1;
    592       cc++;
    593       break;
    594 
    595       case OP_CRRANGE:
    596       case OP_CRMINRANGE:
    597       case OP_CRPOSRANGE:
    598       min = GET2(cc, 1);
    599       cc += 1 + 2 * IMM2_SIZE;
    600       break;
    601 
    602       default:
    603       min = 1;
    604       break;
    605       }
    606 
    607      /* Take care not to overflow: (1) min and d are ints, so check that their
    608      product is not greater than INT_MAX. (2) branchlength is limited to
    609      UINT16_MAX (checked at the top of the loop). */
    610 
    611     if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
    612       branchlength = UINT16_MAX;
    613     else branchlength += min * d;
    614     break;
    615 
    616     /* Recursion always refers to the first occurrence of a subpattern with a
    617     given number. Therefore, we can always make use of caching, even when the
    618     pattern contains multiple subpatterns with the same number. */
    619 
    620     case OP_RECURSE:
    621     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
    622     recno = GET2(cs, 1+LINK_SIZE);
    623     if (recno == prev_recurse_recno)
    624       {
    625       branchlength += prev_recurse_d;
    626       }
    627     else
    628       {
    629       do ce += GET(ce, 1); while (*ce == OP_ALT);
    630       if (cc > cs && cc < ce)    /* Simple recursion */
    631         had_recurse = TRUE;
    632       else
    633         {
    634         recurse_check *r = recurses;
    635         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    636         if (r != NULL)          /* Mutual recursion */
    637           had_recurse = TRUE;
    638         else
    639           {
    640           this_recurse.prev = recurses;
    641           this_recurse.group = cs;
    642           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
    643             countptr, backref_cache);
    644           if (prev_recurse_d < 0) return prev_recurse_d;
    645           prev_recurse_recno = recno;
    646           branchlength += prev_recurse_d;
    647           }
    648         }
    649       }
    650     cc += 1 + LINK_SIZE + once_fudge;
    651     once_fudge = 0;
    652     break;
    653 
    654     /* Anything else does not or need not match a character. We can get the
    655     item's length from the table, but for those that can match zero occurrences
    656     of a character, we must take special action for UTF-8 characters. As it
    657     happens, the "NOT" versions of these opcodes are used at present only for
    658     ASCII characters, so they could be omitted from this list. However, in
    659     future that may change, so we include them here so as not to leave a
    660     gotcha for a future maintainer. */
    661 
    662     case OP_UPTO:
    663     case OP_UPTOI:
    664     case OP_NOTUPTO:
    665     case OP_NOTUPTOI:
    666     case OP_MINUPTO:
    667     case OP_MINUPTOI:
    668     case OP_NOTMINUPTO:
    669     case OP_NOTMINUPTOI:
    670     case OP_POSUPTO:
    671     case OP_POSUPTOI:
    672     case OP_NOTPOSUPTO:
    673     case OP_NOTPOSUPTOI:
    674 
    675     case OP_STAR:
    676     case OP_STARI:
    677     case OP_NOTSTAR:
    678     case OP_NOTSTARI:
    679     case OP_MINSTAR:
    680     case OP_MINSTARI:
    681     case OP_NOTMINSTAR:
    682     case OP_NOTMINSTARI:
    683     case OP_POSSTAR:
    684     case OP_POSSTARI:
    685     case OP_NOTPOSSTAR:
    686     case OP_NOTPOSSTARI:
    687 
    688     case OP_QUERY:
    689     case OP_QUERYI:
    690     case OP_NOTQUERY:
    691     case OP_NOTQUERYI:
    692     case OP_MINQUERY:
    693     case OP_MINQUERYI:
    694     case OP_NOTMINQUERY:
    695     case OP_NOTMINQUERYI:
    696     case OP_POSQUERY:
    697     case OP_POSQUERYI:
    698     case OP_NOTPOSQUERY:
    699     case OP_NOTPOSQUERYI:
    700 
    701     cc += PRIV(OP_lengths)[op];
    702 #ifdef SUPPORT_UNICODE
    703     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    704 #endif
    705     break;
    706 
    707     /* Skip these, but we need to add in the name length. */
    708 
    709     case OP_MARK:
    710     case OP_COMMIT_ARG:
    711     case OP_PRUNE_ARG:
    712     case OP_SKIP_ARG:
    713     case OP_THEN_ARG:
    714     cc += PRIV(OP_lengths)[op] + cc[1];
    715     break;
    716 
    717     /* The remaining opcodes are just skipped over. */
    718 
    719     case OP_CLOSE:
    720     case OP_COMMIT:
    721     case OP_FAIL:
    722     case OP_PRUNE:
    723     case OP_SET_SOM:
    724     case OP_SKIP:
    725     case OP_THEN:
    726     cc += PRIV(OP_lengths)[op];
    727     break;
    728 
    729     /* This should not occur: we list all opcodes explicitly so that when
    730     new ones get added they are properly considered. */
    731 
    732     default:
    733     return -3;
    734     }
    735   }
    736 /* Control never gets here */
    737 }
    738 
    739 
    740 
    741 /*************************************************
    742 *      Set a bit and maybe its alternate case    *
    743 *************************************************/
    744 
    745 /* Given a character, set its first code unit's bit in the table, and also the
    746 corresponding bit for the other version of a letter if we are caseless.
    747 
    748 Arguments:
    749   re            points to the regex block
    750   p             points to the first code unit of the character
    751   caseless      TRUE if caseless
    752   utf           TRUE for UTF mode
    753 
    754 Returns:        pointer after the character
    755 */
    756 
    757 static PCRE2_SPTR
    758 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
    759 {
    760 uint32_t c = *p++;   /* First code unit */
    761 (void)utf;           /* Stop compiler warning when UTF not supported */
    762 
    763 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
    764 0xff. */
    765 
    766 #if PCRE2_CODE_UNIT_WIDTH != 8
    767 if (c > 0xff) SET_BIT(0xff); else
    768 #endif
    769 
    770 SET_BIT(c);
    771 
    772 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
    773 the end of the character, even when caseless. */
    774 
    775 #ifdef SUPPORT_UNICODE
    776 if (utf)
    777   {
    778 #if PCRE2_CODE_UNIT_WIDTH == 8
    779   if (c >= 0xc0) GETUTF8INC(c, p);
    780 #elif PCRE2_CODE_UNIT_WIDTH == 16
    781   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
    782 #endif
    783   }
    784 #endif  /* SUPPORT_UNICODE */
    785 
    786 /* If caseless, handle the other case of the character. */
    787 
    788 if (caseless)
    789   {
    790 #ifdef SUPPORT_UNICODE
    791   if (utf)
    792     {
    793 #if PCRE2_CODE_UNIT_WIDTH == 8
    794     PCRE2_UCHAR buff[6];
    795     c = UCD_OTHERCASE(c);
    796     (void)PRIV(ord2utf)(c, buff);
    797     SET_BIT(buff[0]);
    798 #else  /* 16-bit or 32-bit mode */
    799     c = UCD_OTHERCASE(c);
    800     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
    801 #endif
    802     }
    803   else
    804 #endif  /* SUPPORT_UNICODE */
    805 
    806   /* Not UTF */
    807 
    808   if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
    809   }
    810 
    811 return p;
    812 }
    813 
    814 
    815 
    816 /*************************************************
    817 *     Set bits for a positive character type     *
    818 *************************************************/
    819 
    820 /* This function sets starting bits for a character type. In UTF-8 mode, we can
    821 only do a direct setting for bytes less than 128, as otherwise there can be
    822 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
    823 environment, the tables will only recognize ASCII characters anyway, but in at
    824 least one Windows environment, some higher bytes bits were set in the tables.
    825 So we deal with that case by considering the UTF-8 encoding.
    826 
    827 Arguments:
    828   re             the regex block
    829   cbit type      the type of character wanted
    830   table_limit    32 for non-UTF-8; 16 for UTF-8
    831 
    832 Returns:         nothing
    833 */
    834 
    835 static void
    836 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
    837 {
    838 uint32_t c;
    839 for (c = 0; c < table_limit; c++)
    840   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
    841 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
    842 if (table_limit == 32) return;
    843 for (c = 128; c < 256; c++)
    844   {
    845   if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
    846     {
    847     PCRE2_UCHAR buff[6];
    848     (void)PRIV(ord2utf)(c, buff);
    849     SET_BIT(buff[0]);
    850     }
    851   }
    852 #endif  /* UTF-8 */
    853 }
    854 
    855 
    856 /*************************************************
    857 *     Set bits for a negative character type     *
    858 *************************************************/
    859 
    860 /* This function sets starting bits for a negative character type such as \D.
    861 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
    862 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
    863 Unlike in the positive case, where we can set appropriate starting bits for
    864 specific high-valued UTF-8 characters, in this case we have to set the bits for
    865 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
    866 0xc0 (192) for simplicity.
    867 
    868 Arguments:
    869   re             the regex block
    870   cbit type      the type of character wanted
    871   table_limit    32 for non-UTF-8; 16 for UTF-8
    872 
    873 Returns:         nothing
    874 */
    875 
    876 static void
    877 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
    878 {
    879 uint32_t c;
    880 for (c = 0; c < table_limit; c++)
    881   re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
    882 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
    883 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
    884 #endif
    885 }
    886 
    887 
    888 
    889 /*************************************************
    890 *          Create bitmap of starting bytes       *
    891 *************************************************/
    892 
    893 /* This function scans a compiled unanchored expression recursively and
    894 attempts to build a bitmap of the set of possible starting code units whose
    895 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
    896 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
    897 we pass a value of 16 rather than 32 as the final argument. (See comments in
    898 those functions for the reason.)
    899 
    900 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
    901 (a*)b where the group provides some optional starting code units but scanning
    902 must continue at the outer level to find at least one mandatory code unit. At
    903 the outermost level, this function fails unless the result is SSB_DONE.
    904 
    905 Arguments:
    906   re           points to the compiled regex block
    907   code         points to an expression
    908   utf          TRUE if in UTF mode
    909 
    910 Returns:       SSB_FAIL     => Failed to find any starting code units
    911                SSB_DONE     => Found mandatory starting code units
    912                SSB_CONTINUE => Found optional starting code units
    913                SSB_UNKNOWN  => Hit an unrecognized opcode
    914 */
    915 
    916 static int
    917 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
    918 {
    919 uint32_t c;
    920 int yield = SSB_DONE;
    921 
    922 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
    923 int table_limit = utf? 16:32;
    924 #else
    925 int table_limit = 32;
    926 #endif
    927 
    928 do
    929   {
    930   BOOL try_next = TRUE;
    931   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
    932 
    933   if (*code == OP_CBRA || *code == OP_SCBRA ||
    934       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
    935 
    936   while (try_next)    /* Loop for items in this branch */
    937     {
    938     int rc;
    939     uint8_t *classmap = NULL;
    940 
    941     switch(*tcode)
    942       {
    943       /* If we reach something we don't understand, it means a new opcode has
    944       been created that hasn't been added to this function. Hopefully this
    945       problem will be discovered during testing. */
    946 
    947       default:
    948       return SSB_UNKNOWN;
    949 
    950       /* Fail for a valid opcode that implies no starting bits. */
    951 
    952       case OP_ACCEPT:
    953       case OP_ASSERT_ACCEPT:
    954       case OP_ALLANY:
    955       case OP_ANY:
    956       case OP_ANYBYTE:
    957       case OP_CIRCM:
    958       case OP_CLOSE:
    959       case OP_COMMIT:
    960       case OP_COMMIT_ARG:
    961       case OP_COND:
    962       case OP_CREF:
    963       case OP_FALSE:
    964       case OP_TRUE:
    965       case OP_DNCREF:
    966       case OP_DNREF:
    967       case OP_DNREFI:
    968       case OP_DNRREF:
    969       case OP_DOLL:
    970       case OP_DOLLM:
    971       case OP_END:
    972       case OP_EOD:
    973       case OP_EODN:
    974       case OP_EXTUNI:
    975       case OP_FAIL:
    976       case OP_MARK:
    977       case OP_NOT:
    978       case OP_NOTEXACT:
    979       case OP_NOTEXACTI:
    980       case OP_NOTI:
    981       case OP_NOTMINPLUS:
    982       case OP_NOTMINPLUSI:
    983       case OP_NOTMINQUERY:
    984       case OP_NOTMINQUERYI:
    985       case OP_NOTMINSTAR:
    986       case OP_NOTMINSTARI:
    987       case OP_NOTMINUPTO:
    988       case OP_NOTMINUPTOI:
    989       case OP_NOTPLUS:
    990       case OP_NOTPLUSI:
    991       case OP_NOTPOSPLUS:
    992       case OP_NOTPOSPLUSI:
    993       case OP_NOTPOSQUERY:
    994       case OP_NOTPOSQUERYI:
    995       case OP_NOTPOSSTAR:
    996       case OP_NOTPOSSTARI:
    997       case OP_NOTPOSUPTO:
    998       case OP_NOTPOSUPTOI:
    999       case OP_NOTPROP:
   1000       case OP_NOTQUERY:
   1001       case OP_NOTQUERYI:
   1002       case OP_NOTSTAR:
   1003       case OP_NOTSTARI:
   1004       case OP_NOTUPTO:
   1005       case OP_NOTUPTOI:
   1006       case OP_NOT_HSPACE:
   1007       case OP_NOT_VSPACE:
   1008       case OP_PRUNE:
   1009       case OP_PRUNE_ARG:
   1010       case OP_RECURSE:
   1011       case OP_REF:
   1012       case OP_REFI:
   1013       case OP_REVERSE:
   1014       case OP_RREF:
   1015       case OP_SCOND:
   1016       case OP_SET_SOM:
   1017       case OP_SKIP:
   1018       case OP_SKIP_ARG:
   1019       case OP_SOD:
   1020       case OP_SOM:
   1021       case OP_THEN:
   1022       case OP_THEN_ARG:
   1023       return SSB_FAIL;
   1024 
   1025       /* OP_CIRC happens only at the start of an anchored branch (multiline ^
   1026       uses OP_CIRCM). Skip over it. */
   1027 
   1028       case OP_CIRC:
   1029       tcode += PRIV(OP_lengths)[OP_CIRC];
   1030       break;
   1031 
   1032       /* A "real" property test implies no starting bits, but the fake property
   1033       PT_CLIST identifies a list of characters. These lists are short, as they
   1034       are used for characters with more than one "other case", so there is no
   1035       point in recognizing them for OP_NOTPROP. */
   1036 
   1037       case OP_PROP:
   1038       if (tcode[1] != PT_CLIST) return SSB_FAIL;
   1039         {
   1040         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
   1041         while ((c = *p++) < NOTACHAR)
   1042           {
   1043 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
   1044           if (utf)
   1045             {
   1046             PCRE2_UCHAR buff[6];
   1047             (void)PRIV(ord2utf)(c, buff);
   1048             c = buff[0];
   1049             }
   1050 #endif
   1051           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
   1052           }
   1053         }
   1054       try_next = FALSE;
   1055       break;
   1056 
   1057       /* We can ignore word boundary tests. */
   1058 
   1059       case OP_WORD_BOUNDARY:
   1060       case OP_NOT_WORD_BOUNDARY:
   1061       tcode++;
   1062       break;
   1063 
   1064       /* If we hit a bracket or a positive lookahead assertion, recurse to set
   1065       bits from within the subpattern. If it can't find anything, we have to
   1066       give up. If it finds some mandatory character(s), we are done for this
   1067       branch. Otherwise, carry on scanning after the subpattern. */
   1068 
   1069       case OP_BRA:
   1070       case OP_SBRA:
   1071       case OP_CBRA:
   1072       case OP_SCBRA:
   1073       case OP_BRAPOS:
   1074       case OP_SBRAPOS:
   1075       case OP_CBRAPOS:
   1076       case OP_SCBRAPOS:
   1077       case OP_ONCE:
   1078       case OP_ASSERT:
   1079       rc = set_start_bits(re, tcode, utf);
   1080       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
   1081       if (rc == SSB_DONE) try_next = FALSE; else
   1082         {
   1083         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
   1084         tcode += 1 + LINK_SIZE;
   1085         }
   1086       break;
   1087 
   1088       /* If we hit ALT or KET, it means we haven't found anything mandatory in
   1089       this branch, though we might have found something optional. For ALT, we
   1090       continue with the next alternative, but we have to arrange that the final
   1091       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
   1092       return SSB_CONTINUE: if this is the top level, that indicates failure,
   1093       but after a nested subpattern, it causes scanning to continue. */
   1094 
   1095       case OP_ALT:
   1096       yield = SSB_CONTINUE;
   1097       try_next = FALSE;
   1098       break;
   1099 
   1100       case OP_KET:
   1101       case OP_KETRMAX:
   1102       case OP_KETRMIN:
   1103       case OP_KETRPOS:
   1104       return SSB_CONTINUE;
   1105 
   1106       /* Skip over callout */
   1107 
   1108       case OP_CALLOUT:
   1109       tcode += PRIV(OP_lengths)[OP_CALLOUT];
   1110       break;
   1111 
   1112       case OP_CALLOUT_STR:
   1113       tcode += GET(tcode, 1 + 2*LINK_SIZE);
   1114       break;
   1115 
   1116       /* Skip over lookbehind and negative lookahead assertions */
   1117 
   1118       case OP_ASSERT_NOT:
   1119       case OP_ASSERTBACK:
   1120       case OP_ASSERTBACK_NOT:
   1121       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
   1122       tcode += 1 + LINK_SIZE;
   1123       break;
   1124 
   1125       /* BRAZERO does the bracket, but carries on. */
   1126 
   1127       case OP_BRAZERO:
   1128       case OP_BRAMINZERO:
   1129       case OP_BRAPOSZERO:
   1130       rc = set_start_bits(re, ++tcode, utf);
   1131       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
   1132       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
   1133       tcode += 1 + LINK_SIZE;
   1134       break;
   1135 
   1136       /* SKIPZERO skips the bracket. */
   1137 
   1138       case OP_SKIPZERO:
   1139       tcode++;
   1140       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
   1141       tcode += 1 + LINK_SIZE;
   1142       break;
   1143 
   1144       /* Single-char * or ? sets the bit and tries the next item */
   1145 
   1146       case OP_STAR:
   1147       case OP_MINSTAR:
   1148       case OP_POSSTAR:
   1149       case OP_QUERY:
   1150       case OP_MINQUERY:
   1151       case OP_POSQUERY:
   1152       tcode = set_table_bit(re, tcode + 1, FALSE, utf);
   1153       break;
   1154 
   1155       case OP_STARI:
   1156       case OP_MINSTARI:
   1157       case OP_POSSTARI:
   1158       case OP_QUERYI:
   1159       case OP_MINQUERYI:
   1160       case OP_POSQUERYI:
   1161       tcode = set_table_bit(re, tcode + 1, TRUE, utf);
   1162       break;
   1163 
   1164       /* Single-char upto sets the bit and tries the next */
   1165 
   1166       case OP_UPTO:
   1167       case OP_MINUPTO:
   1168       case OP_POSUPTO:
   1169       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
   1170       break;
   1171 
   1172       case OP_UPTOI:
   1173       case OP_MINUPTOI:
   1174       case OP_POSUPTOI:
   1175       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
   1176       break;
   1177 
   1178       /* At least one single char sets the bit and stops */
   1179 
   1180       case OP_EXACT:
   1181       tcode += IMM2_SIZE;
   1182       /* Fall through */
   1183       case OP_CHAR:
   1184       case OP_PLUS:
   1185       case OP_MINPLUS:
   1186       case OP_POSPLUS:
   1187       (void)set_table_bit(re, tcode + 1, FALSE, utf);
   1188       try_next = FALSE;
   1189       break;
   1190 
   1191       case OP_EXACTI:
   1192       tcode += IMM2_SIZE;
   1193       /* Fall through */
   1194       case OP_CHARI:
   1195       case OP_PLUSI:
   1196       case OP_MINPLUSI:
   1197       case OP_POSPLUSI:
   1198       (void)set_table_bit(re, tcode + 1, TRUE, utf);
   1199       try_next = FALSE;
   1200       break;
   1201 
   1202       /* Special spacing and line-terminating items. These recognize specific
   1203       lists of characters. The difference between VSPACE and ANYNL is that the
   1204       latter can match the two-character CRLF sequence, but that is not
   1205       relevant for finding the first character, so their code here is
   1206       identical. */
   1207 
   1208       case OP_HSPACE:
   1209       SET_BIT(CHAR_HT);
   1210       SET_BIT(CHAR_SPACE);
   1211 
   1212       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1213       the bits for 0xA0 and for code units >= 255, independently of UTF. */
   1214 
   1215 #if PCRE2_CODE_UNIT_WIDTH != 8
   1216       SET_BIT(0xA0);
   1217       SET_BIT(0xFF);
   1218 #else
   1219       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1220       units of horizontal space characters. */
   1221 
   1222 #ifdef SUPPORT_UNICODE
   1223       if (utf)
   1224         {
   1225         SET_BIT(0xC2);  /* For U+00A0 */
   1226         SET_BIT(0xE1);  /* For U+1680, U+180E */
   1227         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
   1228         SET_BIT(0xE3);  /* For U+3000 */
   1229         }
   1230       else
   1231 #endif
   1232       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
   1233       the code is EBCDIC. */
   1234         {
   1235 #ifndef EBCDIC
   1236         SET_BIT(0xA0);
   1237 #endif  /* Not EBCDIC */
   1238         }
   1239 #endif  /* 8-bit support */
   1240 
   1241       try_next = FALSE;
   1242       break;
   1243 
   1244       case OP_ANYNL:
   1245       case OP_VSPACE:
   1246       SET_BIT(CHAR_LF);
   1247       SET_BIT(CHAR_VT);
   1248       SET_BIT(CHAR_FF);
   1249       SET_BIT(CHAR_CR);
   1250 
   1251       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1252       the bits for NEL and for code units >= 255, independently of UTF. */
   1253 
   1254 #if PCRE2_CODE_UNIT_WIDTH != 8
   1255       SET_BIT(CHAR_NEL);
   1256       SET_BIT(0xFF);
   1257 #else
   1258       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1259       units of vertical space characters. */
   1260 
   1261 #ifdef SUPPORT_UNICODE
   1262       if (utf)
   1263         {
   1264         SET_BIT(0xC2);  /* For U+0085 (NEL) */
   1265         SET_BIT(0xE2);  /* For U+2028, U+2029 */
   1266         }
   1267       else
   1268 #endif
   1269       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
   1270         {
   1271         SET_BIT(CHAR_NEL);
   1272         }
   1273 #endif  /* 8-bit support */
   1274 
   1275       try_next = FALSE;
   1276       break;
   1277 
   1278       /* Single character types set the bits and stop. Note that if PCRE2_UCP
   1279       is set, we do not see these opcodes because \d etc are converted to
   1280       properties. Therefore, these apply in the case when only characters less
   1281       than 256 are recognized to match the types. */
   1282 
   1283       case OP_NOT_DIGIT:
   1284       set_nottype_bits(re, cbit_digit, table_limit);
   1285       try_next = FALSE;
   1286       break;
   1287 
   1288       case OP_DIGIT:
   1289       set_type_bits(re, cbit_digit, table_limit);
   1290       try_next = FALSE;
   1291       break;
   1292 
   1293       case OP_NOT_WHITESPACE:
   1294       set_nottype_bits(re, cbit_space, table_limit);
   1295       try_next = FALSE;
   1296       break;
   1297 
   1298       case OP_WHITESPACE:
   1299       set_type_bits(re, cbit_space, table_limit);
   1300       try_next = FALSE;
   1301       break;
   1302 
   1303       case OP_NOT_WORDCHAR:
   1304       set_nottype_bits(re, cbit_word, table_limit);
   1305       try_next = FALSE;
   1306       break;
   1307 
   1308       case OP_WORDCHAR:
   1309       set_type_bits(re, cbit_word, table_limit);
   1310       try_next = FALSE;
   1311       break;
   1312 
   1313       /* One or more character type fudges the pointer and restarts, knowing
   1314       it will hit a single character type and stop there. */
   1315 
   1316       case OP_TYPEPLUS:
   1317       case OP_TYPEMINPLUS:
   1318       case OP_TYPEPOSPLUS:
   1319       tcode++;
   1320       break;
   1321 
   1322       case OP_TYPEEXACT:
   1323       tcode += 1 + IMM2_SIZE;
   1324       break;
   1325 
   1326       /* Zero or more repeats of character types set the bits and then
   1327       try again. */
   1328 
   1329       case OP_TYPEUPTO:
   1330       case OP_TYPEMINUPTO:
   1331       case OP_TYPEPOSUPTO:
   1332       tcode += IMM2_SIZE;  /* Fall through */
   1333 
   1334       case OP_TYPESTAR:
   1335       case OP_TYPEMINSTAR:
   1336       case OP_TYPEPOSSTAR:
   1337       case OP_TYPEQUERY:
   1338       case OP_TYPEMINQUERY:
   1339       case OP_TYPEPOSQUERY:
   1340       switch(tcode[1])
   1341         {
   1342         default:
   1343         case OP_ANY:
   1344         case OP_ALLANY:
   1345         return SSB_FAIL;
   1346 
   1347         case OP_HSPACE:
   1348         SET_BIT(CHAR_HT);
   1349         SET_BIT(CHAR_SPACE);
   1350 
   1351         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1352         the bits for 0xA0 and for code units >= 255, independently of UTF. */
   1353 
   1354 #if PCRE2_CODE_UNIT_WIDTH != 8
   1355         SET_BIT(0xA0);
   1356         SET_BIT(0xFF);
   1357 #else
   1358         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1359         units of horizontal space characters. */
   1360 
   1361 #ifdef SUPPORT_UNICODE
   1362         if (utf)
   1363           {
   1364           SET_BIT(0xC2);  /* For U+00A0 */
   1365           SET_BIT(0xE1);  /* For U+1680, U+180E */
   1366           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
   1367           SET_BIT(0xE3);  /* For U+3000 */
   1368           }
   1369         else
   1370 #endif
   1371         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
   1372         the code is EBCDIC. */
   1373           {
   1374 #ifndef EBCDIC
   1375           SET_BIT(0xA0);
   1376 #endif  /* Not EBCDIC */
   1377           }
   1378 #endif  /* 8-bit support */
   1379         break;
   1380 
   1381         case OP_ANYNL:
   1382         case OP_VSPACE:
   1383         SET_BIT(CHAR_LF);
   1384         SET_BIT(CHAR_VT);
   1385         SET_BIT(CHAR_FF);
   1386         SET_BIT(CHAR_CR);
   1387 
   1388         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1389         the bits for NEL and for code units >= 255, independently of UTF. */
   1390 
   1391 #if PCRE2_CODE_UNIT_WIDTH != 8
   1392         SET_BIT(CHAR_NEL);
   1393         SET_BIT(0xFF);
   1394 #else
   1395         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1396         units of vertical space characters. */
   1397 
   1398 #ifdef SUPPORT_UNICODE
   1399         if (utf)
   1400           {
   1401           SET_BIT(0xC2);  /* For U+0085 (NEL) */
   1402           SET_BIT(0xE2);  /* For U+2028, U+2029 */
   1403           }
   1404         else
   1405 #endif
   1406         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
   1407           {
   1408           SET_BIT(CHAR_NEL);
   1409           }
   1410 #endif  /* 8-bit support */
   1411         break;
   1412 
   1413         case OP_NOT_DIGIT:
   1414         set_nottype_bits(re, cbit_digit, table_limit);
   1415         break;
   1416 
   1417         case OP_DIGIT:
   1418         set_type_bits(re, cbit_digit, table_limit);
   1419         break;
   1420 
   1421         case OP_NOT_WHITESPACE:
   1422         set_nottype_bits(re, cbit_space, table_limit);
   1423         break;
   1424 
   1425         case OP_WHITESPACE:
   1426         set_type_bits(re, cbit_space, table_limit);
   1427         break;
   1428 
   1429         case OP_NOT_WORDCHAR:
   1430         set_nottype_bits(re, cbit_word, table_limit);
   1431         break;
   1432 
   1433         case OP_WORDCHAR:
   1434         set_type_bits(re, cbit_word, table_limit);
   1435         break;
   1436         }
   1437 
   1438       tcode += 2;
   1439       break;
   1440 
   1441       /* Extended class: if there are any property checks, or if this is a
   1442       negative XCLASS without a map, give up. If there are no property checks,
   1443       there must be wide characters on the XCLASS list, because otherwise an
   1444       XCLASS would not have been created. This means that code points >= 255
   1445       are always potential starters. */
   1446 
   1447 #ifdef SUPPORT_WIDE_CHARS
   1448       case OP_XCLASS:
   1449       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
   1450           (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
   1451         return SSB_FAIL;
   1452 
   1453       /* We have a positive XCLASS or a negative one without a map. Set up the
   1454       map pointer if there is one, and fall through. */
   1455 
   1456       classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
   1457         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
   1458 #endif
   1459       /* It seems that the fall through comment must be outside the #ifdef if
   1460       it is to avoid the gcc compiler warning. */
   1461 
   1462       /* Fall through */
   1463 
   1464       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
   1465       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
   1466       because it starts a character with a value > 255. In 8-bit non-UTF mode,
   1467       there is no difference between CLASS and NCLASS. In all other wide
   1468       character modes, set the 0xFF bit to indicate code units >= 255. */
   1469 
   1470       case OP_NCLASS:
   1471 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
   1472       if (utf)
   1473         {
   1474         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
   1475         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
   1476         }
   1477 #elif PCRE2_CODE_UNIT_WIDTH != 8
   1478       SET_BIT(0xFF);                             /* For characters >= 255 */
   1479 #endif
   1480       /* Fall through */
   1481 
   1482       /* Enter here for a positive non-XCLASS. If we have fallen through from
   1483       an XCLASS, classmap will already be set; just advance the code pointer.
   1484       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
   1485 
   1486       case OP_CLASS:
   1487       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
   1488         {
   1489         classmap = (uint8_t *)(++tcode);
   1490         tcode += 32 / sizeof(PCRE2_UCHAR);
   1491         }
   1492 
   1493       /* When wide characters are supported, classmap may be NULL. In UTF-8
   1494       (sic) mode, the bits in a class bit map correspond to character values,
   1495       not to byte values. However, the bit map we are constructing is for byte
   1496       values. So we have to do a conversion for characters whose code point is
   1497       greater than 127. In fact, there are only two possible starting bytes for
   1498       characters in the range 128 - 255. */
   1499 
   1500       if (classmap != NULL)
   1501         {
   1502 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
   1503         if (utf)
   1504           {
   1505           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
   1506           for (c = 128; c < 256; c++)
   1507             {
   1508             if ((classmap[c/8] & (1 << (c&7))) != 0)
   1509               {
   1510               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
   1511               re->start_bitmap[d/8] |= (1 << (d&7));  /* and then skip on to the */
   1512               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
   1513               }
   1514             }
   1515           }
   1516         else
   1517 #endif
   1518         /* In all modes except UTF-8, the two bit maps are compatible. */
   1519 
   1520           {
   1521           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
   1522           }
   1523         }
   1524 
   1525       /* Act on what follows the class. For a zero minimum repeat, continue;
   1526       otherwise stop processing. */
   1527 
   1528       switch (*tcode)
   1529         {
   1530         case OP_CRSTAR:
   1531         case OP_CRMINSTAR:
   1532         case OP_CRQUERY:
   1533         case OP_CRMINQUERY:
   1534         case OP_CRPOSSTAR:
   1535         case OP_CRPOSQUERY:
   1536         tcode++;
   1537         break;
   1538 
   1539         case OP_CRRANGE:
   1540         case OP_CRMINRANGE:
   1541         case OP_CRPOSRANGE:
   1542         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
   1543           else try_next = FALSE;
   1544         break;
   1545 
   1546         default:
   1547         try_next = FALSE;
   1548         break;
   1549         }
   1550       break; /* End of class handling case */
   1551       }      /* End of switch for opcodes */
   1552     }        /* End of try_next loop */
   1553 
   1554   code += GET(code, 1);   /* Advance to next branch */
   1555   }
   1556 while (*code == OP_ALT);
   1557 
   1558 return yield;
   1559 }
   1560 
   1561 
   1562 
   1563 /*************************************************
   1564 *          Study a compiled expression           *
   1565 *************************************************/
   1566 
   1567 /* This function is handed a compiled expression that it must study to produce
   1568 information that will speed up the matching.
   1569 
   1570 Argument:  points to the compiled expression
   1571 Returns:   0 normally; non-zero should never normally occur
   1572            1 unknown opcode in set_start_bits
   1573            2 missing capturing bracket
   1574            3 unknown opcode in find_minlength
   1575 */
   1576 
   1577 int
   1578 PRIV(study)(pcre2_real_code *re)
   1579 {
   1580 int min;
   1581 int count = 0;
   1582 PCRE2_UCHAR *code;
   1583 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
   1584 
   1585 /* Find start of compiled code */
   1586 
   1587 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
   1588   re->name_entry_size * re->name_count;
   1589 
   1590 /* For a pattern that has a first code unit, or a multiline pattern that
   1591 matches only at "line start", there is no point in seeking a list of starting
   1592 code units. */
   1593 
   1594 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
   1595   {
   1596   int rc = set_start_bits(re, code, utf);
   1597   if (rc == SSB_UNKNOWN) return 1;
   1598   if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
   1599   }
   1600 
   1601 /* Find the minimum length of subject string. If the pattern can match an empty
   1602 string, the minimum length is already known. If there are more back references
   1603 than the size of the vector we are going to cache them in, do nothing. A
   1604 pattern that complicated will probably take a long time to analyze and may in
   1605 any case turn out to be too complicated. Note that back reference minima are
   1606 held as 16-bit numbers. */
   1607 
   1608 if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
   1609      re->top_backref <= MAX_CACHE_BACKREF)
   1610   {
   1611   int backref_cache[MAX_CACHE_BACKREF+1];
   1612   backref_cache[0] = 0;    /* Highest one that is set */
   1613   min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
   1614   switch(min)
   1615     {
   1616     case -1:  /* \C in UTF mode or (*ACCEPT) or over-complex regex */
   1617     break;    /* Leave minlength unchanged (will be zero) */
   1618 
   1619     case -2:
   1620     return 2; /* missing capturing bracket */
   1621 
   1622     case -3:
   1623     return 3; /* unrecognized opcode */
   1624 
   1625     default:
   1626     if (min > UINT16_MAX) min = UINT16_MAX;
   1627     re->minlength = min;
   1628     break;
   1629     }
   1630   }
   1631 
   1632 return 0;
   1633 }
   1634 
   1635 /* End of pcre2_study.c */
   1636