Home | History | Annotate | Download | only in src
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9      Original API code Copyright (c) 1997-2012 University of Cambridge
     10          New API code Copyright (c) 2016 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 /* This module contains functions for scanning a compiled pattern and
     42 collecting data (e.g. minimum matching length). */
     43 
     44 
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 
     49 
     50 #include "pcre2_internal.h"
     51 
     52 
     53 /* Set a bit in the starting code unit bit map. */
     54 
     55 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
     56 
     57 /* Returns from set_start_bits() */
     58 
     59 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
     60 
     61 
     62 /*************************************************
     63 *   Find the minimum subject length for a group  *
     64 *************************************************/
     65 
     66 /* Scan a parenthesized group and compute the minimum length of subject that
     67 is needed to match it. This is a lower bound; it does not mean there is a
     68 string of that length that matches. In UTF mode, the result is in characters
     69 rather than code units. The field in a compiled pattern for storing the minimum
     70 length is 16-bits long (on the grounds that anything longer than that is
     71 pathological), so we give up when we reach that amount. This also means that
     72 integer overflow for really crazy patterns cannot happen.
     73 
     74 Arguments:
     75   re              compiled pattern block
     76   code            pointer to start of group (the bracket)
     77   startcode       pointer to start of the whole pattern's code
     78   utf             UTF flag
     79   recurses        chain of recurse_check to catch mutual recursion
     80   countptr        pointer to call count (to catch over complexity)
     81 
     82 Returns:   the minimum length
     83            -1 \C in UTF-8 mode
     84               or (*ACCEPT)
     85               or pattern too complicated
     86               or back reference to duplicate name/number
     87            -2 internal error (missing capturing bracket)
     88            -3 internal error (opcode not listed)
     89 */
     90 
     91 static int
     92 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
     93   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
     94 {
     95 int length = -1;
     96 int prev_cap_recno = -1;
     97 int prev_cap_d = 0;
     98 int prev_recurse_recno = -1;
     99 int prev_recurse_d = 0;
    100 uint32_t once_fudge = 0;
    101 BOOL had_recurse = FALSE;
    102 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
    103 recurse_check this_recurse;
    104 register int branchlength = 0;
    105 register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
    106 
    107 /* If this is a "could be empty" group, its minimum length is 0. */
    108 
    109 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
    110 
    111 /* Skip over capturing bracket number */
    112 
    113 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
    114 
    115 /* A large and/or complex regex can take too long to process. */
    116 
    117 if ((*countptr)++ > 1000) return -1;
    118 
    119 /* Scan along the opcodes for this branch. If we get to the end of the branch,
    120 check the length against that of the other branches. If the accumulated length
    121 passes 16-bits, stop. */
    122 
    123 for (;;)
    124   {
    125   int d, min, recno;
    126   PCRE2_UCHAR *cs, *ce;
    127   register PCRE2_UCHAR op = *cc;
    128 
    129   if (branchlength >= UINT16_MAX) return UINT16_MAX;
    130 
    131   switch (op)
    132     {
    133     case OP_COND:
    134     case OP_SCOND:
    135 
    136     /* If there is only one branch in a condition, the implied branch has zero
    137     length, so we don't add anything. This covers the DEFINE "condition"
    138     automatically. If there are two branches we can treat it the same as any
    139     other non-capturing subpattern. */
    140 
    141     cs = cc + GET(cc, 1);
    142     if (*cs != OP_ALT)
    143       {
    144       cc = cs + 1 + LINK_SIZE;
    145       break;
    146       }
    147     goto PROCESS_NON_CAPTURE;
    148 
    149     /* There's a special case of OP_ONCE, when it is wrapped round an
    150     OP_RECURSE. We'd like to process the latter at this level so that
    151     remembering the value works for repeated cases. So we do nothing, but
    152     set a fudge value to skip over the OP_KET after the recurse. */
    153 
    154     case OP_ONCE:
    155     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
    156       {
    157       once_fudge = 1 + LINK_SIZE;
    158       cc += 1 + LINK_SIZE;
    159       break;
    160       }
    161     /* Fall through */
    162 
    163     case OP_ONCE_NC:
    164     case OP_BRA:
    165     case OP_SBRA:
    166     case OP_BRAPOS:
    167     case OP_SBRAPOS:
    168     PROCESS_NON_CAPTURE:
    169     d = find_minlength(re, cc, startcode, utf, recurses, countptr);
    170     if (d < 0) return d;
    171     branchlength += d;
    172     do cc += GET(cc, 1); while (*cc == OP_ALT);
    173     cc += 1 + LINK_SIZE;
    174     break;
    175 
    176     /* To save time for repeated capturing subpatterns, we remember the
    177     length of the previous one. Unfortunately we can't do the same for
    178     the unnumbered ones above. Nor can we do this if (?| is present in the
    179     pattern because captures with the same number are not then identical. */
    180 
    181     case OP_CBRA:
    182     case OP_SCBRA:
    183     case OP_CBRAPOS:
    184     case OP_SCBRAPOS:
    185     recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
    186     if (recno != prev_cap_recno)
    187       {
    188       prev_cap_recno = recno;
    189       prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
    190       if (prev_cap_d < 0) return prev_cap_d;
    191       }
    192     branchlength += prev_cap_d;
    193     do cc += GET(cc, 1); while (*cc == OP_ALT);
    194     cc += 1 + LINK_SIZE;
    195     break;
    196 
    197     /* ACCEPT makes things far too complicated; we have to give up. */
    198 
    199     case OP_ACCEPT:
    200     case OP_ASSERT_ACCEPT:
    201     return -1;
    202 
    203     /* Reached end of a branch; if it's a ket it is the end of a nested
    204     call. If it's ALT it is an alternation in a nested call. If it is END it's
    205     the end of the outer call. All can be handled by the same code. If an
    206     ACCEPT was previously encountered, use the length that was in force at that
    207     time, and pass back the shortest ACCEPT length. */
    208 
    209     case OP_ALT:
    210     case OP_KET:
    211     case OP_KETRMAX:
    212     case OP_KETRMIN:
    213     case OP_KETRPOS:
    214     case OP_END:
    215     if (length < 0 || (!had_recurse && branchlength < length))
    216       length = branchlength;
    217     if (op != OP_ALT) return length;
    218     cc += 1 + LINK_SIZE;
    219     branchlength = 0;
    220     had_recurse = FALSE;
    221     break;
    222 
    223     /* Skip over assertive subpatterns */
    224 
    225     case OP_ASSERT:
    226     case OP_ASSERT_NOT:
    227     case OP_ASSERTBACK:
    228     case OP_ASSERTBACK_NOT:
    229     do cc += GET(cc, 1); while (*cc == OP_ALT);
    230     /* Fall through */
    231 
    232     /* Skip over things that don't match chars */
    233 
    234     case OP_REVERSE:
    235     case OP_CREF:
    236     case OP_DNCREF:
    237     case OP_RREF:
    238     case OP_DNRREF:
    239     case OP_FALSE:
    240     case OP_TRUE:
    241     case OP_CALLOUT:
    242     case OP_SOD:
    243     case OP_SOM:
    244     case OP_EOD:
    245     case OP_EODN:
    246     case OP_CIRC:
    247     case OP_CIRCM:
    248     case OP_DOLL:
    249     case OP_DOLLM:
    250     case OP_NOT_WORD_BOUNDARY:
    251     case OP_WORD_BOUNDARY:
    252     cc += PRIV(OP_lengths)[*cc];
    253     break;
    254 
    255     case OP_CALLOUT_STR:
    256     cc += GET(cc, 1 + 2*LINK_SIZE);
    257     break;
    258 
    259     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
    260 
    261     case OP_BRAZERO:
    262     case OP_BRAMINZERO:
    263     case OP_BRAPOSZERO:
    264     case OP_SKIPZERO:
    265     cc += PRIV(OP_lengths)[*cc];
    266     do cc += GET(cc, 1); while (*cc == OP_ALT);
    267     cc += 1 + LINK_SIZE;
    268     break;
    269 
    270     /* Handle literal characters and + repetitions */
    271 
    272     case OP_CHAR:
    273     case OP_CHARI:
    274     case OP_NOT:
    275     case OP_NOTI:
    276     case OP_PLUS:
    277     case OP_PLUSI:
    278     case OP_MINPLUS:
    279     case OP_MINPLUSI:
    280     case OP_POSPLUS:
    281     case OP_POSPLUSI:
    282     case OP_NOTPLUS:
    283     case OP_NOTPLUSI:
    284     case OP_NOTMINPLUS:
    285     case OP_NOTMINPLUSI:
    286     case OP_NOTPOSPLUS:
    287     case OP_NOTPOSPLUSI:
    288     branchlength++;
    289     cc += 2;
    290 #ifdef SUPPORT_UNICODE
    291     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    292 #endif
    293     break;
    294 
    295     case OP_TYPEPLUS:
    296     case OP_TYPEMINPLUS:
    297     case OP_TYPEPOSPLUS:
    298     branchlength++;
    299     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
    300     break;
    301 
    302     /* Handle exact repetitions. The count is already in characters, but we
    303     may need to skip over a multibyte character in UTF mode.  */
    304 
    305     case OP_EXACT:
    306     case OP_EXACTI:
    307     case OP_NOTEXACT:
    308     case OP_NOTEXACTI:
    309     branchlength += GET2(cc,1);
    310     cc += 2 + IMM2_SIZE;
    311 #ifdef SUPPORT_UNICODE
    312     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    313 #endif
    314     break;
    315 
    316     case OP_TYPEEXACT:
    317     branchlength += GET2(cc,1);
    318     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
    319       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
    320     break;
    321 
    322     /* Handle single-char non-literal matchers */
    323 
    324     case OP_PROP:
    325     case OP_NOTPROP:
    326     cc += 2;
    327     /* Fall through */
    328 
    329     case OP_NOT_DIGIT:
    330     case OP_DIGIT:
    331     case OP_NOT_WHITESPACE:
    332     case OP_WHITESPACE:
    333     case OP_NOT_WORDCHAR:
    334     case OP_WORDCHAR:
    335     case OP_ANY:
    336     case OP_ALLANY:
    337     case OP_EXTUNI:
    338     case OP_HSPACE:
    339     case OP_NOT_HSPACE:
    340     case OP_VSPACE:
    341     case OP_NOT_VSPACE:
    342     branchlength++;
    343     cc++;
    344     break;
    345 
    346     /* "Any newline" might match two characters, but it also might match just
    347     one. */
    348 
    349     case OP_ANYNL:
    350     branchlength += 1;
    351     cc++;
    352     break;
    353 
    354     /* The single-byte matcher means we can't proceed in UTF mode. (In
    355     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
    356     appear, but leave the code, just in case.) */
    357 
    358     case OP_ANYBYTE:
    359 #ifdef SUPPORT_UNICODE
    360     if (utf) return -1;
    361 #endif
    362     branchlength++;
    363     cc++;
    364     break;
    365 
    366     /* For repeated character types, we have to test for \p and \P, which have
    367     an extra two bytes of parameters. */
    368 
    369     case OP_TYPESTAR:
    370     case OP_TYPEMINSTAR:
    371     case OP_TYPEQUERY:
    372     case OP_TYPEMINQUERY:
    373     case OP_TYPEPOSSTAR:
    374     case OP_TYPEPOSQUERY:
    375     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
    376     cc += PRIV(OP_lengths)[op];
    377     break;
    378 
    379     case OP_TYPEUPTO:
    380     case OP_TYPEMINUPTO:
    381     case OP_TYPEPOSUPTO:
    382     if (cc[1 + IMM2_SIZE] == OP_PROP
    383       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
    384     cc += PRIV(OP_lengths)[op];
    385     break;
    386 
    387     /* Check a class for variable quantification */
    388 
    389     case OP_CLASS:
    390     case OP_NCLASS:
    391 #ifdef SUPPORT_WIDE_CHARS
    392     case OP_XCLASS:
    393     /* The original code caused an unsigned overflow in 64 bit systems,
    394     so now we use a conditional statement. */
    395     if (op == OP_XCLASS)
    396       cc += GET(cc, 1);
    397     else
    398       cc += PRIV(OP_lengths)[OP_CLASS];
    399 #else
    400     cc += PRIV(OP_lengths)[OP_CLASS];
    401 #endif
    402 
    403     switch (*cc)
    404       {
    405       case OP_CRPLUS:
    406       case OP_CRMINPLUS:
    407       case OP_CRPOSPLUS:
    408       branchlength++;
    409       /* Fall through */
    410 
    411       case OP_CRSTAR:
    412       case OP_CRMINSTAR:
    413       case OP_CRQUERY:
    414       case OP_CRMINQUERY:
    415       case OP_CRPOSSTAR:
    416       case OP_CRPOSQUERY:
    417       cc++;
    418       break;
    419 
    420       case OP_CRRANGE:
    421       case OP_CRMINRANGE:
    422       case OP_CRPOSRANGE:
    423       branchlength += GET2(cc,1);
    424       cc += 1 + 2 * IMM2_SIZE;
    425       break;
    426 
    427       default:
    428       branchlength++;
    429       break;
    430       }
    431     break;
    432 
    433     /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
    434     way: we find the minimum length for the subpattern. A recursion
    435     (backreference or subroutine) causes an a flag to be set that causes the
    436     length of this branch to be ignored. The logic is that a recursion can only
    437     make sense if there is another alternative that stops the recursing. That
    438     will provide the minimum length (when no recursion happens).
    439 
    440     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
    441     matches an empty string (by default it causes a matching failure), so in
    442     that case we must set the minimum length to zero. */
    443 
    444     /* Duplicate named pattern back reference. We cannot reliably find a length
    445     for this if duplicate numbers are present in the pattern. */
    446 
    447     case OP_DNREF:
    448     case OP_DNREFI:
    449     if (dupcapused) return -1;
    450     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
    451       {
    452       int count = GET2(cc, 1+IMM2_SIZE);
    453       PCRE2_UCHAR *slot =
    454         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
    455           GET2(cc, 1) * re->name_entry_size;
    456 
    457       d = INT_MAX;
    458 
    459       /* Scan all groups with the same name */
    460 
    461       while (count-- > 0)
    462         {
    463         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
    464         if (cs == NULL) return -2;
    465         do ce += GET(ce, 1); while (*ce == OP_ALT);
    466         if (cc > cs && cc < ce)    /* Simple recursion */
    467           {
    468           d = 0;
    469           had_recurse = TRUE;
    470           break;
    471           }
    472         else
    473           {
    474           recurse_check *r = recurses;
    475           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    476           if (r != NULL)           /* Mutual recursion */
    477             {
    478             d = 0;
    479             had_recurse = TRUE;
    480             break;
    481             }
    482           else
    483             {
    484             int dd;
    485             this_recurse.prev = recurses;
    486             this_recurse.group = cs;
    487             dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
    488             if (dd < d) d = dd;
    489             }
    490           }
    491         slot += re->name_entry_size;
    492         }
    493       }
    494     else d = 0;
    495     cc += 1 + 2*IMM2_SIZE;
    496     goto REPEAT_BACK_REFERENCE;
    497 
    498     /* Single back reference. We cannot find a length for this if duplicate
    499     numbers are present in the pattern. */
    500 
    501     case OP_REF:
    502     case OP_REFI:
    503     if (dupcapused) return -1;
    504     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
    505       {
    506       ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
    507       if (cs == NULL) return -2;
    508       do ce += GET(ce, 1); while (*ce == OP_ALT);
    509       if (cc > cs && cc < ce)    /* Simple recursion */
    510         {
    511         d = 0;
    512         had_recurse = TRUE;
    513         }
    514       else
    515         {
    516         recurse_check *r = recurses;
    517         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    518         if (r != NULL)           /* Mutual recursion */
    519           {
    520           d = 0;
    521           had_recurse = TRUE;
    522           }
    523         else
    524           {
    525           this_recurse.prev = recurses;
    526           this_recurse.group = cs;
    527           d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
    528           }
    529         }
    530       }
    531     else d = 0;
    532     cc += 1 + IMM2_SIZE;
    533 
    534     /* Handle repeated back references */
    535 
    536     REPEAT_BACK_REFERENCE:
    537     switch (*cc)
    538       {
    539       case OP_CRSTAR:
    540       case OP_CRMINSTAR:
    541       case OP_CRQUERY:
    542       case OP_CRMINQUERY:
    543       case OP_CRPOSSTAR:
    544       case OP_CRPOSQUERY:
    545       min = 0;
    546       cc++;
    547       break;
    548 
    549       case OP_CRPLUS:
    550       case OP_CRMINPLUS:
    551       case OP_CRPOSPLUS:
    552       min = 1;
    553       cc++;
    554       break;
    555 
    556       case OP_CRRANGE:
    557       case OP_CRMINRANGE:
    558       case OP_CRPOSRANGE:
    559       min = GET2(cc, 1);
    560       cc += 1 + 2 * IMM2_SIZE;
    561       break;
    562 
    563       default:
    564       min = 1;
    565       break;
    566       }
    567 
    568      /* Take care not to overflow: (1) min and d are ints, so check that their
    569      product is not greater than INT_MAX. (2) branchlength is limited to
    570      UINT16_MAX (checked at the top of the loop). */
    571 
    572     if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
    573       branchlength = UINT16_MAX;
    574     else branchlength += min * d;
    575     break;
    576 
    577     /* Recursion always refers to the first occurrence of a subpattern with a
    578     given number. Therefore, we can always make use of caching, even when the
    579     pattern contains multiple subpatterns with the same number. */
    580 
    581     case OP_RECURSE:
    582     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
    583     recno = GET2(cs, 1+LINK_SIZE);
    584     if (recno == prev_recurse_recno)
    585       {
    586       branchlength += prev_recurse_d;
    587       }
    588     else
    589       {
    590       do ce += GET(ce, 1); while (*ce == OP_ALT);
    591       if (cc > cs && cc < ce)    /* Simple recursion */
    592         had_recurse = TRUE;
    593       else
    594         {
    595         recurse_check *r = recurses;
    596         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    597         if (r != NULL)          /* Mutual recursion */
    598           had_recurse = TRUE;
    599         else
    600           {
    601           this_recurse.prev = recurses;
    602           this_recurse.group = cs;
    603           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
    604             countptr);
    605           if (prev_recurse_d < 0) return prev_recurse_d;
    606           prev_recurse_recno = recno;
    607           branchlength += prev_recurse_d;
    608           }
    609         }
    610       }
    611     cc += 1 + LINK_SIZE + once_fudge;
    612     once_fudge = 0;
    613     break;
    614 
    615     /* Anything else does not or need not match a character. We can get the
    616     item's length from the table, but for those that can match zero occurrences
    617     of a character, we must take special action for UTF-8 characters. As it
    618     happens, the "NOT" versions of these opcodes are used at present only for
    619     ASCII characters, so they could be omitted from this list. However, in
    620     future that may change, so we include them here so as not to leave a
    621     gotcha for a future maintainer. */
    622 
    623     case OP_UPTO:
    624     case OP_UPTOI:
    625     case OP_NOTUPTO:
    626     case OP_NOTUPTOI:
    627     case OP_MINUPTO:
    628     case OP_MINUPTOI:
    629     case OP_NOTMINUPTO:
    630     case OP_NOTMINUPTOI:
    631     case OP_POSUPTO:
    632     case OP_POSUPTOI:
    633     case OP_NOTPOSUPTO:
    634     case OP_NOTPOSUPTOI:
    635 
    636     case OP_STAR:
    637     case OP_STARI:
    638     case OP_NOTSTAR:
    639     case OP_NOTSTARI:
    640     case OP_MINSTAR:
    641     case OP_MINSTARI:
    642     case OP_NOTMINSTAR:
    643     case OP_NOTMINSTARI:
    644     case OP_POSSTAR:
    645     case OP_POSSTARI:
    646     case OP_NOTPOSSTAR:
    647     case OP_NOTPOSSTARI:
    648 
    649     case OP_QUERY:
    650     case OP_QUERYI:
    651     case OP_NOTQUERY:
    652     case OP_NOTQUERYI:
    653     case OP_MINQUERY:
    654     case OP_MINQUERYI:
    655     case OP_NOTMINQUERY:
    656     case OP_NOTMINQUERYI:
    657     case OP_POSQUERY:
    658     case OP_POSQUERYI:
    659     case OP_NOTPOSQUERY:
    660     case OP_NOTPOSQUERYI:
    661 
    662     cc += PRIV(OP_lengths)[op];
    663 #ifdef SUPPORT_UNICODE
    664     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    665 #endif
    666     break;
    667 
    668     /* Skip these, but we need to add in the name length. */
    669 
    670     case OP_MARK:
    671     case OP_PRUNE_ARG:
    672     case OP_SKIP_ARG:
    673     case OP_THEN_ARG:
    674     cc += PRIV(OP_lengths)[op] + cc[1];
    675     break;
    676 
    677     /* The remaining opcodes are just skipped over. */
    678 
    679     case OP_CLOSE:
    680     case OP_COMMIT:
    681     case OP_FAIL:
    682     case OP_PRUNE:
    683     case OP_SET_SOM:
    684     case OP_SKIP:
    685     case OP_THEN:
    686     cc += PRIV(OP_lengths)[op];
    687     break;
    688 
    689     /* This should not occur: we list all opcodes explicitly so that when
    690     new ones get added they are properly considered. */
    691 
    692     default:
    693     return -3;
    694     }
    695   }
    696 /* Control never gets here */
    697 }
    698 
    699 
    700 
    701 /*************************************************
    702 *      Set a bit and maybe its alternate case    *
    703 *************************************************/
    704 
    705 /* Given a character, set its first code unit's bit in the table, and also the
    706 corresponding bit for the other version of a letter if we are caseless.
    707 
    708 Arguments:
    709   re            points to the regex block
    710   p             points to the first code unit of the character
    711   caseless      TRUE if caseless
    712   utf           TRUE for UTF mode
    713 
    714 Returns:        pointer after the character
    715 */
    716 
    717 static PCRE2_SPTR
    718 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
    719 {
    720 uint32_t c = *p++;   /* First code unit */
    721 (void)utf;           /* Stop compiler warning when UTF not supported */
    722 
    723 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
    724 0xff. */
    725 
    726 #if PCRE2_CODE_UNIT_WIDTH != 8
    727 if (c > 0xff) SET_BIT(0xff); else
    728 #endif
    729 
    730 SET_BIT(c);
    731 
    732 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
    733 the end of the character, even when caseless. */
    734 
    735 #ifdef SUPPORT_UNICODE
    736 if (utf)
    737   {
    738 #if PCRE2_CODE_UNIT_WIDTH == 8
    739   if (c >= 0xc0) GETUTF8INC(c, p);
    740 #elif PCRE2_CODE_UNIT_WIDTH == 16
    741   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
    742 #endif
    743   }
    744 #endif  /* SUPPORT_UNICODE */
    745 
    746 /* If caseless, handle the other case of the character. */
    747 
    748 if (caseless)
    749   {
    750   if (utf)
    751     {
    752 #if PCRE2_CODE_UNIT_WIDTH == 8
    753     PCRE2_UCHAR buff[6];
    754     c = UCD_OTHERCASE(c);
    755     (void)PRIV(ord2utf)(c, buff);
    756     SET_BIT(buff[0]);
    757 #else  /* 16-bit or 32-bit mode */
    758     c = UCD_OTHERCASE(c);
    759     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
    760 #endif
    761     }
    762 
    763   /* Not UTF */
    764 
    765   else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
    766   }
    767 
    768 return p;
    769 }
    770 
    771 
    772 
    773 /*************************************************
    774 *     Set bits for a positive character type     *
    775 *************************************************/
    776 
    777 /* This function sets starting bits for a character type. In UTF-8 mode, we can
    778 only do a direct setting for bytes less than 128, as otherwise there can be
    779 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
    780 environment, the tables will only recognize ASCII characters anyway, but in at
    781 least one Windows environment, some higher bytes bits were set in the tables.
    782 So we deal with that case by considering the UTF-8 encoding.
    783 
    784 Arguments:
    785   re             the regex block
    786   cbit type      the type of character wanted
    787   table_limit    32 for non-UTF-8; 16 for UTF-8
    788 
    789 Returns:         nothing
    790 */
    791 
    792 static void
    793 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
    794 {
    795 register uint32_t c;
    796 for (c = 0; c < table_limit; c++)
    797   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
    798 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
    799 if (table_limit == 32) return;
    800 for (c = 128; c < 256; c++)
    801   {
    802   if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
    803     {
    804     PCRE2_UCHAR buff[6];
    805     (void)PRIV(ord2utf)(c, buff);
    806     SET_BIT(buff[0]);
    807     }
    808   }
    809 #endif  /* UTF-8 */
    810 }
    811 
    812 
    813 /*************************************************
    814 *     Set bits for a negative character type     *
    815 *************************************************/
    816 
    817 /* This function sets starting bits for a negative character type such as \D.
    818 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
    819 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
    820 Unlike in the positive case, where we can set appropriate starting bits for
    821 specific high-valued UTF-8 characters, in this case we have to set the bits for
    822 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
    823 0xc0 (192) for simplicity.
    824 
    825 Arguments:
    826   re             the regex block
    827   cbit type      the type of character wanted
    828   table_limit    32 for non-UTF-8; 16 for UTF-8
    829 
    830 Returns:         nothing
    831 */
    832 
    833 static void
    834 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
    835 {
    836 register uint32_t c;
    837 for (c = 0; c < table_limit; c++)
    838   re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
    839 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
    840 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
    841 #endif
    842 }
    843 
    844 
    845 
    846 /*************************************************
    847 *          Create bitmap of starting bytes       *
    848 *************************************************/
    849 
    850 /* This function scans a compiled unanchored expression recursively and
    851 attempts to build a bitmap of the set of possible starting code units whose
    852 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
    853 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
    854 we pass a value of 16 rather than 32 as the final argument. (See comments in
    855 those functions for the reason.)
    856 
    857 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
    858 (a*)b where the group provides some optional starting code units but scanning
    859 must continue at the outer level to find at least one mandatory code unit. At
    860 the outermost level, this function fails unless the result is SSB_DONE.
    861 
    862 Arguments:
    863   re           points to the compiled regex block
    864   code         points to an expression
    865   utf          TRUE if in UTF mode
    866 
    867 Returns:       SSB_FAIL     => Failed to find any starting code units
    868                SSB_DONE     => Found mandatory starting code units
    869                SSB_CONTINUE => Found optional starting code units
    870                SSB_UNKNOWN  => Hit an unrecognized opcode
    871 */
    872 
    873 static int
    874 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
    875 {
    876 register uint32_t c;
    877 int yield = SSB_DONE;
    878 
    879 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
    880 int table_limit = utf? 16:32;
    881 #else
    882 int table_limit = 32;
    883 #endif
    884 
    885 do
    886   {
    887   BOOL try_next = TRUE;
    888   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
    889 
    890   if (*code == OP_CBRA || *code == OP_SCBRA ||
    891       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
    892 
    893   while (try_next)    /* Loop for items in this branch */
    894     {
    895     int rc;
    896     uint8_t *classmap = NULL;
    897 
    898     switch(*tcode)
    899       {
    900       /* If we reach something we don't understand, it means a new opcode has
    901       been created that hasn't been added to this function. Hopefully this
    902       problem will be discovered during testing. */
    903 
    904       default:
    905       return SSB_UNKNOWN;
    906 
    907       /* Fail for a valid opcode that implies no starting bits. */
    908 
    909       case OP_ACCEPT:
    910       case OP_ASSERT_ACCEPT:
    911       case OP_ALLANY:
    912       case OP_ANY:
    913       case OP_ANYBYTE:
    914       case OP_CIRC:
    915       case OP_CIRCM:
    916       case OP_CLOSE:
    917       case OP_COMMIT:
    918       case OP_COND:
    919       case OP_CREF:
    920       case OP_FALSE:
    921       case OP_TRUE:
    922       case OP_DNCREF:
    923       case OP_DNREF:
    924       case OP_DNREFI:
    925       case OP_DNRREF:
    926       case OP_DOLL:
    927       case OP_DOLLM:
    928       case OP_END:
    929       case OP_EOD:
    930       case OP_EODN:
    931       case OP_EXTUNI:
    932       case OP_FAIL:
    933       case OP_MARK:
    934       case OP_NOT:
    935       case OP_NOTEXACT:
    936       case OP_NOTEXACTI:
    937       case OP_NOTI:
    938       case OP_NOTMINPLUS:
    939       case OP_NOTMINPLUSI:
    940       case OP_NOTMINQUERY:
    941       case OP_NOTMINQUERYI:
    942       case OP_NOTMINSTAR:
    943       case OP_NOTMINSTARI:
    944       case OP_NOTMINUPTO:
    945       case OP_NOTMINUPTOI:
    946       case OP_NOTPLUS:
    947       case OP_NOTPLUSI:
    948       case OP_NOTPOSPLUS:
    949       case OP_NOTPOSPLUSI:
    950       case OP_NOTPOSQUERY:
    951       case OP_NOTPOSQUERYI:
    952       case OP_NOTPOSSTAR:
    953       case OP_NOTPOSSTARI:
    954       case OP_NOTPOSUPTO:
    955       case OP_NOTPOSUPTOI:
    956       case OP_NOTPROP:
    957       case OP_NOTQUERY:
    958       case OP_NOTQUERYI:
    959       case OP_NOTSTAR:
    960       case OP_NOTSTARI:
    961       case OP_NOTUPTO:
    962       case OP_NOTUPTOI:
    963       case OP_NOT_HSPACE:
    964       case OP_NOT_VSPACE:
    965       case OP_PRUNE:
    966       case OP_PRUNE_ARG:
    967       case OP_RECURSE:
    968       case OP_REF:
    969       case OP_REFI:
    970       case OP_REVERSE:
    971       case OP_RREF:
    972       case OP_SCOND:
    973       case OP_SET_SOM:
    974       case OP_SKIP:
    975       case OP_SKIP_ARG:
    976       case OP_SOD:
    977       case OP_SOM:
    978       case OP_THEN:
    979       case OP_THEN_ARG:
    980       return SSB_FAIL;
    981 
    982       /* A "real" property test implies no starting bits, but the fake property
    983       PT_CLIST identifies a list of characters. These lists are short, as they
    984       are used for characters with more than one "other case", so there is no
    985       point in recognizing them for OP_NOTPROP. */
    986 
    987       case OP_PROP:
    988       if (tcode[1] != PT_CLIST) return SSB_FAIL;
    989         {
    990         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
    991         while ((c = *p++) < NOTACHAR)
    992           {
    993 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
    994           if (utf)
    995             {
    996             PCRE2_UCHAR buff[6];
    997             (void)PRIV(ord2utf)(c, buff);
    998             c = buff[0];
    999             }
   1000 #endif
   1001           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
   1002           }
   1003         }
   1004       try_next = FALSE;
   1005       break;
   1006 
   1007       /* We can ignore word boundary tests. */
   1008 
   1009       case OP_WORD_BOUNDARY:
   1010       case OP_NOT_WORD_BOUNDARY:
   1011       tcode++;
   1012       break;
   1013 
   1014       /* If we hit a bracket or a positive lookahead assertion, recurse to set
   1015       bits from within the subpattern. If it can't find anything, we have to
   1016       give up. If it finds some mandatory character(s), we are done for this
   1017       branch. Otherwise, carry on scanning after the subpattern. */
   1018 
   1019       case OP_BRA:
   1020       case OP_SBRA:
   1021       case OP_CBRA:
   1022       case OP_SCBRA:
   1023       case OP_BRAPOS:
   1024       case OP_SBRAPOS:
   1025       case OP_CBRAPOS:
   1026       case OP_SCBRAPOS:
   1027       case OP_ONCE:
   1028       case OP_ONCE_NC:
   1029       case OP_ASSERT:
   1030       rc = set_start_bits(re, tcode, utf);
   1031       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
   1032       if (rc == SSB_DONE) try_next = FALSE; else
   1033         {
   1034         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
   1035         tcode += 1 + LINK_SIZE;
   1036         }
   1037       break;
   1038 
   1039       /* If we hit ALT or KET, it means we haven't found anything mandatory in
   1040       this branch, though we might have found something optional. For ALT, we
   1041       continue with the next alternative, but we have to arrange that the final
   1042       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
   1043       return SSB_CONTINUE: if this is the top level, that indicates failure,
   1044       but after a nested subpattern, it causes scanning to continue. */
   1045 
   1046       case OP_ALT:
   1047       yield = SSB_CONTINUE;
   1048       try_next = FALSE;
   1049       break;
   1050 
   1051       case OP_KET:
   1052       case OP_KETRMAX:
   1053       case OP_KETRMIN:
   1054       case OP_KETRPOS:
   1055       return SSB_CONTINUE;
   1056 
   1057       /* Skip over callout */
   1058 
   1059       case OP_CALLOUT:
   1060       tcode += PRIV(OP_lengths)[OP_CALLOUT];
   1061       break;
   1062 
   1063       case OP_CALLOUT_STR:
   1064       tcode += GET(tcode, 1 + 2*LINK_SIZE);
   1065       break;
   1066 
   1067       /* Skip over lookbehind and negative lookahead assertions */
   1068 
   1069       case OP_ASSERT_NOT:
   1070       case OP_ASSERTBACK:
   1071       case OP_ASSERTBACK_NOT:
   1072       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
   1073       tcode += 1 + LINK_SIZE;
   1074       break;
   1075 
   1076       /* BRAZERO does the bracket, but carries on. */
   1077 
   1078       case OP_BRAZERO:
   1079       case OP_BRAMINZERO:
   1080       case OP_BRAPOSZERO:
   1081       rc = set_start_bits(re, ++tcode, utf);
   1082       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
   1083       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
   1084       tcode += 1 + LINK_SIZE;
   1085       break;
   1086 
   1087       /* SKIPZERO skips the bracket. */
   1088 
   1089       case OP_SKIPZERO:
   1090       tcode++;
   1091       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
   1092       tcode += 1 + LINK_SIZE;
   1093       break;
   1094 
   1095       /* Single-char * or ? sets the bit and tries the next item */
   1096 
   1097       case OP_STAR:
   1098       case OP_MINSTAR:
   1099       case OP_POSSTAR:
   1100       case OP_QUERY:
   1101       case OP_MINQUERY:
   1102       case OP_POSQUERY:
   1103       tcode = set_table_bit(re, tcode + 1, FALSE, utf);
   1104       break;
   1105 
   1106       case OP_STARI:
   1107       case OP_MINSTARI:
   1108       case OP_POSSTARI:
   1109       case OP_QUERYI:
   1110       case OP_MINQUERYI:
   1111       case OP_POSQUERYI:
   1112       tcode = set_table_bit(re, tcode + 1, TRUE, utf);
   1113       break;
   1114 
   1115       /* Single-char upto sets the bit and tries the next */
   1116 
   1117       case OP_UPTO:
   1118       case OP_MINUPTO:
   1119       case OP_POSUPTO:
   1120       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
   1121       break;
   1122 
   1123       case OP_UPTOI:
   1124       case OP_MINUPTOI:
   1125       case OP_POSUPTOI:
   1126       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
   1127       break;
   1128 
   1129       /* At least one single char sets the bit and stops */
   1130 
   1131       case OP_EXACT:
   1132       tcode += IMM2_SIZE;
   1133       /* Fall through */
   1134       case OP_CHAR:
   1135       case OP_PLUS:
   1136       case OP_MINPLUS:
   1137       case OP_POSPLUS:
   1138       (void)set_table_bit(re, tcode + 1, FALSE, utf);
   1139       try_next = FALSE;
   1140       break;
   1141 
   1142       case OP_EXACTI:
   1143       tcode += IMM2_SIZE;
   1144       /* Fall through */
   1145       case OP_CHARI:
   1146       case OP_PLUSI:
   1147       case OP_MINPLUSI:
   1148       case OP_POSPLUSI:
   1149       (void)set_table_bit(re, tcode + 1, TRUE, utf);
   1150       try_next = FALSE;
   1151       break;
   1152 
   1153       /* Special spacing and line-terminating items. These recognize specific
   1154       lists of characters. The difference between VSPACE and ANYNL is that the
   1155       latter can match the two-character CRLF sequence, but that is not
   1156       relevant for finding the first character, so their code here is
   1157       identical. */
   1158 
   1159       case OP_HSPACE:
   1160       SET_BIT(CHAR_HT);
   1161       SET_BIT(CHAR_SPACE);
   1162 
   1163       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1164       the bits for 0xA0 and for code units >= 255, independently of UTF. */
   1165 
   1166 #if PCRE2_CODE_UNIT_WIDTH != 8
   1167       SET_BIT(0xA0);
   1168       SET_BIT(0xFF);
   1169 #else
   1170       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1171       units of horizontal space characters. */
   1172 
   1173 #ifdef SUPPORT_UNICODE
   1174       if (utf)
   1175         {
   1176         SET_BIT(0xC2);  /* For U+00A0 */
   1177         SET_BIT(0xE1);  /* For U+1680, U+180E */
   1178         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
   1179         SET_BIT(0xE3);  /* For U+3000 */
   1180         }
   1181       else
   1182 #endif
   1183       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
   1184       the code is EBCDIC. */
   1185         {
   1186 #ifndef EBCDIC
   1187         SET_BIT(0xA0);
   1188 #endif  /* Not EBCDIC */
   1189         }
   1190 #endif  /* 8-bit support */
   1191 
   1192       try_next = FALSE;
   1193       break;
   1194 
   1195       case OP_ANYNL:
   1196       case OP_VSPACE:
   1197       SET_BIT(CHAR_LF);
   1198       SET_BIT(CHAR_VT);
   1199       SET_BIT(CHAR_FF);
   1200       SET_BIT(CHAR_CR);
   1201 
   1202       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1203       the bits for NEL and for code units >= 255, independently of UTF. */
   1204 
   1205 #if PCRE2_CODE_UNIT_WIDTH != 8
   1206       SET_BIT(CHAR_NEL);
   1207       SET_BIT(0xFF);
   1208 #else
   1209       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1210       units of vertical space characters. */
   1211 
   1212 #ifdef SUPPORT_UNICODE
   1213       if (utf)
   1214         {
   1215         SET_BIT(0xC2);  /* For U+0085 (NEL) */
   1216         SET_BIT(0xE2);  /* For U+2028, U+2029 */
   1217         }
   1218       else
   1219 #endif
   1220       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
   1221         {
   1222         SET_BIT(CHAR_NEL);
   1223         }
   1224 #endif  /* 8-bit support */
   1225 
   1226       try_next = FALSE;
   1227       break;
   1228 
   1229       /* Single character types set the bits and stop. Note that if PCRE2_UCP
   1230       is set, we do not see these op codes because \d etc are converted to
   1231       properties. Therefore, these apply in the case when only characters less
   1232       than 256 are recognized to match the types. */
   1233 
   1234       case OP_NOT_DIGIT:
   1235       set_nottype_bits(re, cbit_digit, table_limit);
   1236       try_next = FALSE;
   1237       break;
   1238 
   1239       case OP_DIGIT:
   1240       set_type_bits(re, cbit_digit, table_limit);
   1241       try_next = FALSE;
   1242       break;
   1243 
   1244       case OP_NOT_WHITESPACE:
   1245       set_nottype_bits(re, cbit_space, table_limit);
   1246       try_next = FALSE;
   1247       break;
   1248 
   1249       case OP_WHITESPACE:
   1250       set_type_bits(re, cbit_space, table_limit);
   1251       try_next = FALSE;
   1252       break;
   1253 
   1254       case OP_NOT_WORDCHAR:
   1255       set_nottype_bits(re, cbit_word, table_limit);
   1256       try_next = FALSE;
   1257       break;
   1258 
   1259       case OP_WORDCHAR:
   1260       set_type_bits(re, cbit_word, table_limit);
   1261       try_next = FALSE;
   1262       break;
   1263 
   1264       /* One or more character type fudges the pointer and restarts, knowing
   1265       it will hit a single character type and stop there. */
   1266 
   1267       case OP_TYPEPLUS:
   1268       case OP_TYPEMINPLUS:
   1269       case OP_TYPEPOSPLUS:
   1270       tcode++;
   1271       break;
   1272 
   1273       case OP_TYPEEXACT:
   1274       tcode += 1 + IMM2_SIZE;
   1275       break;
   1276 
   1277       /* Zero or more repeats of character types set the bits and then
   1278       try again. */
   1279 
   1280       case OP_TYPEUPTO:
   1281       case OP_TYPEMINUPTO:
   1282       case OP_TYPEPOSUPTO:
   1283       tcode += IMM2_SIZE;  /* Fall through */
   1284 
   1285       case OP_TYPESTAR:
   1286       case OP_TYPEMINSTAR:
   1287       case OP_TYPEPOSSTAR:
   1288       case OP_TYPEQUERY:
   1289       case OP_TYPEMINQUERY:
   1290       case OP_TYPEPOSQUERY:
   1291       switch(tcode[1])
   1292         {
   1293         default:
   1294         case OP_ANY:
   1295         case OP_ALLANY:
   1296         return SSB_FAIL;
   1297 
   1298         case OP_HSPACE:
   1299         SET_BIT(CHAR_HT);
   1300         SET_BIT(CHAR_SPACE);
   1301 
   1302         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1303         the bits for 0xA0 and for code units >= 255, independently of UTF. */
   1304 
   1305 #if PCRE2_CODE_UNIT_WIDTH != 8
   1306         SET_BIT(0xA0);
   1307         SET_BIT(0xFF);
   1308 #else
   1309         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1310         units of horizontal space characters. */
   1311 
   1312 #ifdef SUPPORT_UNICODE
   1313         if (utf)
   1314           {
   1315           SET_BIT(0xC2);  /* For U+00A0 */
   1316           SET_BIT(0xE1);  /* For U+1680, U+180E */
   1317           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
   1318           SET_BIT(0xE3);  /* For U+3000 */
   1319           }
   1320         else
   1321 #endif
   1322         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
   1323         the code is EBCDIC. */
   1324           {
   1325 #ifndef EBCDIC
   1326           SET_BIT(0xA0);
   1327 #endif  /* Not EBCDIC */
   1328           }
   1329 #endif  /* 8-bit support */
   1330         break;
   1331 
   1332         case OP_ANYNL:
   1333         case OP_VSPACE:
   1334         SET_BIT(CHAR_LF);
   1335         SET_BIT(CHAR_VT);
   1336         SET_BIT(CHAR_FF);
   1337         SET_BIT(CHAR_CR);
   1338 
   1339         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
   1340         the bits for NEL and for code units >= 255, independently of UTF. */
   1341 
   1342 #if PCRE2_CODE_UNIT_WIDTH != 8
   1343         SET_BIT(CHAR_NEL);
   1344         SET_BIT(0xFF);
   1345 #else
   1346         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
   1347         units of vertical space characters. */
   1348 
   1349 #ifdef SUPPORT_UNICODE
   1350         if (utf)
   1351           {
   1352           SET_BIT(0xC2);  /* For U+0085 (NEL) */
   1353           SET_BIT(0xE2);  /* For U+2028, U+2029 */
   1354           }
   1355         else
   1356 #endif
   1357         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
   1358           {
   1359           SET_BIT(CHAR_NEL);
   1360           }
   1361 #endif  /* 8-bit support */
   1362         break;
   1363 
   1364         case OP_NOT_DIGIT:
   1365         set_nottype_bits(re, cbit_digit, table_limit);
   1366         break;
   1367 
   1368         case OP_DIGIT:
   1369         set_type_bits(re, cbit_digit, table_limit);
   1370         break;
   1371 
   1372         case OP_NOT_WHITESPACE:
   1373         set_nottype_bits(re, cbit_space, table_limit);
   1374         break;
   1375 
   1376         case OP_WHITESPACE:
   1377         set_type_bits(re, cbit_space, table_limit);
   1378         break;
   1379 
   1380         case OP_NOT_WORDCHAR:
   1381         set_nottype_bits(re, cbit_word, table_limit);
   1382         break;
   1383 
   1384         case OP_WORDCHAR:
   1385         set_type_bits(re, cbit_word, table_limit);
   1386         break;
   1387         }
   1388 
   1389       tcode += 2;
   1390       break;
   1391 
   1392       /* Extended class: if there are any property checks, or if this is a
   1393       negative XCLASS without a map, give up. If there are no property checks,
   1394       there must be wide characters on the XCLASS list, because otherwise an
   1395       XCLASS would not have been created. This means that code points >= 255
   1396       are always potential starters. */
   1397 
   1398 #ifdef SUPPORT_WIDE_CHARS
   1399       case OP_XCLASS:
   1400       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
   1401           (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
   1402         return SSB_FAIL;
   1403 
   1404       /* We have a positive XCLASS or a negative one without a map. Set up the
   1405       map pointer if there is one, and fall through. */
   1406 
   1407       classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
   1408         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
   1409 #endif
   1410 
   1411       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
   1412       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
   1413       because it starts a character with a value > 255. In 8-bit non-UTF mode,
   1414       there is no difference between CLASS and NCLASS. In all other wide
   1415       character modes, set the 0xFF bit to indicate code units >= 255. */
   1416 
   1417       case OP_NCLASS:
   1418 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
   1419       if (utf)
   1420         {
   1421         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
   1422         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
   1423         }
   1424 #elif PCRE2_CODE_UNIT_WIDTH != 8
   1425       SET_BIT(0xFF);                             /* For characters >= 255 */
   1426 #endif
   1427       /* Fall through */
   1428 
   1429       /* Enter here for a positive non-XCLASS. If we have fallen through from
   1430       an XCLASS, classmap will already be set; just advance the code pointer.
   1431       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
   1432 
   1433       case OP_CLASS:
   1434       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
   1435         {
   1436         classmap = (uint8_t *)(++tcode);
   1437         tcode += 32 / sizeof(PCRE2_UCHAR);
   1438         }
   1439 
   1440       /* When wide characters are supported, classmap may be NULL. In UTF-8
   1441       (sic) mode, the bits in a class bit map correspond to character values,
   1442       not to byte values. However, the bit map we are constructing is for byte
   1443       values. So we have to do a conversion for characters whose code point is
   1444       greater than 127. In fact, there are only two possible starting bytes for
   1445       characters in the range 128 - 255. */
   1446 
   1447       if (classmap != NULL)
   1448         {
   1449 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
   1450         if (utf)
   1451           {
   1452           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
   1453           for (c = 128; c < 256; c++)
   1454             {
   1455             if ((classmap[c/8] & (1 << (c&7))) != 0)
   1456               {
   1457               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
   1458               re->start_bitmap[d/8] |= (1 << (d&7));  /* and then skip on to the */
   1459               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
   1460               }
   1461             }
   1462           }
   1463         else
   1464 #endif
   1465         /* In all modes except UTF-8, the two bit maps are compatible. */
   1466 
   1467           {
   1468           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
   1469           }
   1470         }
   1471 
   1472       /* Act on what follows the class. For a zero minimum repeat, continue;
   1473       otherwise stop processing. */
   1474 
   1475       switch (*tcode)
   1476         {
   1477         case OP_CRSTAR:
   1478         case OP_CRMINSTAR:
   1479         case OP_CRQUERY:
   1480         case OP_CRMINQUERY:
   1481         case OP_CRPOSSTAR:
   1482         case OP_CRPOSQUERY:
   1483         tcode++;
   1484         break;
   1485 
   1486         case OP_CRRANGE:
   1487         case OP_CRMINRANGE:
   1488         case OP_CRPOSRANGE:
   1489         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
   1490           else try_next = FALSE;
   1491         break;
   1492 
   1493         default:
   1494         try_next = FALSE;
   1495         break;
   1496         }
   1497       break; /* End of class handling case */
   1498       }      /* End of switch for opcodes */
   1499     }        /* End of try_next loop */
   1500 
   1501   code += GET(code, 1);   /* Advance to next branch */
   1502   }
   1503 while (*code == OP_ALT);
   1504 
   1505 return yield;
   1506 }
   1507 
   1508 
   1509 
   1510 /*************************************************
   1511 *          Study a compiled expression           *
   1512 *************************************************/
   1513 
   1514 /* This function is handed a compiled expression that it must study to produce
   1515 information that will speed up the matching.
   1516 
   1517 Argument:  points to the compiled expression
   1518 Returns:   0 normally; non-zero should never normally occur
   1519            1 unknown opcode in set_start_bits
   1520            2 missing capturing bracket
   1521            3 unknown opcode in find_minlength
   1522 */
   1523 
   1524 int
   1525 PRIV(study)(pcre2_real_code *re)
   1526 {
   1527 int min;
   1528 int count = 0;
   1529 PCRE2_UCHAR *code;
   1530 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
   1531 
   1532 /* Find start of compiled code */
   1533 
   1534 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
   1535   re->name_entry_size * re->name_count;
   1536 
   1537 /* For an anchored pattern, or an unanchored pattern that has a first code
   1538 unit, or a multiline pattern that matches only at "line start", there is no
   1539 point in seeking a list of starting code units. */
   1540 
   1541 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
   1542     (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
   1543   {
   1544   int rc = set_start_bits(re, code, utf);
   1545   if (rc == SSB_UNKNOWN) return 1;
   1546   if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
   1547   }
   1548 
   1549 /* Find the minimum length of subject string. If it can match an empty string,
   1550 the minimum length is already known. */
   1551 
   1552 if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
   1553   {
   1554   switch(min = find_minlength(re, code, code, utf, NULL, &count))
   1555     {
   1556     case -1:  /* \C in UTF mode or (*ACCEPT) or over-complex regex */
   1557     break;    /* Leave minlength unchanged (will be zero) */
   1558 
   1559     case -2:
   1560     return 2; /* missing capturing bracket */
   1561 
   1562     case -3:
   1563     return 3; /* unrecognized opcode */
   1564 
   1565     default:
   1566     if (min > UINT16_MAX) min = UINT16_MAX;
   1567     re->minlength = min;
   1568     break;
   1569     }
   1570   }
   1571 
   1572 return 0;
   1573 }
   1574 
   1575 /* End of pcre2_study.c */
   1576