Home | History | Annotate | Download | only in dist
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9            Copyright (c) 1997-2012 University of Cambridge
     10 
     11 -----------------------------------------------------------------------------
     12 Redistribution and use in source and binary forms, with or without
     13 modification, are permitted provided that the following conditions are met:
     14 
     15     * Redistributions of source code must retain the above copyright notice,
     16       this list of conditions and the following disclaimer.
     17 
     18     * Redistributions in binary form must reproduce the above copyright
     19       notice, this list of conditions and the following disclaimer in the
     20       documentation and/or other materials provided with the distribution.
     21 
     22     * Neither the name of the University of Cambridge nor the names of its
     23       contributors may be used to endorse or promote products derived from
     24       this software without specific prior written permission.
     25 
     26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36 POSSIBILITY OF SUCH DAMAGE.
     37 -----------------------------------------------------------------------------
     38 */
     39 
     40 
     41 /* This module contains the external function pcre_study(), along with local
     42 supporting functions. */
     43 
     44 
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 
     49 #include "pcre_internal.h"
     50 
     51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
     52 
     53 /* Returns from set_start_bits() */
     54 
     55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
     56 
     57 
     58 
     59 /*************************************************
     60 *   Find the minimum subject length for a group  *
     61 *************************************************/
     62 
     63 /* Scan a parenthesized group and compute the minimum length of subject that
     64 is needed to match it. This is a lower bound; it does not mean there is a
     65 string of that length that matches. In UTF8 mode, the result is in characters
     66 rather than bytes.
     67 
     68 Arguments:
     69   re              compiled pattern block
     70   code            pointer to start of group (the bracket)
     71   startcode       pointer to start of the whole pattern's code
     72   options         the compiling options
     73   recurses        chain of recurse_check to catch mutual recursion
     74   countptr        pointer to call count (to catch over complexity)
     75 
     76 Returns:   the minimum length
     77            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
     78            -2 internal error (missing capturing bracket)
     79            -3 internal error (opcode not listed)
     80 */
     81 
     82 static int
     83 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
     84   const pcre_uchar *startcode, int options, recurse_check *recurses,
     85   int *countptr)
     86 {
     87 int length = -1;
     88 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
     89 BOOL utf = (options & PCRE_UTF8) != 0;
     90 BOOL had_recurse = FALSE;
     91 recurse_check this_recurse;
     92 register int branchlength = 0;
     93 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
     94 
     95 if ((*countptr)++ > 1000) return -1;   /* too complex */
     96 
     97 if (*code == OP_CBRA || *code == OP_SCBRA ||
     98     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
     99 
    100 /* Scan along the opcodes for this branch. If we get to the end of the
    101 branch, check the length against that of the other branches. */
    102 
    103 for (;;)
    104   {
    105   int d, min;
    106   pcre_uchar *cs, *ce;
    107   register pcre_uchar op = *cc;
    108 
    109   switch (op)
    110     {
    111     case OP_COND:
    112     case OP_SCOND:
    113 
    114     /* If there is only one branch in a condition, the implied branch has zero
    115     length, so we don't add anything. This covers the DEFINE "condition"
    116     automatically. */
    117 
    118     cs = cc + GET(cc, 1);
    119     if (*cs != OP_ALT)
    120       {
    121       cc = cs + 1 + LINK_SIZE;
    122       break;
    123       }
    124 
    125     /* Otherwise we can fall through and treat it the same as any other
    126     subpattern. */
    127 
    128     case OP_CBRA:
    129     case OP_SCBRA:
    130     case OP_BRA:
    131     case OP_SBRA:
    132     case OP_CBRAPOS:
    133     case OP_SCBRAPOS:
    134     case OP_BRAPOS:
    135     case OP_SBRAPOS:
    136     case OP_ONCE:
    137     case OP_ONCE_NC:
    138     d = find_minlength(re, cc, startcode, options, recurses, countptr);
    139     if (d < 0) return d;
    140     branchlength += d;
    141     do cc += GET(cc, 1); while (*cc == OP_ALT);
    142     cc += 1 + LINK_SIZE;
    143     break;
    144 
    145     /* ACCEPT makes things far too complicated; we have to give up. */
    146 
    147     case OP_ACCEPT:
    148     case OP_ASSERT_ACCEPT:
    149     return -1;
    150 
    151     /* Reached end of a branch; if it's a ket it is the end of a nested
    152     call. If it's ALT it is an alternation in a nested call. If it is END it's
    153     the end of the outer call. All can be handled by the same code. If an
    154     ACCEPT was previously encountered, use the length that was in force at that
    155     time, and pass back the shortest ACCEPT length. */
    156 
    157     case OP_ALT:
    158     case OP_KET:
    159     case OP_KETRMAX:
    160     case OP_KETRMIN:
    161     case OP_KETRPOS:
    162     case OP_END:
    163     if (length < 0 || (!had_recurse && branchlength < length))
    164       length = branchlength;
    165     if (op != OP_ALT) return length;
    166     cc += 1 + LINK_SIZE;
    167     branchlength = 0;
    168     had_recurse = FALSE;
    169     break;
    170 
    171     /* Skip over assertive subpatterns */
    172 
    173     case OP_ASSERT:
    174     case OP_ASSERT_NOT:
    175     case OP_ASSERTBACK:
    176     case OP_ASSERTBACK_NOT:
    177     do cc += GET(cc, 1); while (*cc == OP_ALT);
    178     /* Fall through */
    179 
    180     /* Skip over things that don't match chars */
    181 
    182     case OP_REVERSE:
    183     case OP_CREF:
    184     case OP_DNCREF:
    185     case OP_RREF:
    186     case OP_DNRREF:
    187     case OP_DEF:
    188     case OP_CALLOUT:
    189     case OP_SOD:
    190     case OP_SOM:
    191     case OP_EOD:
    192     case OP_EODN:
    193     case OP_CIRC:
    194     case OP_CIRCM:
    195     case OP_DOLL:
    196     case OP_DOLLM:
    197     case OP_NOT_WORD_BOUNDARY:
    198     case OP_WORD_BOUNDARY:
    199     cc += PRIV(OP_lengths)[*cc];
    200     break;
    201 
    202     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
    203 
    204     case OP_BRAZERO:
    205     case OP_BRAMINZERO:
    206     case OP_BRAPOSZERO:
    207     case OP_SKIPZERO:
    208     cc += PRIV(OP_lengths)[*cc];
    209     do cc += GET(cc, 1); while (*cc == OP_ALT);
    210     cc += 1 + LINK_SIZE;
    211     break;
    212 
    213     /* Handle literal characters and + repetitions */
    214 
    215     case OP_CHAR:
    216     case OP_CHARI:
    217     case OP_NOT:
    218     case OP_NOTI:
    219     case OP_PLUS:
    220     case OP_PLUSI:
    221     case OP_MINPLUS:
    222     case OP_MINPLUSI:
    223     case OP_POSPLUS:
    224     case OP_POSPLUSI:
    225     case OP_NOTPLUS:
    226     case OP_NOTPLUSI:
    227     case OP_NOTMINPLUS:
    228     case OP_NOTMINPLUSI:
    229     case OP_NOTPOSPLUS:
    230     case OP_NOTPOSPLUSI:
    231     branchlength++;
    232     cc += 2;
    233 #ifdef SUPPORT_UTF
    234     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    235 #endif
    236     break;
    237 
    238     case OP_TYPEPLUS:
    239     case OP_TYPEMINPLUS:
    240     case OP_TYPEPOSPLUS:
    241     branchlength++;
    242     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
    243     break;
    244 
    245     /* Handle exact repetitions. The count is already in characters, but we
    246     need to skip over a multibyte character in UTF8 mode.  */
    247 
    248     case OP_EXACT:
    249     case OP_EXACTI:
    250     case OP_NOTEXACT:
    251     case OP_NOTEXACTI:
    252     branchlength += GET2(cc,1);
    253     cc += 2 + IMM2_SIZE;
    254 #ifdef SUPPORT_UTF
    255     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    256 #endif
    257     break;
    258 
    259     case OP_TYPEEXACT:
    260     branchlength += GET2(cc,1);
    261     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
    262       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
    263     break;
    264 
    265     /* Handle single-char non-literal matchers */
    266 
    267     case OP_PROP:
    268     case OP_NOTPROP:
    269     cc += 2;
    270     /* Fall through */
    271 
    272     case OP_NOT_DIGIT:
    273     case OP_DIGIT:
    274     case OP_NOT_WHITESPACE:
    275     case OP_WHITESPACE:
    276     case OP_NOT_WORDCHAR:
    277     case OP_WORDCHAR:
    278     case OP_ANY:
    279     case OP_ALLANY:
    280     case OP_EXTUNI:
    281     case OP_HSPACE:
    282     case OP_NOT_HSPACE:
    283     case OP_VSPACE:
    284     case OP_NOT_VSPACE:
    285     branchlength++;
    286     cc++;
    287     break;
    288 
    289     /* "Any newline" might match two characters, but it also might match just
    290     one. */
    291 
    292     case OP_ANYNL:
    293     branchlength += 1;
    294     cc++;
    295     break;
    296 
    297     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
    298     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
    299     appear, but leave the code, just in case.) */
    300 
    301     case OP_ANYBYTE:
    302 #ifdef SUPPORT_UTF
    303     if (utf) return -1;
    304 #endif
    305     branchlength++;
    306     cc++;
    307     break;
    308 
    309     /* For repeated character types, we have to test for \p and \P, which have
    310     an extra two bytes of parameters. */
    311 
    312     case OP_TYPESTAR:
    313     case OP_TYPEMINSTAR:
    314     case OP_TYPEQUERY:
    315     case OP_TYPEMINQUERY:
    316     case OP_TYPEPOSSTAR:
    317     case OP_TYPEPOSQUERY:
    318     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
    319     cc += PRIV(OP_lengths)[op];
    320     break;
    321 
    322     case OP_TYPEUPTO:
    323     case OP_TYPEMINUPTO:
    324     case OP_TYPEPOSUPTO:
    325     if (cc[1 + IMM2_SIZE] == OP_PROP
    326       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
    327     cc += PRIV(OP_lengths)[op];
    328     break;
    329 
    330     /* Check a class for variable quantification */
    331 
    332     case OP_CLASS:
    333     case OP_NCLASS:
    334 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
    335     case OP_XCLASS:
    336     /* The original code caused an unsigned overflow in 64 bit systems,
    337     so now we use a conditional statement. */
    338     if (op == OP_XCLASS)
    339       cc += GET(cc, 1);
    340     else
    341       cc += PRIV(OP_lengths)[OP_CLASS];
    342 #else
    343     cc += PRIV(OP_lengths)[OP_CLASS];
    344 #endif
    345 
    346     switch (*cc)
    347       {
    348       case OP_CRPLUS:
    349       case OP_CRMINPLUS:
    350       case OP_CRPOSPLUS:
    351       branchlength++;
    352       /* Fall through */
    353 
    354       case OP_CRSTAR:
    355       case OP_CRMINSTAR:
    356       case OP_CRQUERY:
    357       case OP_CRMINQUERY:
    358       case OP_CRPOSSTAR:
    359       case OP_CRPOSQUERY:
    360       cc++;
    361       break;
    362 
    363       case OP_CRRANGE:
    364       case OP_CRMINRANGE:
    365       case OP_CRPOSRANGE:
    366       branchlength += GET2(cc,1);
    367       cc += 1 + 2 * IMM2_SIZE;
    368       break;
    369 
    370       default:
    371       branchlength++;
    372       break;
    373       }
    374     break;
    375 
    376     /* Backreferences and subroutine calls are treated in the same way: we find
    377     the minimum length for the subpattern. A recursion, however, causes an
    378     a flag to be set that causes the length of this branch to be ignored. The
    379     logic is that a recursion can only make sense if there is another
    380     alternation that stops the recursing. That will provide the minimum length
    381     (when no recursion happens). A backreference within the group that it is
    382     referencing behaves in the same way.
    383 
    384     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
    385     matches an empty string (by default it causes a matching failure), so in
    386     that case we must set the minimum length to zero. */
    387 
    388     case OP_DNREF:     /* Duplicate named pattern back reference */
    389     case OP_DNREFI:
    390     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
    391       {
    392       int count = GET2(cc, 1+IMM2_SIZE);
    393       pcre_uchar *slot = (pcre_uchar *)re +
    394         re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
    395       d = INT_MAX;
    396       while (count-- > 0)
    397         {
    398         ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
    399         if (cs == NULL) return -2;
    400         do ce += GET(ce, 1); while (*ce == OP_ALT);
    401         if (cc > cs && cc < ce)     /* Simple recursion */
    402           {
    403           d = 0;
    404           had_recurse = TRUE;
    405           break;
    406           }
    407         else
    408           {
    409           recurse_check *r = recurses;
    410           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    411           if (r != NULL)           /* Mutual recursion */
    412             {
    413             d = 0;
    414             had_recurse = TRUE;
    415             break;
    416             }
    417           else
    418             {
    419             int dd;
    420             this_recurse.prev = recurses;
    421             this_recurse.group = cs;
    422             dd = find_minlength(re, cs, startcode, options, &this_recurse,
    423               countptr);
    424             if (dd < d) d = dd;
    425             }
    426           }
    427         slot += re->name_entry_size;
    428         }
    429       }
    430     else d = 0;
    431     cc += 1 + 2*IMM2_SIZE;
    432     goto REPEAT_BACK_REFERENCE;
    433 
    434     case OP_REF:      /* Single back reference */
    435     case OP_REFI:
    436     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
    437       {
    438       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
    439       if (cs == NULL) return -2;
    440       do ce += GET(ce, 1); while (*ce == OP_ALT);
    441       if (cc > cs && cc < ce)    /* Simple recursion */
    442         {
    443         d = 0;
    444         had_recurse = TRUE;
    445         }
    446       else
    447         {
    448         recurse_check *r = recurses;
    449         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    450         if (r != NULL)           /* Mutual recursion */
    451           {
    452           d = 0;
    453           had_recurse = TRUE;
    454           }
    455         else
    456           {
    457           this_recurse.prev = recurses;
    458           this_recurse.group = cs;
    459           d = find_minlength(re, cs, startcode, options, &this_recurse,
    460             countptr);
    461           }
    462         }
    463       }
    464     else d = 0;
    465     cc += 1 + IMM2_SIZE;
    466 
    467     /* Handle repeated back references */
    468 
    469     REPEAT_BACK_REFERENCE:
    470     switch (*cc)
    471       {
    472       case OP_CRSTAR:
    473       case OP_CRMINSTAR:
    474       case OP_CRQUERY:
    475       case OP_CRMINQUERY:
    476       case OP_CRPOSSTAR:
    477       case OP_CRPOSQUERY:
    478       min = 0;
    479       cc++;
    480       break;
    481 
    482       case OP_CRPLUS:
    483       case OP_CRMINPLUS:
    484       case OP_CRPOSPLUS:
    485       min = 1;
    486       cc++;
    487       break;
    488 
    489       case OP_CRRANGE:
    490       case OP_CRMINRANGE:
    491       case OP_CRPOSRANGE:
    492       min = GET2(cc, 1);
    493       cc += 1 + 2 * IMM2_SIZE;
    494       break;
    495 
    496       default:
    497       min = 1;
    498       break;
    499       }
    500 
    501     branchlength += min * d;
    502     break;
    503 
    504     /* We can easily detect direct recursion, but not mutual recursion. This is
    505     caught by a recursion depth count. */
    506 
    507     case OP_RECURSE:
    508     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
    509     do ce += GET(ce, 1); while (*ce == OP_ALT);
    510     if (cc > cs && cc < ce)    /* Simple recursion */
    511       had_recurse = TRUE;
    512     else
    513       {
    514       recurse_check *r = recurses;
    515       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
    516       if (r != NULL)           /* Mutual recursion */
    517         had_recurse = TRUE;
    518       else
    519         {
    520         this_recurse.prev = recurses;
    521         this_recurse.group = cs;
    522         branchlength += find_minlength(re, cs, startcode, options,
    523           &this_recurse, countptr);
    524         }
    525       }
    526     cc += 1 + LINK_SIZE;
    527     break;
    528 
    529     /* Anything else does not or need not match a character. We can get the
    530     item's length from the table, but for those that can match zero occurrences
    531     of a character, we must take special action for UTF-8 characters. As it
    532     happens, the "NOT" versions of these opcodes are used at present only for
    533     ASCII characters, so they could be omitted from this list. However, in
    534     future that may change, so we include them here so as not to leave a
    535     gotcha for a future maintainer. */
    536 
    537     case OP_UPTO:
    538     case OP_UPTOI:
    539     case OP_NOTUPTO:
    540     case OP_NOTUPTOI:
    541     case OP_MINUPTO:
    542     case OP_MINUPTOI:
    543     case OP_NOTMINUPTO:
    544     case OP_NOTMINUPTOI:
    545     case OP_POSUPTO:
    546     case OP_POSUPTOI:
    547     case OP_NOTPOSUPTO:
    548     case OP_NOTPOSUPTOI:
    549 
    550     case OP_STAR:
    551     case OP_STARI:
    552     case OP_NOTSTAR:
    553     case OP_NOTSTARI:
    554     case OP_MINSTAR:
    555     case OP_MINSTARI:
    556     case OP_NOTMINSTAR:
    557     case OP_NOTMINSTARI:
    558     case OP_POSSTAR:
    559     case OP_POSSTARI:
    560     case OP_NOTPOSSTAR:
    561     case OP_NOTPOSSTARI:
    562 
    563     case OP_QUERY:
    564     case OP_QUERYI:
    565     case OP_NOTQUERY:
    566     case OP_NOTQUERYI:
    567     case OP_MINQUERY:
    568     case OP_MINQUERYI:
    569     case OP_NOTMINQUERY:
    570     case OP_NOTMINQUERYI:
    571     case OP_POSQUERY:
    572     case OP_POSQUERYI:
    573     case OP_NOTPOSQUERY:
    574     case OP_NOTPOSQUERYI:
    575 
    576     cc += PRIV(OP_lengths)[op];
    577 #ifdef SUPPORT_UTF
    578     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
    579 #endif
    580     break;
    581 
    582     /* Skip these, but we need to add in the name length. */
    583 
    584     case OP_MARK:
    585     case OP_PRUNE_ARG:
    586     case OP_SKIP_ARG:
    587     case OP_THEN_ARG:
    588     cc += PRIV(OP_lengths)[op] + cc[1];
    589     break;
    590 
    591     /* The remaining opcodes are just skipped over. */
    592 
    593     case OP_CLOSE:
    594     case OP_COMMIT:
    595     case OP_FAIL:
    596     case OP_PRUNE:
    597     case OP_SET_SOM:
    598     case OP_SKIP:
    599     case OP_THEN:
    600     cc += PRIV(OP_lengths)[op];
    601     break;
    602 
    603     /* This should not occur: we list all opcodes explicitly so that when
    604     new ones get added they are properly considered. */
    605 
    606     default:
    607     return -3;
    608     }
    609   }
    610 /* Control never gets here */
    611 }
    612 
    613 
    614 
    615 /*************************************************
    616 *      Set a bit and maybe its alternate case    *
    617 *************************************************/
    618 
    619 /* Given a character, set its first byte's bit in the table, and also the
    620 corresponding bit for the other version of a letter if we are caseless. In
    621 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
    622 when Unicode property support is available.
    623 
    624 Arguments:
    625   start_bits    points to the bit map
    626   p             points to the character
    627   caseless      the caseless flag
    628   cd            the block with char table pointers
    629   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
    630 
    631 Returns:        pointer after the character
    632 */
    633 
    634 static const pcre_uchar *
    635 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
    636   compile_data *cd, BOOL utf)
    637 {
    638 pcre_uint32 c = *p;
    639 
    640 #ifdef COMPILE_PCRE8
    641 SET_BIT(c);
    642 
    643 #ifdef SUPPORT_UTF
    644 if (utf && c > 127)
    645   {
    646   GETCHARINC(c, p);
    647 #ifdef SUPPORT_UCP
    648   if (caseless)
    649     {
    650     pcre_uchar buff[6];
    651     c = UCD_OTHERCASE(c);
    652     (void)PRIV(ord2utf)(c, buff);
    653     SET_BIT(buff[0]);
    654     }
    655 #endif  /* Not SUPPORT_UCP */
    656   return p;
    657   }
    658 #else   /* Not SUPPORT_UTF */
    659 (void)(utf);   /* Stops warning for unused parameter */
    660 #endif  /* SUPPORT_UTF */
    661 
    662 /* Not UTF-8 mode, or character is less than 127. */
    663 
    664 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
    665 return p + 1;
    666 #endif  /* COMPILE_PCRE8 */
    667 
    668 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
    669 if (c > 0xff)
    670   {
    671   c = 0xff;
    672   caseless = FALSE;
    673   }
    674 SET_BIT(c);
    675 
    676 #ifdef SUPPORT_UTF
    677 if (utf && c > 127)
    678   {
    679   GETCHARINC(c, p);
    680 #ifdef SUPPORT_UCP
    681   if (caseless)
    682     {
    683     c = UCD_OTHERCASE(c);
    684     if (c > 0xff)
    685       c = 0xff;
    686     SET_BIT(c);
    687     }
    688 #endif  /* SUPPORT_UCP */
    689   return p;
    690   }
    691 #else   /* Not SUPPORT_UTF */
    692 (void)(utf);   /* Stops warning for unused parameter */
    693 #endif  /* SUPPORT_UTF */
    694 
    695 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
    696 return p + 1;
    697 #endif
    698 }
    699 
    700 
    701 
    702 /*************************************************
    703 *     Set bits for a positive character type     *
    704 *************************************************/
    705 
    706 /* This function sets starting bits for a character type. In UTF-8 mode, we can
    707 only do a direct setting for bytes less than 128, as otherwise there can be
    708 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
    709 environment, the tables will only recognize ASCII characters anyway, but in at
    710 least one Windows environment, some higher bytes bits were set in the tables.
    711 So we deal with that case by considering the UTF-8 encoding.
    712 
    713 Arguments:
    714   start_bits     the starting bitmap
    715   cbit type      the type of character wanted
    716   table_limit    32 for non-UTF-8; 16 for UTF-8
    717   cd             the block with char table pointers
    718 
    719 Returns:         nothing
    720 */
    721 
    722 static void
    723 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
    724   compile_data *cd)
    725 {
    726 register pcre_uint32 c;
    727 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
    728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
    729 if (table_limit == 32) return;
    730 for (c = 128; c < 256; c++)
    731   {
    732   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
    733     {
    734     pcre_uchar buff[6];
    735     (void)PRIV(ord2utf)(c, buff);
    736     SET_BIT(buff[0]);
    737     }
    738   }
    739 #endif
    740 }
    741 
    742 
    743 /*************************************************
    744 *     Set bits for a negative character type     *
    745 *************************************************/
    746 
    747 /* This function sets starting bits for a negative character type such as \D.
    748 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
    749 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
    750 Unlike in the positive case, where we can set appropriate starting bits for
    751 specific high-valued UTF-8 characters, in this case we have to set the bits for
    752 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
    753 0xc0 (192) for simplicity.
    754 
    755 Arguments:
    756   start_bits     the starting bitmap
    757   cbit type      the type of character wanted
    758   table_limit    32 for non-UTF-8; 16 for UTF-8
    759   cd             the block with char table pointers
    760 
    761 Returns:         nothing
    762 */
    763 
    764 static void
    765 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
    766   compile_data *cd)
    767 {
    768 register pcre_uint32 c;
    769 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
    770 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
    771 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
    772 #endif
    773 }
    774 
    775 
    776 
    777 /*************************************************
    778 *          Create bitmap of starting bytes       *
    779 *************************************************/
    780 
    781 /* This function scans a compiled unanchored expression recursively and
    782 attempts to build a bitmap of the set of possible starting bytes. As time goes
    783 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
    784 useful for parenthesized groups in patterns such as (a*)b where the group
    785 provides some optional starting bytes but scanning must continue at the outer
    786 level to find at least one mandatory byte. At the outermost level, this
    787 function fails unless the result is SSB_DONE.
    788 
    789 Arguments:
    790   code         points to an expression
    791   start_bits   points to a 32-byte table, initialized to 0
    792   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
    793   cd           the block with char table pointers
    794 
    795 Returns:       SSB_FAIL     => Failed to find any starting bytes
    796                SSB_DONE     => Found mandatory starting bytes
    797                SSB_CONTINUE => Found optional starting bytes
    798                SSB_UNKNOWN  => Hit an unrecognized opcode
    799 */
    800 
    801 static int
    802 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
    803   compile_data *cd)
    804 {
    805 register pcre_uint32 c;
    806 int yield = SSB_DONE;
    807 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
    808 int table_limit = utf? 16:32;
    809 #else
    810 int table_limit = 32;
    811 #endif
    812 
    813 #if 0
    814 /* ========================================================================= */
    815 /* The following comment and code was inserted in January 1999. In May 2006,
    816 when it was observed to cause compiler warnings about unused values, I took it
    817 out again. If anybody is still using OS/2, they will have to put it back
    818 manually. */
    819 
    820 /* This next statement and the later reference to dummy are here in order to
    821 trick the optimizer of the IBM C compiler for OS/2 into generating correct
    822 code. Apparently IBM isn't going to fix the problem, and we would rather not
    823 disable optimization (in this module it actually makes a big difference, and
    824 the pcre module can use all the optimization it can get). */
    825 
    826 volatile int dummy;
    827 /* ========================================================================= */
    828 #endif
    829 
    830 do
    831   {
    832   BOOL try_next = TRUE;
    833   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
    834 
    835   if (*code == OP_CBRA || *code == OP_SCBRA ||
    836       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
    837 
    838   while (try_next)    /* Loop for items in this branch */
    839     {
    840     int rc;
    841 
    842     switch(*tcode)
    843       {
    844       /* If we reach something we don't understand, it means a new opcode has
    845       been created that hasn't been added to this code. Hopefully this problem
    846       will be discovered during testing. */
    847 
    848       default:
    849       return SSB_UNKNOWN;
    850 
    851       /* Fail for a valid opcode that implies no starting bits. */
    852 
    853       case OP_ACCEPT:
    854       case OP_ASSERT_ACCEPT:
    855       case OP_ALLANY:
    856       case OP_ANY:
    857       case OP_ANYBYTE:
    858       case OP_CIRC:
    859       case OP_CIRCM:
    860       case OP_CLOSE:
    861       case OP_COMMIT:
    862       case OP_COND:
    863       case OP_CREF:
    864       case OP_DEF:
    865       case OP_DNCREF:
    866       case OP_DNREF:
    867       case OP_DNREFI:
    868       case OP_DNRREF:
    869       case OP_DOLL:
    870       case OP_DOLLM:
    871       case OP_END:
    872       case OP_EOD:
    873       case OP_EODN:
    874       case OP_EXTUNI:
    875       case OP_FAIL:
    876       case OP_MARK:
    877       case OP_NOT:
    878       case OP_NOTEXACT:
    879       case OP_NOTEXACTI:
    880       case OP_NOTI:
    881       case OP_NOTMINPLUS:
    882       case OP_NOTMINPLUSI:
    883       case OP_NOTMINQUERY:
    884       case OP_NOTMINQUERYI:
    885       case OP_NOTMINSTAR:
    886       case OP_NOTMINSTARI:
    887       case OP_NOTMINUPTO:
    888       case OP_NOTMINUPTOI:
    889       case OP_NOTPLUS:
    890       case OP_NOTPLUSI:
    891       case OP_NOTPOSPLUS:
    892       case OP_NOTPOSPLUSI:
    893       case OP_NOTPOSQUERY:
    894       case OP_NOTPOSQUERYI:
    895       case OP_NOTPOSSTAR:
    896       case OP_NOTPOSSTARI:
    897       case OP_NOTPOSUPTO:
    898       case OP_NOTPOSUPTOI:
    899       case OP_NOTPROP:
    900       case OP_NOTQUERY:
    901       case OP_NOTQUERYI:
    902       case OP_NOTSTAR:
    903       case OP_NOTSTARI:
    904       case OP_NOTUPTO:
    905       case OP_NOTUPTOI:
    906       case OP_NOT_HSPACE:
    907       case OP_NOT_VSPACE:
    908       case OP_PRUNE:
    909       case OP_PRUNE_ARG:
    910       case OP_RECURSE:
    911       case OP_REF:
    912       case OP_REFI:
    913       case OP_REVERSE:
    914       case OP_RREF:
    915       case OP_SCOND:
    916       case OP_SET_SOM:
    917       case OP_SKIP:
    918       case OP_SKIP_ARG:
    919       case OP_SOD:
    920       case OP_SOM:
    921       case OP_THEN:
    922       case OP_THEN_ARG:
    923       return SSB_FAIL;
    924 
    925       /* A "real" property test implies no starting bits, but the fake property
    926       PT_CLIST identifies a list of characters. These lists are short, as they
    927       are used for characters with more than one "other case", so there is no
    928       point in recognizing them for OP_NOTPROP. */
    929 
    930       case OP_PROP:
    931       if (tcode[1] != PT_CLIST) return SSB_FAIL;
    932         {
    933         const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
    934         while ((c = *p++) < NOTACHAR)
    935           {
    936 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
    937           if (utf)
    938             {
    939             pcre_uchar buff[6];
    940             (void)PRIV(ord2utf)(c, buff);
    941             c = buff[0];
    942             }
    943 #endif
    944           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
    945           }
    946         }
    947       try_next = FALSE;
    948       break;
    949 
    950       /* We can ignore word boundary tests. */
    951 
    952       case OP_WORD_BOUNDARY:
    953       case OP_NOT_WORD_BOUNDARY:
    954       tcode++;
    955       break;
    956 
    957       /* If we hit a bracket or a positive lookahead assertion, recurse to set
    958       bits from within the subpattern. If it can't find anything, we have to
    959       give up. If it finds some mandatory character(s), we are done for this
    960       branch. Otherwise, carry on scanning after the subpattern. */
    961 
    962       case OP_BRA:
    963       case OP_SBRA:
    964       case OP_CBRA:
    965       case OP_SCBRA:
    966       case OP_BRAPOS:
    967       case OP_SBRAPOS:
    968       case OP_CBRAPOS:
    969       case OP_SCBRAPOS:
    970       case OP_ONCE:
    971       case OP_ONCE_NC:
    972       case OP_ASSERT:
    973       rc = set_start_bits(tcode, start_bits, utf, cd);
    974       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
    975       if (rc == SSB_DONE) try_next = FALSE; else
    976         {
    977         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
    978         tcode += 1 + LINK_SIZE;
    979         }
    980       break;
    981 
    982       /* If we hit ALT or KET, it means we haven't found anything mandatory in
    983       this branch, though we might have found something optional. For ALT, we
    984       continue with the next alternative, but we have to arrange that the final
    985       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
    986       return SSB_CONTINUE: if this is the top level, that indicates failure,
    987       but after a nested subpattern, it causes scanning to continue. */
    988 
    989       case OP_ALT:
    990       yield = SSB_CONTINUE;
    991       try_next = FALSE;
    992       break;
    993 
    994       case OP_KET:
    995       case OP_KETRMAX:
    996       case OP_KETRMIN:
    997       case OP_KETRPOS:
    998       return SSB_CONTINUE;
    999 
   1000       /* Skip over callout */
   1001 
   1002       case OP_CALLOUT:
   1003       tcode += 2 + 2*LINK_SIZE;
   1004       break;
   1005 
   1006       /* Skip over lookbehind and negative lookahead assertions */
   1007 
   1008       case OP_ASSERT_NOT:
   1009       case OP_ASSERTBACK:
   1010       case OP_ASSERTBACK_NOT:
   1011       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
   1012       tcode += 1 + LINK_SIZE;
   1013       break;
   1014 
   1015       /* BRAZERO does the bracket, but carries on. */
   1016 
   1017       case OP_BRAZERO:
   1018       case OP_BRAMINZERO:
   1019       case OP_BRAPOSZERO:
   1020       rc = set_start_bits(++tcode, start_bits, utf, cd);
   1021       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
   1022 /* =========================================================================
   1023       See the comment at the head of this function concerning the next line,
   1024       which was an old fudge for the benefit of OS/2.
   1025       dummy = 1;
   1026   ========================================================================= */
   1027       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
   1028       tcode += 1 + LINK_SIZE;
   1029       break;
   1030 
   1031       /* SKIPZERO skips the bracket. */
   1032 
   1033       case OP_SKIPZERO:
   1034       tcode++;
   1035       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
   1036       tcode += 1 + LINK_SIZE;
   1037       break;
   1038 
   1039       /* Single-char * or ? sets the bit and tries the next item */
   1040 
   1041       case OP_STAR:
   1042       case OP_MINSTAR:
   1043       case OP_POSSTAR:
   1044       case OP_QUERY:
   1045       case OP_MINQUERY:
   1046       case OP_POSQUERY:
   1047       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
   1048       break;
   1049 
   1050       case OP_STARI:
   1051       case OP_MINSTARI:
   1052       case OP_POSSTARI:
   1053       case OP_QUERYI:
   1054       case OP_MINQUERYI:
   1055       case OP_POSQUERYI:
   1056       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
   1057       break;
   1058 
   1059       /* Single-char upto sets the bit and tries the next */
   1060 
   1061       case OP_UPTO:
   1062       case OP_MINUPTO:
   1063       case OP_POSUPTO:
   1064       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
   1065       break;
   1066 
   1067       case OP_UPTOI:
   1068       case OP_MINUPTOI:
   1069       case OP_POSUPTOI:
   1070       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
   1071       break;
   1072 
   1073       /* At least one single char sets the bit and stops */
   1074 
   1075       case OP_EXACT:
   1076       tcode += IMM2_SIZE;
   1077       /* Fall through */
   1078       case OP_CHAR:
   1079       case OP_PLUS:
   1080       case OP_MINPLUS:
   1081       case OP_POSPLUS:
   1082       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
   1083       try_next = FALSE;
   1084       break;
   1085 
   1086       case OP_EXACTI:
   1087       tcode += IMM2_SIZE;
   1088       /* Fall through */
   1089       case OP_CHARI:
   1090       case OP_PLUSI:
   1091       case OP_MINPLUSI:
   1092       case OP_POSPLUSI:
   1093       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
   1094       try_next = FALSE;
   1095       break;
   1096 
   1097       /* Special spacing and line-terminating items. These recognize specific
   1098       lists of characters. The difference between VSPACE and ANYNL is that the
   1099       latter can match the two-character CRLF sequence, but that is not
   1100       relevant for finding the first character, so their code here is
   1101       identical. */
   1102 
   1103       case OP_HSPACE:
   1104       SET_BIT(CHAR_HT);
   1105       SET_BIT(CHAR_SPACE);
   1106 #ifdef SUPPORT_UTF
   1107       if (utf)
   1108         {
   1109 #ifdef COMPILE_PCRE8
   1110         SET_BIT(0xC2);  /* For U+00A0 */
   1111         SET_BIT(0xE1);  /* For U+1680, U+180E */
   1112         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
   1113         SET_BIT(0xE3);  /* For U+3000 */
   1114 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1115         SET_BIT(0xA0);
   1116         SET_BIT(0xFF);  /* For characters > 255 */
   1117 #endif  /* COMPILE_PCRE[8|16|32] */
   1118         }
   1119       else
   1120 #endif /* SUPPORT_UTF */
   1121         {
   1122 #ifndef EBCDIC
   1123         SET_BIT(0xA0);
   1124 #endif  /* Not EBCDIC */
   1125 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1126         SET_BIT(0xFF);  /* For characters > 255 */
   1127 #endif  /* COMPILE_PCRE[16|32] */
   1128         }
   1129       try_next = FALSE;
   1130       break;
   1131 
   1132       case OP_ANYNL:
   1133       case OP_VSPACE:
   1134       SET_BIT(CHAR_LF);
   1135       SET_BIT(CHAR_VT);
   1136       SET_BIT(CHAR_FF);
   1137       SET_BIT(CHAR_CR);
   1138 #ifdef SUPPORT_UTF
   1139       if (utf)
   1140         {
   1141 #ifdef COMPILE_PCRE8
   1142         SET_BIT(0xC2);  /* For U+0085 */
   1143         SET_BIT(0xE2);  /* For U+2028, U+2029 */
   1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1145         SET_BIT(CHAR_NEL);
   1146         SET_BIT(0xFF);  /* For characters > 255 */
   1147 #endif  /* COMPILE_PCRE[8|16|32] */
   1148         }
   1149       else
   1150 #endif /* SUPPORT_UTF */
   1151         {
   1152         SET_BIT(CHAR_NEL);
   1153 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1154         SET_BIT(0xFF);  /* For characters > 255 */
   1155 #endif
   1156         }
   1157       try_next = FALSE;
   1158       break;
   1159 
   1160       /* Single character types set the bits and stop. Note that if PCRE_UCP
   1161       is set, we do not see these op codes because \d etc are converted to
   1162       properties. Therefore, these apply in the case when only characters less
   1163       than 256 are recognized to match the types. */
   1164 
   1165       case OP_NOT_DIGIT:
   1166       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
   1167       try_next = FALSE;
   1168       break;
   1169 
   1170       case OP_DIGIT:
   1171       set_type_bits(start_bits, cbit_digit, table_limit, cd);
   1172       try_next = FALSE;
   1173       break;
   1174 
   1175       /* The cbit_space table has vertical tab as whitespace; we no longer
   1176       have to play fancy tricks because Perl added VT to its whitespace at
   1177       release 5.18. PCRE added it at release 8.34. */
   1178 
   1179       case OP_NOT_WHITESPACE:
   1180       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
   1181       try_next = FALSE;
   1182       break;
   1183 
   1184       case OP_WHITESPACE:
   1185       set_type_bits(start_bits, cbit_space, table_limit, cd);
   1186       try_next = FALSE;
   1187       break;
   1188 
   1189       case OP_NOT_WORDCHAR:
   1190       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
   1191       try_next = FALSE;
   1192       break;
   1193 
   1194       case OP_WORDCHAR:
   1195       set_type_bits(start_bits, cbit_word, table_limit, cd);
   1196       try_next = FALSE;
   1197       break;
   1198 
   1199       /* One or more character type fudges the pointer and restarts, knowing
   1200       it will hit a single character type and stop there. */
   1201 
   1202       case OP_TYPEPLUS:
   1203       case OP_TYPEMINPLUS:
   1204       case OP_TYPEPOSPLUS:
   1205       tcode++;
   1206       break;
   1207 
   1208       case OP_TYPEEXACT:
   1209       tcode += 1 + IMM2_SIZE;
   1210       break;
   1211 
   1212       /* Zero or more repeats of character types set the bits and then
   1213       try again. */
   1214 
   1215       case OP_TYPEUPTO:
   1216       case OP_TYPEMINUPTO:
   1217       case OP_TYPEPOSUPTO:
   1218       tcode += IMM2_SIZE;  /* Fall through */
   1219 
   1220       case OP_TYPESTAR:
   1221       case OP_TYPEMINSTAR:
   1222       case OP_TYPEPOSSTAR:
   1223       case OP_TYPEQUERY:
   1224       case OP_TYPEMINQUERY:
   1225       case OP_TYPEPOSQUERY:
   1226       switch(tcode[1])
   1227         {
   1228         default:
   1229         case OP_ANY:
   1230         case OP_ALLANY:
   1231         return SSB_FAIL;
   1232 
   1233         case OP_HSPACE:
   1234         SET_BIT(CHAR_HT);
   1235         SET_BIT(CHAR_SPACE);
   1236 #ifdef SUPPORT_UTF
   1237         if (utf)
   1238           {
   1239 #ifdef COMPILE_PCRE8
   1240           SET_BIT(0xC2);  /* For U+00A0 */
   1241           SET_BIT(0xE1);  /* For U+1680, U+180E */
   1242           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
   1243           SET_BIT(0xE3);  /* For U+3000 */
   1244 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1245           SET_BIT(0xA0);
   1246           SET_BIT(0xFF);  /* For characters > 255 */
   1247 #endif  /* COMPILE_PCRE[8|16|32] */
   1248           }
   1249         else
   1250 #endif /* SUPPORT_UTF */
   1251 #ifndef EBCDIC
   1252           SET_BIT(0xA0);
   1253 #endif  /* Not EBCDIC */
   1254         break;
   1255 
   1256         case OP_ANYNL:
   1257         case OP_VSPACE:
   1258         SET_BIT(CHAR_LF);
   1259         SET_BIT(CHAR_VT);
   1260         SET_BIT(CHAR_FF);
   1261         SET_BIT(CHAR_CR);
   1262 #ifdef SUPPORT_UTF
   1263         if (utf)
   1264           {
   1265 #ifdef COMPILE_PCRE8
   1266           SET_BIT(0xC2);  /* For U+0085 */
   1267           SET_BIT(0xE2);  /* For U+2028, U+2029 */
   1268 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1269           SET_BIT(CHAR_NEL);
   1270           SET_BIT(0xFF);  /* For characters > 255 */
   1271 #endif  /* COMPILE_PCRE16 */
   1272           }
   1273         else
   1274 #endif /* SUPPORT_UTF */
   1275           SET_BIT(CHAR_NEL);
   1276         break;
   1277 
   1278         case OP_NOT_DIGIT:
   1279         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
   1280         break;
   1281 
   1282         case OP_DIGIT:
   1283         set_type_bits(start_bits, cbit_digit, table_limit, cd);
   1284         break;
   1285 
   1286         /* The cbit_space table has vertical tab as whitespace; we no longer
   1287         have to play fancy tricks because Perl added VT to its whitespace at
   1288         release 5.18. PCRE added it at release 8.34. */
   1289 
   1290         case OP_NOT_WHITESPACE:
   1291         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
   1292         break;
   1293 
   1294         case OP_WHITESPACE:
   1295         set_type_bits(start_bits, cbit_space, table_limit, cd);
   1296         break;
   1297 
   1298         case OP_NOT_WORDCHAR:
   1299         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
   1300         break;
   1301 
   1302         case OP_WORDCHAR:
   1303         set_type_bits(start_bits, cbit_word, table_limit, cd);
   1304         break;
   1305         }
   1306 
   1307       tcode += 2;
   1308       break;
   1309 
   1310       /* Character class where all the information is in a bit map: set the
   1311       bits and either carry on or not, according to the repeat count. If it was
   1312       a negative class, and we are operating with UTF-8 characters, any byte
   1313       with a value >= 0xc4 is a potentially valid starter because it starts a
   1314       character with a value > 255. */
   1315 
   1316 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   1317       case OP_XCLASS:
   1318       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
   1319         return SSB_FAIL;
   1320       /* All bits are set. */
   1321       if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
   1322         return SSB_FAIL;
   1323 #endif
   1324       /* Fall through */
   1325 
   1326       case OP_NCLASS:
   1327 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
   1328       if (utf)
   1329         {
   1330         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
   1331         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
   1332         }
   1333 #endif
   1334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
   1335       SET_BIT(0xFF);                         /* For characters > 255 */
   1336 #endif
   1337       /* Fall through */
   1338 
   1339       case OP_CLASS:
   1340         {
   1341         pcre_uint8 *map;
   1342 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   1343         map = NULL;
   1344         if (*tcode == OP_XCLASS)
   1345           {
   1346           if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
   1347             map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
   1348           tcode += GET(tcode, 1);
   1349           }
   1350         else
   1351 #endif
   1352           {
   1353           tcode++;
   1354           map = (pcre_uint8 *)tcode;
   1355           tcode += 32 / sizeof(pcre_uchar);
   1356           }
   1357 
   1358         /* In UTF-8 mode, the bits in a bit map correspond to character
   1359         values, not to byte values. However, the bit map we are constructing is
   1360         for byte values. So we have to do a conversion for characters whose
   1361         value is > 127. In fact, there are only two possible starting bytes for
   1362         characters in the range 128 - 255. */
   1363 
   1364 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
   1365         if (map != NULL)
   1366 #endif
   1367           {
   1368 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
   1369           if (utf)
   1370             {
   1371             for (c = 0; c < 16; c++) start_bits[c] |= map[c];
   1372             for (c = 128; c < 256; c++)
   1373               {
   1374               if ((map[c/8] && (1 << (c&7))) != 0)
   1375                 {
   1376                 int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
   1377                 start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
   1378                 c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
   1379                 }
   1380               }
   1381             }
   1382           else
   1383 #endif
   1384             {
   1385             /* In non-UTF-8 mode, the two bit maps are completely compatible. */
   1386             for (c = 0; c < 32; c++) start_bits[c] |= map[c];
   1387             }
   1388           }
   1389 
   1390         /* Advance past the bit map, and act on what follows. For a zero
   1391         minimum repeat, continue; otherwise stop processing. */
   1392 
   1393         switch (*tcode)
   1394           {
   1395           case OP_CRSTAR:
   1396           case OP_CRMINSTAR:
   1397           case OP_CRQUERY:
   1398           case OP_CRMINQUERY:
   1399           case OP_CRPOSSTAR:
   1400           case OP_CRPOSQUERY:
   1401           tcode++;
   1402           break;
   1403 
   1404           case OP_CRRANGE:
   1405           case OP_CRMINRANGE:
   1406           case OP_CRPOSRANGE:
   1407           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
   1408             else try_next = FALSE;
   1409           break;
   1410 
   1411           default:
   1412           try_next = FALSE;
   1413           break;
   1414           }
   1415         }
   1416       break; /* End of bitmap class handling */
   1417 
   1418       }      /* End of switch */
   1419     }        /* End of try_next loop */
   1420 
   1421   code += GET(code, 1);   /* Advance to next branch */
   1422   }
   1423 while (*code == OP_ALT);
   1424 return yield;
   1425 }
   1426 
   1427 
   1428 
   1429 
   1430 
   1431 /*************************************************
   1432 *          Study a compiled expression           *
   1433 *************************************************/
   1434 
   1435 /* This function is handed a compiled expression that it must study to produce
   1436 information that will speed up the matching. It returns a pcre[16]_extra block
   1437 which then gets handed back to pcre_exec().
   1438 
   1439 Arguments:
   1440   re        points to the compiled expression
   1441   options   contains option bits
   1442   errorptr  points to where to place error messages;
   1443             set NULL unless error
   1444 
   1445 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
   1446               the appropriate flags set;
   1447             NULL on error or if no optimization possible
   1448 */
   1449 
   1450 #if defined COMPILE_PCRE8
   1451 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
   1452 pcre_study(const pcre *external_re, int options, const char **errorptr)
   1453 #elif defined COMPILE_PCRE16
   1454 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
   1455 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
   1456 #elif defined COMPILE_PCRE32
   1457 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
   1458 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
   1459 #endif
   1460 {
   1461 int min;
   1462 int count = 0;
   1463 BOOL bits_set = FALSE;
   1464 pcre_uint8 start_bits[32];
   1465 PUBL(extra) *extra = NULL;
   1466 pcre_study_data *study;
   1467 const pcre_uint8 *tables;
   1468 pcre_uchar *code;
   1469 compile_data compile_block;
   1470 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
   1471 
   1472 
   1473 *errorptr = NULL;
   1474 
   1475 if (re == NULL || re->magic_number != MAGIC_NUMBER)
   1476   {
   1477   *errorptr = "argument is not a compiled regular expression";
   1478   return NULL;
   1479   }
   1480 
   1481 if ((re->flags & PCRE_MODE) == 0)
   1482   {
   1483 #if defined COMPILE_PCRE8
   1484   *errorptr = "argument not compiled in 8 bit mode";
   1485 #elif defined COMPILE_PCRE16
   1486   *errorptr = "argument not compiled in 16 bit mode";
   1487 #elif defined COMPILE_PCRE32
   1488   *errorptr = "argument not compiled in 32 bit mode";
   1489 #endif
   1490   return NULL;
   1491   }
   1492 
   1493 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
   1494   {
   1495   *errorptr = "unknown or incorrect option bit(s) set";
   1496   return NULL;
   1497   }
   1498 
   1499 code = (pcre_uchar *)re + re->name_table_offset +
   1500   (re->name_count * re->name_entry_size);
   1501 
   1502 /* For an anchored pattern, or an unanchored pattern that has a first char, or
   1503 a multiline pattern that matches only at "line starts", there is no point in
   1504 seeking a list of starting bytes. */
   1505 
   1506 if ((re->options & PCRE_ANCHORED) == 0 &&
   1507     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
   1508   {
   1509   int rc;
   1510 
   1511   /* Set the character tables in the block that is passed around */
   1512 
   1513   tables = re->tables;
   1514 
   1515 #if defined COMPILE_PCRE8
   1516   if (tables == NULL)
   1517     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
   1518     (void *)(&tables));
   1519 #elif defined COMPILE_PCRE16
   1520   if (tables == NULL)
   1521     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
   1522     (void *)(&tables));
   1523 #elif defined COMPILE_PCRE32
   1524   if (tables == NULL)
   1525     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
   1526     (void *)(&tables));
   1527 #endif
   1528 
   1529   compile_block.lcc = tables + lcc_offset;
   1530   compile_block.fcc = tables + fcc_offset;
   1531   compile_block.cbits = tables + cbits_offset;
   1532   compile_block.ctypes = tables + ctypes_offset;
   1533 
   1534   /* See if we can find a fixed set of initial characters for the pattern. */
   1535 
   1536   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
   1537   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
   1538     &compile_block);
   1539   bits_set = rc == SSB_DONE;
   1540   if (rc == SSB_UNKNOWN)
   1541     {
   1542     *errorptr = "internal error: opcode not recognized";
   1543     return NULL;
   1544     }
   1545   }
   1546 
   1547 /* Find the minimum length of subject string. */
   1548 
   1549 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
   1550   {
   1551   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
   1552   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
   1553   default: break;
   1554   }
   1555 
   1556 /* If a set of starting bytes has been identified, or if the minimum length is
   1557 greater than zero, or if JIT optimization has been requested, or if
   1558 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
   1559 pcre_study_data block. The study data is put in the latter, which is pointed to
   1560 by the former, which may also get additional data set later by the calling
   1561 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
   1562 save it in a field for returning via the pcre_fullinfo() function so that if it
   1563 becomes variable in the future, we don't have to change that code. */
   1564 
   1565 if (bits_set || min > 0 || (options & (
   1566 #ifdef SUPPORT_JIT
   1567     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
   1568     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
   1569 #endif
   1570     PCRE_STUDY_EXTRA_NEEDED)) != 0)
   1571   {
   1572   extra = (PUBL(extra) *)(PUBL(malloc))
   1573     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
   1574   if (extra == NULL)
   1575     {
   1576     *errorptr = "failed to get memory";
   1577     return NULL;
   1578     }
   1579 
   1580   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
   1581   extra->flags = PCRE_EXTRA_STUDY_DATA;
   1582   extra->study_data = study;
   1583 
   1584   study->size = sizeof(pcre_study_data);
   1585   study->flags = 0;
   1586 
   1587   /* Set the start bits always, to avoid unset memory errors if the
   1588   study data is written to a file, but set the flag only if any of the bits
   1589   are set, to save time looking when none are. */
   1590 
   1591   if (bits_set)
   1592     {
   1593     study->flags |= PCRE_STUDY_MAPPED;
   1594     memcpy(study->start_bits, start_bits, sizeof(start_bits));
   1595     }
   1596   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
   1597 
   1598 #ifdef PCRE_DEBUG
   1599   if (bits_set)
   1600     {
   1601     pcre_uint8 *ptr = start_bits;
   1602     int i;
   1603 
   1604     printf("Start bits:\n");
   1605     for (i = 0; i < 32; i++)
   1606       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
   1607     }
   1608 #endif
   1609 
   1610   /* Always set the minlength value in the block, because the JIT compiler
   1611   makes use of it. However, don't set the bit unless the length is greater than
   1612   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
   1613   checking the zero case. */
   1614 
   1615   if (min > 0)
   1616     {
   1617     study->flags |= PCRE_STUDY_MINLEN;
   1618     study->minlength = min;
   1619     }
   1620   else study->minlength = 0;
   1621 
   1622   /* If JIT support was compiled and requested, attempt the JIT compilation.
   1623   If no starting bytes were found, and the minimum length is zero, and JIT
   1624   compilation fails, abandon the extra block and return NULL, unless
   1625   PCRE_STUDY_EXTRA_NEEDED is set. */
   1626 
   1627 #ifdef SUPPORT_JIT
   1628   extra->executable_jit = NULL;
   1629   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
   1630     PRIV(jit_compile)(re, extra, JIT_COMPILE);
   1631   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
   1632     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
   1633   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
   1634     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
   1635 
   1636   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
   1637       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
   1638     {
   1639 #if defined COMPILE_PCRE8
   1640     pcre_free_study(extra);
   1641 #elif defined COMPILE_PCRE16
   1642     pcre16_free_study(extra);
   1643 #elif defined COMPILE_PCRE32
   1644     pcre32_free_study(extra);
   1645 #endif
   1646     extra = NULL;
   1647     }
   1648 #endif
   1649   }
   1650 
   1651 return extra;
   1652 }
   1653 
   1654 
   1655 /*************************************************
   1656 *          Free the study data                   *
   1657 *************************************************/
   1658 
   1659 /* This function frees the memory that was obtained by pcre_study().
   1660 
   1661 Argument:   a pointer to the pcre[16]_extra block
   1662 Returns:    nothing
   1663 */
   1664 
   1665 #if defined COMPILE_PCRE8
   1666 PCRE_EXP_DEFN void
   1667 pcre_free_study(pcre_extra *extra)
   1668 #elif defined COMPILE_PCRE16
   1669 PCRE_EXP_DEFN void
   1670 pcre16_free_study(pcre16_extra *extra)
   1671 #elif defined COMPILE_PCRE32
   1672 PCRE_EXP_DEFN void
   1673 pcre32_free_study(pcre32_extra *extra)
   1674 #endif
   1675 {
   1676 if (extra == NULL)
   1677   return;
   1678 #ifdef SUPPORT_JIT
   1679 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
   1680      extra->executable_jit != NULL)
   1681   PRIV(jit_free)(extra->executable_jit);
   1682 #endif
   1683 PUBL(free)(extra);
   1684 }
   1685 
   1686 /* End of pcre_study.c */
   1687