Home | History | Annotate | Download | only in pcre
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9            Copyright (c) 1997-2010 University of Cambridge
     10 
     11 -----------------------------------------------------------------------------
     12 Redistribution and use in source and binary forms, with or without
     13 modification, are permitted provided that the following conditions are met:
     14 
     15     * Redistributions of source code must retain the above copyright notice,
     16       this list of conditions and the following disclaimer.
     17 
     18     * Redistributions in binary form must reproduce the above copyright
     19       notice, this list of conditions and the following disclaimer in the
     20       documentation and/or other materials provided with the distribution.
     21 
     22     * Neither the name of the University of Cambridge nor the names of its
     23       contributors may be used to endorse or promote products derived from
     24       this software without specific prior written permission.
     25 
     26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36 POSSIBILITY OF SUCH DAMAGE.
     37 -----------------------------------------------------------------------------
     38 */
     39 
     40 
     41 /* This module contains the external function pcre_study(), along with local
     42 supporting functions. */
     43 
     44 
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 
     49 #include "pcre_internal.h"
     50 
     51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
     52 
     53 /* Returns from set_start_bits() */
     54 
     55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
     56 
     57 
     58 
     59 /*************************************************
     60 *   Find the minimum subject length for a group  *
     61 *************************************************/
     62 
     63 /* Scan a parenthesized group and compute the minimum length of subject that
     64 is needed to match it. This is a lower bound; it does not mean there is a
     65 string of that length that matches. In UTF8 mode, the result is in characters
     66 rather than bytes.
     67 
     68 Arguments:
     69   code       pointer to start of group (the bracket)
     70   startcode  pointer to start of the whole pattern
     71   options    the compiling options
     72 
     73 Returns:   the minimum length
     74            -1 if \C was encountered
     75            -2 internal error (missing capturing bracket)
     76 */
     77 
     78 static int
     79 find_minlength(const uschar *code, const uschar *startcode, int options)
     80 {
     81 int length = -1;
     82 BOOL utf8 = (options & PCRE_UTF8) != 0;
     83 BOOL had_recurse = FALSE;
     84 register int branchlength = 0;
     85 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
     86 
     87 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
     88 
     89 /* Scan along the opcodes for this branch. If we get to the end of the
     90 branch, check the length against that of the other branches. */
     91 
     92 for (;;)
     93   {
     94   int d, min;
     95   uschar *cs, *ce;
     96   register int op = *cc;
     97 
     98   switch (op)
     99     {
    100     case OP_COND:
    101     case OP_SCOND:
    102 
    103     /* If there is only one branch in a condition, the implied branch has zero
    104     length, so we don't add anything. This covers the DEFINE "condition"
    105     automatically. */
    106 
    107     cs = cc + GET(cc, 1);
    108     if (*cs != OP_ALT)
    109       {
    110       cc = cs + 1 + LINK_SIZE;
    111       break;
    112       }
    113 
    114     /* Otherwise we can fall through and treat it the same as any other
    115     subpattern. */
    116 
    117     case OP_CBRA:
    118     case OP_SCBRA:
    119     case OP_BRA:
    120     case OP_SBRA:
    121     case OP_ONCE:
    122     d = find_minlength(cc, startcode, options);
    123     if (d < 0) return d;
    124     branchlength += d;
    125     do cc += GET(cc, 1); while (*cc == OP_ALT);
    126     cc += 1 + LINK_SIZE;
    127     break;
    128 
    129     /* Reached end of a branch; if it's a ket it is the end of a nested
    130     call. If it's ALT it is an alternation in a nested call. If it is
    131     END it's the end of the outer call. All can be handled by the same code. */
    132 
    133     case OP_ALT:
    134     case OP_KET:
    135     case OP_KETRMAX:
    136     case OP_KETRMIN:
    137     case OP_END:
    138     if (length < 0 || (!had_recurse && branchlength < length))
    139       length = branchlength;
    140     if (*cc != OP_ALT) return length;
    141     cc += 1 + LINK_SIZE;
    142     branchlength = 0;
    143     had_recurse = FALSE;
    144     break;
    145 
    146     /* Skip over assertive subpatterns */
    147 
    148     case OP_ASSERT:
    149     case OP_ASSERT_NOT:
    150     case OP_ASSERTBACK:
    151     case OP_ASSERTBACK_NOT:
    152     do cc += GET(cc, 1); while (*cc == OP_ALT);
    153     /* Fall through */
    154 
    155     /* Skip over things that don't match chars */
    156 
    157     case OP_REVERSE:
    158     case OP_CREF:
    159     case OP_NCREF:
    160     case OP_RREF:
    161     case OP_NRREF:
    162     case OP_DEF:
    163     case OP_OPT:
    164     case OP_CALLOUT:
    165     case OP_SOD:
    166     case OP_SOM:
    167     case OP_EOD:
    168     case OP_EODN:
    169     case OP_CIRC:
    170     case OP_DOLL:
    171     case OP_NOT_WORD_BOUNDARY:
    172     case OP_WORD_BOUNDARY:
    173     cc += _pcre_OP_lengths[*cc];
    174     break;
    175 
    176     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
    177 
    178     case OP_BRAZERO:
    179     case OP_BRAMINZERO:
    180     case OP_SKIPZERO:
    181     cc += _pcre_OP_lengths[*cc];
    182     do cc += GET(cc, 1); while (*cc == OP_ALT);
    183     cc += 1 + LINK_SIZE;
    184     break;
    185 
    186     /* Handle literal characters and + repetitions */
    187 
    188     case OP_CHAR:
    189     case OP_CHARNC:
    190     case OP_NOT:
    191     case OP_PLUS:
    192     case OP_MINPLUS:
    193     case OP_POSPLUS:
    194     case OP_NOTPLUS:
    195     case OP_NOTMINPLUS:
    196     case OP_NOTPOSPLUS:
    197     branchlength++;
    198     cc += 2;
    199 #ifdef SUPPORT_UTF8
    200     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
    201 #endif
    202     break;
    203 
    204     case OP_TYPEPLUS:
    205     case OP_TYPEMINPLUS:
    206     case OP_TYPEPOSPLUS:
    207     branchlength++;
    208     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
    209     break;
    210 
    211     /* Handle exact repetitions. The count is already in characters, but we
    212     need to skip over a multibyte character in UTF8 mode.  */
    213 
    214     case OP_EXACT:
    215     case OP_NOTEXACT:
    216     branchlength += GET2(cc,1);
    217     cc += 4;
    218 #ifdef SUPPORT_UTF8
    219     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
    220 #endif
    221     break;
    222 
    223     case OP_TYPEEXACT:
    224     branchlength += GET2(cc,1);
    225     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
    226     break;
    227 
    228     /* Handle single-char non-literal matchers */
    229 
    230     case OP_PROP:
    231     case OP_NOTPROP:
    232     cc += 2;
    233     /* Fall through */
    234 
    235     case OP_NOT_DIGIT:
    236     case OP_DIGIT:
    237     case OP_NOT_WHITESPACE:
    238     case OP_WHITESPACE:
    239     case OP_NOT_WORDCHAR:
    240     case OP_WORDCHAR:
    241     case OP_ANY:
    242     case OP_ALLANY:
    243     case OP_EXTUNI:
    244     case OP_HSPACE:
    245     case OP_NOT_HSPACE:
    246     case OP_VSPACE:
    247     case OP_NOT_VSPACE:
    248     branchlength++;
    249     cc++;
    250     break;
    251 
    252     /* "Any newline" might match two characters */
    253 
    254     case OP_ANYNL:
    255     branchlength += 2;
    256     cc++;
    257     break;
    258 
    259     /* The single-byte matcher means we can't proceed in UTF-8 mode */
    260 
    261     case OP_ANYBYTE:
    262 #ifdef SUPPORT_UTF8
    263     if (utf8) return -1;
    264 #endif
    265     branchlength++;
    266     cc++;
    267     break;
    268 
    269     /* For repeated character types, we have to test for \p and \P, which have
    270     an extra two bytes of parameters. */
    271 
    272     case OP_TYPESTAR:
    273     case OP_TYPEMINSTAR:
    274     case OP_TYPEQUERY:
    275     case OP_TYPEMINQUERY:
    276     case OP_TYPEPOSSTAR:
    277     case OP_TYPEPOSQUERY:
    278     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
    279     cc += _pcre_OP_lengths[op];
    280     break;
    281 
    282     case OP_TYPEUPTO:
    283     case OP_TYPEMINUPTO:
    284     case OP_TYPEPOSUPTO:
    285     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
    286     cc += _pcre_OP_lengths[op];
    287     break;
    288 
    289     /* Check a class for variable quantification */
    290 
    291 #ifdef SUPPORT_UTF8
    292     case OP_XCLASS:
    293     cc += GET(cc, 1) - 33;
    294     /* Fall through */
    295 #endif
    296 
    297     case OP_CLASS:
    298     case OP_NCLASS:
    299     cc += 33;
    300 
    301     switch (*cc)
    302       {
    303       case OP_CRPLUS:
    304       case OP_CRMINPLUS:
    305       branchlength++;
    306       /* Fall through */
    307 
    308       case OP_CRSTAR:
    309       case OP_CRMINSTAR:
    310       case OP_CRQUERY:
    311       case OP_CRMINQUERY:
    312       cc++;
    313       break;
    314 
    315       case OP_CRRANGE:
    316       case OP_CRMINRANGE:
    317       branchlength += GET2(cc,1);
    318       cc += 5;
    319       break;
    320 
    321       default:
    322       branchlength++;
    323       break;
    324       }
    325     break;
    326 
    327     /* Backreferences and subroutine calls are treated in the same way: we find
    328     the minimum length for the subpattern. A recursion, however, causes an
    329     a flag to be set that causes the length of this branch to be ignored. The
    330     logic is that a recursion can only make sense if there is another
    331     alternation that stops the recursing. That will provide the minimum length
    332     (when no recursion happens). A backreference within the group that it is
    333     referencing behaves in the same way.
    334 
    335     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
    336     matches an empty string (by default it causes a matching failure), so in
    337     that case we must set the minimum length to zero. */
    338 
    339     case OP_REF:
    340     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
    341       {
    342       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
    343       if (cs == NULL) return -2;
    344       do ce += GET(ce, 1); while (*ce == OP_ALT);
    345       if (cc > cs && cc < ce)
    346         {
    347         d = 0;
    348         had_recurse = TRUE;
    349         }
    350       else d = find_minlength(cs, startcode, options);
    351       }
    352     else d = 0;
    353     cc += 3;
    354 
    355     /* Handle repeated back references */
    356 
    357     switch (*cc)
    358       {
    359       case OP_CRSTAR:
    360       case OP_CRMINSTAR:
    361       case OP_CRQUERY:
    362       case OP_CRMINQUERY:
    363       min = 0;
    364       cc++;
    365       break;
    366 
    367       case OP_CRRANGE:
    368       case OP_CRMINRANGE:
    369       min = GET2(cc, 1);
    370       cc += 5;
    371       break;
    372 
    373       default:
    374       min = 1;
    375       break;
    376       }
    377 
    378     branchlength += min * d;
    379     break;
    380 
    381     case OP_RECURSE:
    382     cs = ce = (uschar *)startcode + GET(cc, 1);
    383     if (cs == NULL) return -2;
    384     do ce += GET(ce, 1); while (*ce == OP_ALT);
    385     if (cc > cs && cc < ce)
    386       had_recurse = TRUE;
    387     else
    388       branchlength += find_minlength(cs, startcode, options);
    389     cc += 1 + LINK_SIZE;
    390     break;
    391 
    392     /* Anything else does not or need not match a character. We can get the
    393     item's length from the table, but for those that can match zero occurrences
    394     of a character, we must take special action for UTF-8 characters. */
    395 
    396     case OP_UPTO:
    397     case OP_NOTUPTO:
    398     case OP_MINUPTO:
    399     case OP_NOTMINUPTO:
    400     case OP_POSUPTO:
    401     case OP_STAR:
    402     case OP_MINSTAR:
    403     case OP_NOTMINSTAR:
    404     case OP_POSSTAR:
    405     case OP_NOTPOSSTAR:
    406     case OP_QUERY:
    407     case OP_MINQUERY:
    408     case OP_NOTMINQUERY:
    409     case OP_POSQUERY:
    410     case OP_NOTPOSQUERY:
    411     cc += _pcre_OP_lengths[op];
    412 #ifdef SUPPORT_UTF8
    413     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
    414 #endif
    415     break;
    416 
    417     /* Skip these, but we need to add in the name length. */
    418 
    419     case OP_MARK:
    420     case OP_PRUNE_ARG:
    421     case OP_SKIP_ARG:
    422     cc += _pcre_OP_lengths[op] + cc[1];
    423     break;
    424 
    425     case OP_THEN_ARG:
    426     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
    427     break;
    428 
    429     /* For the record, these are the opcodes that are matched by "default":
    430     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
    431     OP_THEN. */
    432 
    433     default:
    434     cc += _pcre_OP_lengths[op];
    435     break;
    436     }
    437   }
    438 /* Control never gets here */
    439 }
    440 
    441 
    442 
    443 /*************************************************
    444 *      Set a bit and maybe its alternate case    *
    445 *************************************************/
    446 
    447 /* Given a character, set its first byte's bit in the table, and also the
    448 corresponding bit for the other version of a letter if we are caseless. In
    449 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
    450 when Unicode property support is available.
    451 
    452 Arguments:
    453   start_bits    points to the bit map
    454   p             points to the character
    455   caseless      the caseless flag
    456   cd            the block with char table pointers
    457   utf8          TRUE for UTF-8 mode
    458 
    459 Returns:        pointer after the character
    460 */
    461 
    462 static const uschar *
    463 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
    464   compile_data *cd, BOOL utf8)
    465 {
    466 unsigned int c = *p;
    467 
    468 SET_BIT(c);
    469 
    470 #ifdef SUPPORT_UTF8
    471 if (utf8 && c > 127)
    472   {
    473   GETCHARINC(c, p);
    474 #ifdef SUPPORT_UCP
    475   if (caseless)
    476     {
    477     uschar buff[8];
    478     c = UCD_OTHERCASE(c);
    479     (void)_pcre_ord2utf8(c, buff);
    480     SET_BIT(buff[0]);
    481     }
    482 #endif
    483   return p;
    484   }
    485 #endif
    486 
    487 /* Not UTF-8 mode, or character is less than 127. */
    488 
    489 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
    490 return p + 1;
    491 }
    492 
    493 
    494 
    495 /*************************************************
    496 *     Set bits for a positive character type     *
    497 *************************************************/
    498 
    499 /* This function sets starting bits for a character type. In UTF-8 mode, we can
    500 only do a direct setting for bytes less than 128, as otherwise there can be
    501 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
    502 environment, the tables will only recognize ASCII characters anyway, but in at
    503 least one Windows environment, some higher bytes bits were set in the tables.
    504 So we deal with that case by considering the UTF-8 encoding.
    505 
    506 Arguments:
    507   start_bits     the starting bitmap
    508   cbit type      the type of character wanted
    509   table_limit    32 for non-UTF-8; 16 for UTF-8
    510   cd             the block with char table pointers
    511 
    512 Returns:         nothing
    513 */
    514 
    515 static void
    516 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
    517   compile_data *cd)
    518 {
    519 register int c;
    520 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
    521 if (table_limit == 32) return;
    522 for (c = 128; c < 256; c++)
    523   {
    524   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
    525     {
    526     uschar buff[8];
    527     (void)_pcre_ord2utf8(c, buff);
    528     SET_BIT(buff[0]);
    529     }
    530   }
    531 }
    532 
    533 
    534 /*************************************************
    535 *     Set bits for a negative character type     *
    536 *************************************************/
    537 
    538 /* This function sets starting bits for a negative character type such as \D.
    539 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
    540 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
    541 Unlike in the positive case, where we can set appropriate starting bits for
    542 specific high-valued UTF-8 characters, in this case we have to set the bits for
    543 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
    544 0xc0 (192) for simplicity.
    545 
    546 Arguments:
    547   start_bits     the starting bitmap
    548   cbit type      the type of character wanted
    549   table_limit    32 for non-UTF-8; 16 for UTF-8
    550   cd             the block with char table pointers
    551 
    552 Returns:         nothing
    553 */
    554 
    555 static void
    556 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
    557   compile_data *cd)
    558 {
    559 register int c;
    560 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
    561 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
    562 }
    563 
    564 
    565 
    566 /*************************************************
    567 *          Create bitmap of starting bytes       *
    568 *************************************************/
    569 
    570 /* This function scans a compiled unanchored expression recursively and
    571 attempts to build a bitmap of the set of possible starting bytes. As time goes
    572 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
    573 useful for parenthesized groups in patterns such as (a*)b where the group
    574 provides some optional starting bytes but scanning must continue at the outer
    575 level to find at least one mandatory byte. At the outermost level, this
    576 function fails unless the result is SSB_DONE.
    577 
    578 Arguments:
    579   code         points to an expression
    580   start_bits   points to a 32-byte table, initialized to 0
    581   caseless     the current state of the caseless flag
    582   utf8         TRUE if in UTF-8 mode
    583   cd           the block with char table pointers
    584 
    585 Returns:       SSB_FAIL     => Failed to find any starting bytes
    586                SSB_DONE     => Found mandatory starting bytes
    587                SSB_CONTINUE => Found optional starting bytes
    588 */
    589 
    590 static int
    591 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
    592   BOOL utf8, compile_data *cd)
    593 {
    594 register int c;
    595 int yield = SSB_DONE;
    596 int table_limit = utf8? 16:32;
    597 
    598 #if 0
    599 /* ========================================================================= */
    600 /* The following comment and code was inserted in January 1999. In May 2006,
    601 when it was observed to cause compiler warnings about unused values, I took it
    602 out again. If anybody is still using OS/2, they will have to put it back
    603 manually. */
    604 
    605 /* This next statement and the later reference to dummy are here in order to
    606 trick the optimizer of the IBM C compiler for OS/2 into generating correct
    607 code. Apparently IBM isn't going to fix the problem, and we would rather not
    608 disable optimization (in this module it actually makes a big difference, and
    609 the pcre module can use all the optimization it can get). */
    610 
    611 volatile int dummy;
    612 /* ========================================================================= */
    613 #endif
    614 
    615 do
    616   {
    617   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
    618   BOOL try_next = TRUE;
    619 
    620   while (try_next)    /* Loop for items in this branch */
    621     {
    622     int rc;
    623     switch(*tcode)
    624       {
    625       /* Fail if we reach something we don't understand */
    626 
    627       default:
    628       return SSB_FAIL;
    629 
    630       /* If we hit a bracket or a positive lookahead assertion, recurse to set
    631       bits from within the subpattern. If it can't find anything, we have to
    632       give up. If it finds some mandatory character(s), we are done for this
    633       branch. Otherwise, carry on scanning after the subpattern. */
    634 
    635       case OP_BRA:
    636       case OP_SBRA:
    637       case OP_CBRA:
    638       case OP_SCBRA:
    639       case OP_ONCE:
    640       case OP_ASSERT:
    641       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
    642       if (rc == SSB_FAIL) return SSB_FAIL;
    643       if (rc == SSB_DONE) try_next = FALSE; else
    644         {
    645         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
    646         tcode += 1 + LINK_SIZE;
    647         }
    648       break;
    649 
    650       /* If we hit ALT or KET, it means we haven't found anything mandatory in
    651       this branch, though we might have found something optional. For ALT, we
    652       continue with the next alternative, but we have to arrange that the final
    653       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
    654       return SSB_CONTINUE: if this is the top level, that indicates failure,
    655       but after a nested subpattern, it causes scanning to continue. */
    656 
    657       case OP_ALT:
    658       yield = SSB_CONTINUE;
    659       try_next = FALSE;
    660       break;
    661 
    662       case OP_KET:
    663       case OP_KETRMAX:
    664       case OP_KETRMIN:
    665       return SSB_CONTINUE;
    666 
    667       /* Skip over callout */
    668 
    669       case OP_CALLOUT:
    670       tcode += 2 + 2*LINK_SIZE;
    671       break;
    672 
    673       /* Skip over lookbehind and negative lookahead assertions */
    674 
    675       case OP_ASSERT_NOT:
    676       case OP_ASSERTBACK:
    677       case OP_ASSERTBACK_NOT:
    678       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
    679       tcode += 1 + LINK_SIZE;
    680       break;
    681 
    682       /* Skip over an option setting, changing the caseless flag */
    683 
    684       case OP_OPT:
    685       caseless = (tcode[1] & PCRE_CASELESS) != 0;
    686       tcode += 2;
    687       break;
    688 
    689       /* BRAZERO does the bracket, but carries on. */
    690 
    691       case OP_BRAZERO:
    692       case OP_BRAMINZERO:
    693       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
    694         return SSB_FAIL;
    695 /* =========================================================================
    696       See the comment at the head of this function concerning the next line,
    697       which was an old fudge for the benefit of OS/2.
    698       dummy = 1;
    699   ========================================================================= */
    700       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
    701       tcode += 1 + LINK_SIZE;
    702       break;
    703 
    704       /* SKIPZERO skips the bracket. */
    705 
    706       case OP_SKIPZERO:
    707       tcode++;
    708       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
    709       tcode += 1 + LINK_SIZE;
    710       break;
    711 
    712       /* Single-char * or ? sets the bit and tries the next item */
    713 
    714       case OP_STAR:
    715       case OP_MINSTAR:
    716       case OP_POSSTAR:
    717       case OP_QUERY:
    718       case OP_MINQUERY:
    719       case OP_POSQUERY:
    720       tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
    721       break;
    722 
    723       /* Single-char upto sets the bit and tries the next */
    724 
    725       case OP_UPTO:
    726       case OP_MINUPTO:
    727       case OP_POSUPTO:
    728       tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
    729       break;
    730 
    731       /* At least one single char sets the bit and stops */
    732 
    733       case OP_EXACT:       /* Fall through */
    734       tcode += 2;
    735 
    736       case OP_CHAR:
    737       case OP_CHARNC:
    738       case OP_PLUS:
    739       case OP_MINPLUS:
    740       case OP_POSPLUS:
    741       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
    742       try_next = FALSE;
    743       break;
    744 
    745       /* Special spacing and line-terminating items. These recognize specific
    746       lists of characters. The difference between VSPACE and ANYNL is that the
    747       latter can match the two-character CRLF sequence, but that is not
    748       relevant for finding the first character, so their code here is
    749       identical. */
    750 
    751       case OP_HSPACE:
    752       SET_BIT(0x09);
    753       SET_BIT(0x20);
    754       if (utf8)
    755         {
    756         SET_BIT(0xC2);  /* For U+00A0 */
    757         SET_BIT(0xE1);  /* For U+1680, U+180E */
    758         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
    759         SET_BIT(0xE3);  /* For U+3000 */
    760         }
    761       else SET_BIT(0xA0);
    762       try_next = FALSE;
    763       break;
    764 
    765       case OP_ANYNL:
    766       case OP_VSPACE:
    767       SET_BIT(0x0A);
    768       SET_BIT(0x0B);
    769       SET_BIT(0x0C);
    770       SET_BIT(0x0D);
    771       if (utf8)
    772         {
    773         SET_BIT(0xC2);  /* For U+0085 */
    774         SET_BIT(0xE2);  /* For U+2028, U+2029 */
    775         }
    776       else SET_BIT(0x85);
    777       try_next = FALSE;
    778       break;
    779 
    780       /* Single character types set the bits and stop. Note that if PCRE_UCP
    781       is set, we do not see these op codes because \d etc are converted to
    782       properties. Therefore, these apply in the case when only characters less
    783       than 256 are recognized to match the types. */
    784 
    785       case OP_NOT_DIGIT:
    786       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
    787       try_next = FALSE;
    788       break;
    789 
    790       case OP_DIGIT:
    791       set_type_bits(start_bits, cbit_digit, table_limit, cd);
    792       try_next = FALSE;
    793       break;
    794 
    795       /* The cbit_space table has vertical tab as whitespace; we have to
    796       ensure it is set as not whitespace. */
    797 
    798       case OP_NOT_WHITESPACE:
    799       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
    800       start_bits[1] |= 0x08;
    801       try_next = FALSE;
    802       break;
    803 
    804       /* The cbit_space table has vertical tab as whitespace; we have to
    805       not set it from the table. */
    806 
    807       case OP_WHITESPACE:
    808       c = start_bits[1];    /* Save in case it was already set */
    809       set_type_bits(start_bits, cbit_space, table_limit, cd);
    810       start_bits[1] = (start_bits[1] & ~0x08) | c;
    811       try_next = FALSE;
    812       break;
    813 
    814       case OP_NOT_WORDCHAR:
    815       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
    816       try_next = FALSE;
    817       break;
    818 
    819       case OP_WORDCHAR:
    820       set_type_bits(start_bits, cbit_word, table_limit, cd);
    821       try_next = FALSE;
    822       break;
    823 
    824       /* One or more character type fudges the pointer and restarts, knowing
    825       it will hit a single character type and stop there. */
    826 
    827       case OP_TYPEPLUS:
    828       case OP_TYPEMINPLUS:
    829       case OP_TYPEPOSPLUS:
    830       tcode++;
    831       break;
    832 
    833       case OP_TYPEEXACT:
    834       tcode += 3;
    835       break;
    836 
    837       /* Zero or more repeats of character types set the bits and then
    838       try again. */
    839 
    840       case OP_TYPEUPTO:
    841       case OP_TYPEMINUPTO:
    842       case OP_TYPEPOSUPTO:
    843       tcode += 2;               /* Fall through */
    844 
    845       case OP_TYPESTAR:
    846       case OP_TYPEMINSTAR:
    847       case OP_TYPEPOSSTAR:
    848       case OP_TYPEQUERY:
    849       case OP_TYPEMINQUERY:
    850       case OP_TYPEPOSQUERY:
    851       switch(tcode[1])
    852         {
    853         default:
    854         case OP_ANY:
    855         case OP_ALLANY:
    856         return SSB_FAIL;
    857 
    858         case OP_HSPACE:
    859         SET_BIT(0x09);
    860         SET_BIT(0x20);
    861         if (utf8)
    862           {
    863           SET_BIT(0xC2);  /* For U+00A0 */
    864           SET_BIT(0xE1);  /* For U+1680, U+180E */
    865           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
    866           SET_BIT(0xE3);  /* For U+3000 */
    867           }
    868         else SET_BIT(0xA0);
    869         break;
    870 
    871         case OP_ANYNL:
    872         case OP_VSPACE:
    873         SET_BIT(0x0A);
    874         SET_BIT(0x0B);
    875         SET_BIT(0x0C);
    876         SET_BIT(0x0D);
    877         if (utf8)
    878           {
    879           SET_BIT(0xC2);  /* For U+0085 */
    880           SET_BIT(0xE2);  /* For U+2028, U+2029 */
    881           }
    882         else SET_BIT(0x85);
    883         break;
    884 
    885         case OP_NOT_DIGIT:
    886         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
    887         break;
    888 
    889         case OP_DIGIT:
    890         set_type_bits(start_bits, cbit_digit, table_limit, cd);
    891         break;
    892 
    893         /* The cbit_space table has vertical tab as whitespace; we have to
    894         ensure it gets set as not whitespace. */
    895 
    896         case OP_NOT_WHITESPACE:
    897         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
    898         start_bits[1] |= 0x08;
    899         break;
    900 
    901         /* The cbit_space table has vertical tab as whitespace; we have to
    902         avoid setting it. */
    903 
    904         case OP_WHITESPACE:
    905         c = start_bits[1];    /* Save in case it was already set */
    906         set_type_bits(start_bits, cbit_space, table_limit, cd);
    907         start_bits[1] = (start_bits[1] & ~0x08) | c;
    908         break;
    909 
    910         case OP_NOT_WORDCHAR:
    911         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
    912         break;
    913 
    914         case OP_WORDCHAR:
    915         set_type_bits(start_bits, cbit_word, table_limit, cd);
    916         break;
    917         }
    918 
    919       tcode += 2;
    920       break;
    921 
    922       /* Character class where all the information is in a bit map: set the
    923       bits and either carry on or not, according to the repeat count. If it was
    924       a negative class, and we are operating with UTF-8 characters, any byte
    925       with a value >= 0xc4 is a potentially valid starter because it starts a
    926       character with a value > 255. */
    927 
    928       case OP_NCLASS:
    929 #ifdef SUPPORT_UTF8
    930       if (utf8)
    931         {
    932         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
    933         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
    934         }
    935 #endif
    936       /* Fall through */
    937 
    938       case OP_CLASS:
    939         {
    940         tcode++;
    941 
    942         /* In UTF-8 mode, the bits in a bit map correspond to character
    943         values, not to byte values. However, the bit map we are constructing is
    944         for byte values. So we have to do a conversion for characters whose
    945         value is > 127. In fact, there are only two possible starting bytes for
    946         characters in the range 128 - 255. */
    947 
    948 #ifdef SUPPORT_UTF8
    949         if (utf8)
    950           {
    951           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
    952           for (c = 128; c < 256; c++)
    953             {
    954             if ((tcode[c/8] && (1 << (c&7))) != 0)
    955               {
    956               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
    957               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
    958               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
    959               }
    960             }
    961           }
    962 
    963         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
    964 
    965         else
    966 #endif
    967           {
    968           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
    969           }
    970 
    971         /* Advance past the bit map, and act on what follows */
    972 
    973         tcode += 32;
    974         switch (*tcode)
    975           {
    976           case OP_CRSTAR:
    977           case OP_CRMINSTAR:
    978           case OP_CRQUERY:
    979           case OP_CRMINQUERY:
    980           tcode++;
    981           break;
    982 
    983           case OP_CRRANGE:
    984           case OP_CRMINRANGE:
    985           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
    986             else try_next = FALSE;
    987           break;
    988 
    989           default:
    990           try_next = FALSE;
    991           break;
    992           }
    993         }
    994       break; /* End of bitmap class handling */
    995 
    996       }      /* End of switch */
    997     }        /* End of try_next loop */
    998 
    999   code += GET(code, 1);   /* Advance to next branch */
   1000   }
   1001 while (*code == OP_ALT);
   1002 return yield;
   1003 }
   1004 
   1005 
   1006 
   1007 /*************************************************
   1008 *          Study a compiled expression           *
   1009 *************************************************/
   1010 
   1011 /* This function is handed a compiled expression that it must study to produce
   1012 information that will speed up the matching. It returns a pcre_extra block
   1013 which then gets handed back to pcre_exec().
   1014 
   1015 Arguments:
   1016   re        points to the compiled expression
   1017   options   contains option bits
   1018   errorptr  points to where to place error messages;
   1019             set NULL unless error
   1020 
   1021 Returns:    pointer to a pcre_extra block, with study_data filled in and the
   1022               appropriate flags set;
   1023             NULL on error or if no optimization possible
   1024 */
   1025 
   1026 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
   1027 pcre_study(const pcre *external_re, int options, const char **errorptr)
   1028 {
   1029 int min;
   1030 BOOL bits_set = FALSE;
   1031 uschar start_bits[32];
   1032 pcre_extra *extra;
   1033 pcre_study_data *study;
   1034 const uschar *tables;
   1035 uschar *code;
   1036 compile_data compile_block;
   1037 const real_pcre *re = (const real_pcre *)external_re;
   1038 
   1039 *errorptr = NULL;
   1040 
   1041 if (re == NULL || re->magic_number != MAGIC_NUMBER)
   1042   {
   1043   *errorptr = "argument is not a compiled regular expression";
   1044   return NULL;
   1045   }
   1046 
   1047 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
   1048   {
   1049   *errorptr = "unknown or incorrect option bit(s) set";
   1050   return NULL;
   1051   }
   1052 
   1053 code = (uschar *)re + re->name_table_offset +
   1054   (re->name_count * re->name_entry_size);
   1055 
   1056 /* For an anchored pattern, or an unanchored pattern that has a first char, or
   1057 a multiline pattern that matches only at "line starts", there is no point in
   1058 seeking a list of starting bytes. */
   1059 
   1060 if ((re->options & PCRE_ANCHORED) == 0 &&
   1061     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
   1062   {
   1063   /* Set the character tables in the block that is passed around */
   1064 
   1065   tables = re->tables;
   1066   if (tables == NULL)
   1067     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
   1068     (void *)(&tables));
   1069 
   1070   compile_block.lcc = tables + lcc_offset;
   1071   compile_block.fcc = tables + fcc_offset;
   1072   compile_block.cbits = tables + cbits_offset;
   1073   compile_block.ctypes = tables + ctypes_offset;
   1074 
   1075   /* See if we can find a fixed set of initial characters for the pattern. */
   1076 
   1077   memset(start_bits, 0, 32 * sizeof(uschar));
   1078   bits_set = set_start_bits(code, start_bits,
   1079     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
   1080     &compile_block) == SSB_DONE;
   1081   }
   1082 
   1083 /* Find the minimum length of subject string. */
   1084 
   1085 min = find_minlength(code, code, re->options);
   1086 
   1087 /* Return NULL if no optimization is possible. */
   1088 
   1089 if (!bits_set && min < 0) return NULL;
   1090 
   1091 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
   1092 the latter, which is pointed to by the former, which may also get additional
   1093 data set later by the calling program. At the moment, the size of
   1094 pcre_study_data is fixed. We nevertheless save it in a field for returning via
   1095 the pcre_fullinfo() function so that if it becomes variable in the future, we
   1096 don't have to change that code. */
   1097 
   1098 extra = (pcre_extra *)(pcre_malloc)
   1099   (sizeof(pcre_extra) + sizeof(pcre_study_data));
   1100 
   1101 if (extra == NULL)
   1102   {
   1103   *errorptr = "failed to get memory";
   1104   return NULL;
   1105   }
   1106 
   1107 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
   1108 extra->flags = PCRE_EXTRA_STUDY_DATA;
   1109 extra->study_data = study;
   1110 
   1111 study->size = sizeof(pcre_study_data);
   1112 study->flags = 0;
   1113 
   1114 if (bits_set)
   1115   {
   1116   study->flags |= PCRE_STUDY_MAPPED;
   1117   memcpy(study->start_bits, start_bits, sizeof(start_bits));
   1118   }
   1119 
   1120 if (min >= 0)
   1121   {
   1122   study->flags |= PCRE_STUDY_MINLEN;
   1123   study->minlength = min;
   1124   }
   1125 
   1126 return extra;
   1127 }
   1128 
   1129 /* End of pcre_study.c */
   1130