1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Copyright (c) 1997-2012 University of Cambridge 10 11 ----------------------------------------------------------------------------- 12 Redistribution and use in source and binary forms, with or without 13 modification, are permitted provided that the following conditions are met: 14 15 * Redistributions of source code must retain the above copyright notice, 16 this list of conditions and the following disclaimer. 17 18 * Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in the 20 documentation and/or other materials provided with the distribution. 21 22 * Neither the name of the University of Cambridge nor the names of its 23 contributors may be used to endorse or promote products derived from 24 this software without specific prior written permission. 25 26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 POSSIBILITY OF SUCH DAMAGE. 37 ----------------------------------------------------------------------------- 38 */ 39 40 41 /* This module contains the external function pcre_study(), along with local 42 supporting functions. */ 43 44 45 #ifdef HAVE_CONFIG_H 46 #include "config.h" 47 #endif 48 49 #include "pcre_internal.h" 50 51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7)) 52 53 /* Returns from set_start_bits() */ 54 55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; 56 57 58 59 /************************************************* 60 * Find the minimum subject length for a group * 61 *************************************************/ 62 63 /* Scan a parenthesized group and compute the minimum length of subject that 64 is needed to match it. This is a lower bound; it does not mean there is a 65 string of that length that matches. In UTF8 mode, the result is in characters 66 rather than bytes. 67 68 Arguments: 69 re compiled pattern block 70 code pointer to start of group (the bracket) 71 startcode pointer to start of the whole pattern's code 72 options the compiling options 73 recurses chain of recurse_check to catch mutual recursion 74 countptr pointer to call count (to catch over complexity) 75 76 Returns: the minimum length 77 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered 78 -2 internal error (missing capturing bracket) 79 -3 internal error (opcode not listed) 80 */ 81 82 static int 83 find_minlength(const REAL_PCRE *re, const pcre_uchar *code, 84 const pcre_uchar *startcode, int options, recurse_check *recurses, 85 int *countptr) 86 { 87 int length = -1; 88 /* PCRE_UTF16 has the same value as PCRE_UTF8. */ 89 BOOL utf = (options & PCRE_UTF8) != 0; 90 BOOL had_recurse = FALSE; 91 recurse_check this_recurse; 92 register int branchlength = 0; 93 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; 94 95 if ((*countptr)++ > 1000) return -1; /* too complex */ 96 97 if (*code == OP_CBRA || *code == OP_SCBRA || 98 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; 99 100 /* Scan along the opcodes for this branch. If we get to the end of the 101 branch, check the length against that of the other branches. */ 102 103 for (;;) 104 { 105 int d, min; 106 pcre_uchar *cs, *ce; 107 register pcre_uchar op = *cc; 108 109 switch (op) 110 { 111 case OP_COND: 112 case OP_SCOND: 113 114 /* If there is only one branch in a condition, the implied branch has zero 115 length, so we don't add anything. This covers the DEFINE "condition" 116 automatically. */ 117 118 cs = cc + GET(cc, 1); 119 if (*cs != OP_ALT) 120 { 121 cc = cs + 1 + LINK_SIZE; 122 break; 123 } 124 125 /* Otherwise we can fall through and treat it the same as any other 126 subpattern. */ 127 128 case OP_CBRA: 129 case OP_SCBRA: 130 case OP_BRA: 131 case OP_SBRA: 132 case OP_CBRAPOS: 133 case OP_SCBRAPOS: 134 case OP_BRAPOS: 135 case OP_SBRAPOS: 136 case OP_ONCE: 137 case OP_ONCE_NC: 138 d = find_minlength(re, cc, startcode, options, recurses, countptr); 139 if (d < 0) return d; 140 branchlength += d; 141 do cc += GET(cc, 1); while (*cc == OP_ALT); 142 cc += 1 + LINK_SIZE; 143 break; 144 145 /* ACCEPT makes things far too complicated; we have to give up. */ 146 147 case OP_ACCEPT: 148 case OP_ASSERT_ACCEPT: 149 return -1; 150 151 /* Reached end of a branch; if it's a ket it is the end of a nested 152 call. If it's ALT it is an alternation in a nested call. If it is END it's 153 the end of the outer call. All can be handled by the same code. If an 154 ACCEPT was previously encountered, use the length that was in force at that 155 time, and pass back the shortest ACCEPT length. */ 156 157 case OP_ALT: 158 case OP_KET: 159 case OP_KETRMAX: 160 case OP_KETRMIN: 161 case OP_KETRPOS: 162 case OP_END: 163 if (length < 0 || (!had_recurse && branchlength < length)) 164 length = branchlength; 165 if (op != OP_ALT) return length; 166 cc += 1 + LINK_SIZE; 167 branchlength = 0; 168 had_recurse = FALSE; 169 break; 170 171 /* Skip over assertive subpatterns */ 172 173 case OP_ASSERT: 174 case OP_ASSERT_NOT: 175 case OP_ASSERTBACK: 176 case OP_ASSERTBACK_NOT: 177 do cc += GET(cc, 1); while (*cc == OP_ALT); 178 /* Fall through */ 179 180 /* Skip over things that don't match chars */ 181 182 case OP_REVERSE: 183 case OP_CREF: 184 case OP_DNCREF: 185 case OP_RREF: 186 case OP_DNRREF: 187 case OP_DEF: 188 case OP_CALLOUT: 189 case OP_SOD: 190 case OP_SOM: 191 case OP_EOD: 192 case OP_EODN: 193 case OP_CIRC: 194 case OP_CIRCM: 195 case OP_DOLL: 196 case OP_DOLLM: 197 case OP_NOT_WORD_BOUNDARY: 198 case OP_WORD_BOUNDARY: 199 cc += PRIV(OP_lengths)[*cc]; 200 break; 201 202 /* Skip over a subpattern that has a {0} or {0,x} quantifier */ 203 204 case OP_BRAZERO: 205 case OP_BRAMINZERO: 206 case OP_BRAPOSZERO: 207 case OP_SKIPZERO: 208 cc += PRIV(OP_lengths)[*cc]; 209 do cc += GET(cc, 1); while (*cc == OP_ALT); 210 cc += 1 + LINK_SIZE; 211 break; 212 213 /* Handle literal characters and + repetitions */ 214 215 case OP_CHAR: 216 case OP_CHARI: 217 case OP_NOT: 218 case OP_NOTI: 219 case OP_PLUS: 220 case OP_PLUSI: 221 case OP_MINPLUS: 222 case OP_MINPLUSI: 223 case OP_POSPLUS: 224 case OP_POSPLUSI: 225 case OP_NOTPLUS: 226 case OP_NOTPLUSI: 227 case OP_NOTMINPLUS: 228 case OP_NOTMINPLUSI: 229 case OP_NOTPOSPLUS: 230 case OP_NOTPOSPLUSI: 231 branchlength++; 232 cc += 2; 233 #ifdef SUPPORT_UTF 234 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 235 #endif 236 break; 237 238 case OP_TYPEPLUS: 239 case OP_TYPEMINPLUS: 240 case OP_TYPEPOSPLUS: 241 branchlength++; 242 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; 243 break; 244 245 /* Handle exact repetitions. The count is already in characters, but we 246 need to skip over a multibyte character in UTF8 mode. */ 247 248 case OP_EXACT: 249 case OP_EXACTI: 250 case OP_NOTEXACT: 251 case OP_NOTEXACTI: 252 branchlength += GET2(cc,1); 253 cc += 2 + IMM2_SIZE; 254 #ifdef SUPPORT_UTF 255 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 256 #endif 257 break; 258 259 case OP_TYPEEXACT: 260 branchlength += GET2(cc,1); 261 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP 262 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 263 break; 264 265 /* Handle single-char non-literal matchers */ 266 267 case OP_PROP: 268 case OP_NOTPROP: 269 cc += 2; 270 /* Fall through */ 271 272 case OP_NOT_DIGIT: 273 case OP_DIGIT: 274 case OP_NOT_WHITESPACE: 275 case OP_WHITESPACE: 276 case OP_NOT_WORDCHAR: 277 case OP_WORDCHAR: 278 case OP_ANY: 279 case OP_ALLANY: 280 case OP_EXTUNI: 281 case OP_HSPACE: 282 case OP_NOT_HSPACE: 283 case OP_VSPACE: 284 case OP_NOT_VSPACE: 285 branchlength++; 286 cc++; 287 break; 288 289 /* "Any newline" might match two characters, but it also might match just 290 one. */ 291 292 case OP_ANYNL: 293 branchlength += 1; 294 cc++; 295 break; 296 297 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In 298 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever 299 appear, but leave the code, just in case.) */ 300 301 case OP_ANYBYTE: 302 #ifdef SUPPORT_UTF 303 if (utf) return -1; 304 #endif 305 branchlength++; 306 cc++; 307 break; 308 309 /* For repeated character types, we have to test for \p and \P, which have 310 an extra two bytes of parameters. */ 311 312 case OP_TYPESTAR: 313 case OP_TYPEMINSTAR: 314 case OP_TYPEQUERY: 315 case OP_TYPEMINQUERY: 316 case OP_TYPEPOSSTAR: 317 case OP_TYPEPOSQUERY: 318 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; 319 cc += PRIV(OP_lengths)[op]; 320 break; 321 322 case OP_TYPEUPTO: 323 case OP_TYPEMINUPTO: 324 case OP_TYPEPOSUPTO: 325 if (cc[1 + IMM2_SIZE] == OP_PROP 326 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; 327 cc += PRIV(OP_lengths)[op]; 328 break; 329 330 /* Check a class for variable quantification */ 331 332 case OP_CLASS: 333 case OP_NCLASS: 334 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32 335 case OP_XCLASS: 336 /* The original code caused an unsigned overflow in 64 bit systems, 337 so now we use a conditional statement. */ 338 if (op == OP_XCLASS) 339 cc += GET(cc, 1); 340 else 341 cc += PRIV(OP_lengths)[OP_CLASS]; 342 #else 343 cc += PRIV(OP_lengths)[OP_CLASS]; 344 #endif 345 346 switch (*cc) 347 { 348 case OP_CRPLUS: 349 case OP_CRMINPLUS: 350 case OP_CRPOSPLUS: 351 branchlength++; 352 /* Fall through */ 353 354 case OP_CRSTAR: 355 case OP_CRMINSTAR: 356 case OP_CRQUERY: 357 case OP_CRMINQUERY: 358 case OP_CRPOSSTAR: 359 case OP_CRPOSQUERY: 360 cc++; 361 break; 362 363 case OP_CRRANGE: 364 case OP_CRMINRANGE: 365 case OP_CRPOSRANGE: 366 branchlength += GET2(cc,1); 367 cc += 1 + 2 * IMM2_SIZE; 368 break; 369 370 default: 371 branchlength++; 372 break; 373 } 374 break; 375 376 /* Backreferences and subroutine calls are treated in the same way: we find 377 the minimum length for the subpattern. A recursion, however, causes an 378 a flag to be set that causes the length of this branch to be ignored. The 379 logic is that a recursion can only make sense if there is another 380 alternation that stops the recursing. That will provide the minimum length 381 (when no recursion happens). A backreference within the group that it is 382 referencing behaves in the same way. 383 384 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket 385 matches an empty string (by default it causes a matching failure), so in 386 that case we must set the minimum length to zero. */ 387 388 case OP_DNREF: /* Duplicate named pattern back reference */ 389 case OP_DNREFI: 390 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) 391 { 392 int count = GET2(cc, 1+IMM2_SIZE); 393 pcre_uchar *slot = (pcre_uchar *)re + 394 re->name_table_offset + GET2(cc, 1) * re->name_entry_size; 395 d = INT_MAX; 396 while (count-- > 0) 397 { 398 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0)); 399 if (cs == NULL) return -2; 400 do ce += GET(ce, 1); while (*ce == OP_ALT); 401 if (cc > cs && cc < ce) /* Simple recursion */ 402 { 403 d = 0; 404 had_recurse = TRUE; 405 break; 406 } 407 else 408 { 409 recurse_check *r = recurses; 410 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 411 if (r != NULL) /* Mutual recursion */ 412 { 413 d = 0; 414 had_recurse = TRUE; 415 break; 416 } 417 else 418 { 419 int dd; 420 this_recurse.prev = recurses; 421 this_recurse.group = cs; 422 dd = find_minlength(re, cs, startcode, options, &this_recurse, 423 countptr); 424 if (dd < d) d = dd; 425 } 426 } 427 slot += re->name_entry_size; 428 } 429 } 430 else d = 0; 431 cc += 1 + 2*IMM2_SIZE; 432 goto REPEAT_BACK_REFERENCE; 433 434 case OP_REF: /* Single back reference */ 435 case OP_REFI: 436 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) 437 { 438 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); 439 if (cs == NULL) return -2; 440 do ce += GET(ce, 1); while (*ce == OP_ALT); 441 if (cc > cs && cc < ce) /* Simple recursion */ 442 { 443 d = 0; 444 had_recurse = TRUE; 445 } 446 else 447 { 448 recurse_check *r = recurses; 449 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 450 if (r != NULL) /* Mutual recursion */ 451 { 452 d = 0; 453 had_recurse = TRUE; 454 } 455 else 456 { 457 this_recurse.prev = recurses; 458 this_recurse.group = cs; 459 d = find_minlength(re, cs, startcode, options, &this_recurse, 460 countptr); 461 } 462 } 463 } 464 else d = 0; 465 cc += 1 + IMM2_SIZE; 466 467 /* Handle repeated back references */ 468 469 REPEAT_BACK_REFERENCE: 470 switch (*cc) 471 { 472 case OP_CRSTAR: 473 case OP_CRMINSTAR: 474 case OP_CRQUERY: 475 case OP_CRMINQUERY: 476 case OP_CRPOSSTAR: 477 case OP_CRPOSQUERY: 478 min = 0; 479 cc++; 480 break; 481 482 case OP_CRPLUS: 483 case OP_CRMINPLUS: 484 case OP_CRPOSPLUS: 485 min = 1; 486 cc++; 487 break; 488 489 case OP_CRRANGE: 490 case OP_CRMINRANGE: 491 case OP_CRPOSRANGE: 492 min = GET2(cc, 1); 493 cc += 1 + 2 * IMM2_SIZE; 494 break; 495 496 default: 497 min = 1; 498 break; 499 } 500 501 branchlength += min * d; 502 break; 503 504 /* We can easily detect direct recursion, but not mutual recursion. This is 505 caught by a recursion depth count. */ 506 507 case OP_RECURSE: 508 cs = ce = (pcre_uchar *)startcode + GET(cc, 1); 509 do ce += GET(ce, 1); while (*ce == OP_ALT); 510 if (cc > cs && cc < ce) /* Simple recursion */ 511 had_recurse = TRUE; 512 else 513 { 514 recurse_check *r = recurses; 515 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 516 if (r != NULL) /* Mutual recursion */ 517 had_recurse = TRUE; 518 else 519 { 520 this_recurse.prev = recurses; 521 this_recurse.group = cs; 522 branchlength += find_minlength(re, cs, startcode, options, 523 &this_recurse, countptr); 524 } 525 } 526 cc += 1 + LINK_SIZE; 527 break; 528 529 /* Anything else does not or need not match a character. We can get the 530 item's length from the table, but for those that can match zero occurrences 531 of a character, we must take special action for UTF-8 characters. As it 532 happens, the "NOT" versions of these opcodes are used at present only for 533 ASCII characters, so they could be omitted from this list. However, in 534 future that may change, so we include them here so as not to leave a 535 gotcha for a future maintainer. */ 536 537 case OP_UPTO: 538 case OP_UPTOI: 539 case OP_NOTUPTO: 540 case OP_NOTUPTOI: 541 case OP_MINUPTO: 542 case OP_MINUPTOI: 543 case OP_NOTMINUPTO: 544 case OP_NOTMINUPTOI: 545 case OP_POSUPTO: 546 case OP_POSUPTOI: 547 case OP_NOTPOSUPTO: 548 case OP_NOTPOSUPTOI: 549 550 case OP_STAR: 551 case OP_STARI: 552 case OP_NOTSTAR: 553 case OP_NOTSTARI: 554 case OP_MINSTAR: 555 case OP_MINSTARI: 556 case OP_NOTMINSTAR: 557 case OP_NOTMINSTARI: 558 case OP_POSSTAR: 559 case OP_POSSTARI: 560 case OP_NOTPOSSTAR: 561 case OP_NOTPOSSTARI: 562 563 case OP_QUERY: 564 case OP_QUERYI: 565 case OP_NOTQUERY: 566 case OP_NOTQUERYI: 567 case OP_MINQUERY: 568 case OP_MINQUERYI: 569 case OP_NOTMINQUERY: 570 case OP_NOTMINQUERYI: 571 case OP_POSQUERY: 572 case OP_POSQUERYI: 573 case OP_NOTPOSQUERY: 574 case OP_NOTPOSQUERYI: 575 576 cc += PRIV(OP_lengths)[op]; 577 #ifdef SUPPORT_UTF 578 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 579 #endif 580 break; 581 582 /* Skip these, but we need to add in the name length. */ 583 584 case OP_MARK: 585 case OP_PRUNE_ARG: 586 case OP_SKIP_ARG: 587 case OP_THEN_ARG: 588 cc += PRIV(OP_lengths)[op] + cc[1]; 589 break; 590 591 /* The remaining opcodes are just skipped over. */ 592 593 case OP_CLOSE: 594 case OP_COMMIT: 595 case OP_FAIL: 596 case OP_PRUNE: 597 case OP_SET_SOM: 598 case OP_SKIP: 599 case OP_THEN: 600 cc += PRIV(OP_lengths)[op]; 601 break; 602 603 /* This should not occur: we list all opcodes explicitly so that when 604 new ones get added they are properly considered. */ 605 606 default: 607 return -3; 608 } 609 } 610 /* Control never gets here */ 611 } 612 613 614 615 /************************************************* 616 * Set a bit and maybe its alternate case * 617 *************************************************/ 618 619 /* Given a character, set its first byte's bit in the table, and also the 620 corresponding bit for the other version of a letter if we are caseless. In 621 UTF-8 mode, for characters greater than 127, we can only do the caseless thing 622 when Unicode property support is available. 623 624 Arguments: 625 start_bits points to the bit map 626 p points to the character 627 caseless the caseless flag 628 cd the block with char table pointers 629 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode 630 631 Returns: pointer after the character 632 */ 633 634 static const pcre_uchar * 635 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, 636 compile_data *cd, BOOL utf) 637 { 638 pcre_uint32 c = *p; 639 640 #ifdef COMPILE_PCRE8 641 SET_BIT(c); 642 643 #ifdef SUPPORT_UTF 644 if (utf && c > 127) 645 { 646 GETCHARINC(c, p); 647 #ifdef SUPPORT_UCP 648 if (caseless) 649 { 650 pcre_uchar buff[6]; 651 c = UCD_OTHERCASE(c); 652 (void)PRIV(ord2utf)(c, buff); 653 SET_BIT(buff[0]); 654 } 655 #endif /* Not SUPPORT_UCP */ 656 return p; 657 } 658 #else /* Not SUPPORT_UTF */ 659 (void)(utf); /* Stops warning for unused parameter */ 660 #endif /* SUPPORT_UTF */ 661 662 /* Not UTF-8 mode, or character is less than 127. */ 663 664 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); 665 return p + 1; 666 #endif /* COMPILE_PCRE8 */ 667 668 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 669 if (c > 0xff) 670 { 671 c = 0xff; 672 caseless = FALSE; 673 } 674 SET_BIT(c); 675 676 #ifdef SUPPORT_UTF 677 if (utf && c > 127) 678 { 679 GETCHARINC(c, p); 680 #ifdef SUPPORT_UCP 681 if (caseless) 682 { 683 c = UCD_OTHERCASE(c); 684 if (c > 0xff) 685 c = 0xff; 686 SET_BIT(c); 687 } 688 #endif /* SUPPORT_UCP */ 689 return p; 690 } 691 #else /* Not SUPPORT_UTF */ 692 (void)(utf); /* Stops warning for unused parameter */ 693 #endif /* SUPPORT_UTF */ 694 695 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); 696 return p + 1; 697 #endif 698 } 699 700 701 702 /************************************************* 703 * Set bits for a positive character type * 704 *************************************************/ 705 706 /* This function sets starting bits for a character type. In UTF-8 mode, we can 707 only do a direct setting for bytes less than 128, as otherwise there can be 708 confusion with bytes in the middle of UTF-8 characters. In a "traditional" 709 environment, the tables will only recognize ASCII characters anyway, but in at 710 least one Windows environment, some higher bytes bits were set in the tables. 711 So we deal with that case by considering the UTF-8 encoding. 712 713 Arguments: 714 start_bits the starting bitmap 715 cbit type the type of character wanted 716 table_limit 32 for non-UTF-8; 16 for UTF-8 717 cd the block with char table pointers 718 719 Returns: nothing 720 */ 721 722 static void 723 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, 724 compile_data *cd) 725 { 726 register pcre_uint32 c; 727 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; 728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8 729 if (table_limit == 32) return; 730 for (c = 128; c < 256; c++) 731 { 732 if ((cd->cbits[c/8] & (1 << (c&7))) != 0) 733 { 734 pcre_uchar buff[6]; 735 (void)PRIV(ord2utf)(c, buff); 736 SET_BIT(buff[0]); 737 } 738 } 739 #endif 740 } 741 742 743 /************************************************* 744 * Set bits for a negative character type * 745 *************************************************/ 746 747 /* This function sets starting bits for a negative character type such as \D. 748 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as 749 otherwise there can be confusion with bytes in the middle of UTF-8 characters. 750 Unlike in the positive case, where we can set appropriate starting bits for 751 specific high-valued UTF-8 characters, in this case we have to set the bits for 752 all high-valued characters. The lowest is 0xc2, but we overkill by starting at 753 0xc0 (192) for simplicity. 754 755 Arguments: 756 start_bits the starting bitmap 757 cbit type the type of character wanted 758 table_limit 32 for non-UTF-8; 16 for UTF-8 759 cd the block with char table pointers 760 761 Returns: nothing 762 */ 763 764 static void 765 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit, 766 compile_data *cd) 767 { 768 register pcre_uint32 c; 769 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; 770 #if defined SUPPORT_UTF && defined COMPILE_PCRE8 771 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; 772 #endif 773 } 774 775 776 777 /************************************************* 778 * Create bitmap of starting bytes * 779 *************************************************/ 780 781 /* This function scans a compiled unanchored expression recursively and 782 attempts to build a bitmap of the set of possible starting bytes. As time goes 783 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is 784 useful for parenthesized groups in patterns such as (a*)b where the group 785 provides some optional starting bytes but scanning must continue at the outer 786 level to find at least one mandatory byte. At the outermost level, this 787 function fails unless the result is SSB_DONE. 788 789 Arguments: 790 code points to an expression 791 start_bits points to a 32-byte table, initialized to 0 792 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode 793 cd the block with char table pointers 794 795 Returns: SSB_FAIL => Failed to find any starting bytes 796 SSB_DONE => Found mandatory starting bytes 797 SSB_CONTINUE => Found optional starting bytes 798 SSB_UNKNOWN => Hit an unrecognized opcode 799 */ 800 801 static int 802 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, 803 compile_data *cd) 804 { 805 register pcre_uint32 c; 806 int yield = SSB_DONE; 807 #if defined SUPPORT_UTF && defined COMPILE_PCRE8 808 int table_limit = utf? 16:32; 809 #else 810 int table_limit = 32; 811 #endif 812 813 #if 0 814 /* ========================================================================= */ 815 /* The following comment and code was inserted in January 1999. In May 2006, 816 when it was observed to cause compiler warnings about unused values, I took it 817 out again. If anybody is still using OS/2, they will have to put it back 818 manually. */ 819 820 /* This next statement and the later reference to dummy are here in order to 821 trick the optimizer of the IBM C compiler for OS/2 into generating correct 822 code. Apparently IBM isn't going to fix the problem, and we would rather not 823 disable optimization (in this module it actually makes a big difference, and 824 the pcre module can use all the optimization it can get). */ 825 826 volatile int dummy; 827 /* ========================================================================= */ 828 #endif 829 830 do 831 { 832 BOOL try_next = TRUE; 833 const pcre_uchar *tcode = code + 1 + LINK_SIZE; 834 835 if (*code == OP_CBRA || *code == OP_SCBRA || 836 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; 837 838 while (try_next) /* Loop for items in this branch */ 839 { 840 int rc; 841 842 switch(*tcode) 843 { 844 /* If we reach something we don't understand, it means a new opcode has 845 been created that hasn't been added to this code. Hopefully this problem 846 will be discovered during testing. */ 847 848 default: 849 return SSB_UNKNOWN; 850 851 /* Fail for a valid opcode that implies no starting bits. */ 852 853 case OP_ACCEPT: 854 case OP_ASSERT_ACCEPT: 855 case OP_ALLANY: 856 case OP_ANY: 857 case OP_ANYBYTE: 858 case OP_CIRC: 859 case OP_CIRCM: 860 case OP_CLOSE: 861 case OP_COMMIT: 862 case OP_COND: 863 case OP_CREF: 864 case OP_DEF: 865 case OP_DNCREF: 866 case OP_DNREF: 867 case OP_DNREFI: 868 case OP_DNRREF: 869 case OP_DOLL: 870 case OP_DOLLM: 871 case OP_END: 872 case OP_EOD: 873 case OP_EODN: 874 case OP_EXTUNI: 875 case OP_FAIL: 876 case OP_MARK: 877 case OP_NOT: 878 case OP_NOTEXACT: 879 case OP_NOTEXACTI: 880 case OP_NOTI: 881 case OP_NOTMINPLUS: 882 case OP_NOTMINPLUSI: 883 case OP_NOTMINQUERY: 884 case OP_NOTMINQUERYI: 885 case OP_NOTMINSTAR: 886 case OP_NOTMINSTARI: 887 case OP_NOTMINUPTO: 888 case OP_NOTMINUPTOI: 889 case OP_NOTPLUS: 890 case OP_NOTPLUSI: 891 case OP_NOTPOSPLUS: 892 case OP_NOTPOSPLUSI: 893 case OP_NOTPOSQUERY: 894 case OP_NOTPOSQUERYI: 895 case OP_NOTPOSSTAR: 896 case OP_NOTPOSSTARI: 897 case OP_NOTPOSUPTO: 898 case OP_NOTPOSUPTOI: 899 case OP_NOTPROP: 900 case OP_NOTQUERY: 901 case OP_NOTQUERYI: 902 case OP_NOTSTAR: 903 case OP_NOTSTARI: 904 case OP_NOTUPTO: 905 case OP_NOTUPTOI: 906 case OP_NOT_HSPACE: 907 case OP_NOT_VSPACE: 908 case OP_PRUNE: 909 case OP_PRUNE_ARG: 910 case OP_RECURSE: 911 case OP_REF: 912 case OP_REFI: 913 case OP_REVERSE: 914 case OP_RREF: 915 case OP_SCOND: 916 case OP_SET_SOM: 917 case OP_SKIP: 918 case OP_SKIP_ARG: 919 case OP_SOD: 920 case OP_SOM: 921 case OP_THEN: 922 case OP_THEN_ARG: 923 return SSB_FAIL; 924 925 /* A "real" property test implies no starting bits, but the fake property 926 PT_CLIST identifies a list of characters. These lists are short, as they 927 are used for characters with more than one "other case", so there is no 928 point in recognizing them for OP_NOTPROP. */ 929 930 case OP_PROP: 931 if (tcode[1] != PT_CLIST) return SSB_FAIL; 932 { 933 const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2]; 934 while ((c = *p++) < NOTACHAR) 935 { 936 #if defined SUPPORT_UTF && defined COMPILE_PCRE8 937 if (utf) 938 { 939 pcre_uchar buff[6]; 940 (void)PRIV(ord2utf)(c, buff); 941 c = buff[0]; 942 } 943 #endif 944 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); 945 } 946 } 947 try_next = FALSE; 948 break; 949 950 /* We can ignore word boundary tests. */ 951 952 case OP_WORD_BOUNDARY: 953 case OP_NOT_WORD_BOUNDARY: 954 tcode++; 955 break; 956 957 /* If we hit a bracket or a positive lookahead assertion, recurse to set 958 bits from within the subpattern. If it can't find anything, we have to 959 give up. If it finds some mandatory character(s), we are done for this 960 branch. Otherwise, carry on scanning after the subpattern. */ 961 962 case OP_BRA: 963 case OP_SBRA: 964 case OP_CBRA: 965 case OP_SCBRA: 966 case OP_BRAPOS: 967 case OP_SBRAPOS: 968 case OP_CBRAPOS: 969 case OP_SCBRAPOS: 970 case OP_ONCE: 971 case OP_ONCE_NC: 972 case OP_ASSERT: 973 rc = set_start_bits(tcode, start_bits, utf, cd); 974 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 975 if (rc == SSB_DONE) try_next = FALSE; else 976 { 977 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 978 tcode += 1 + LINK_SIZE; 979 } 980 break; 981 982 /* If we hit ALT or KET, it means we haven't found anything mandatory in 983 this branch, though we might have found something optional. For ALT, we 984 continue with the next alternative, but we have to arrange that the final 985 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, 986 return SSB_CONTINUE: if this is the top level, that indicates failure, 987 but after a nested subpattern, it causes scanning to continue. */ 988 989 case OP_ALT: 990 yield = SSB_CONTINUE; 991 try_next = FALSE; 992 break; 993 994 case OP_KET: 995 case OP_KETRMAX: 996 case OP_KETRMIN: 997 case OP_KETRPOS: 998 return SSB_CONTINUE; 999 1000 /* Skip over callout */ 1001 1002 case OP_CALLOUT: 1003 tcode += 2 + 2*LINK_SIZE; 1004 break; 1005 1006 /* Skip over lookbehind and negative lookahead assertions */ 1007 1008 case OP_ASSERT_NOT: 1009 case OP_ASSERTBACK: 1010 case OP_ASSERTBACK_NOT: 1011 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 1012 tcode += 1 + LINK_SIZE; 1013 break; 1014 1015 /* BRAZERO does the bracket, but carries on. */ 1016 1017 case OP_BRAZERO: 1018 case OP_BRAMINZERO: 1019 case OP_BRAPOSZERO: 1020 rc = set_start_bits(++tcode, start_bits, utf, cd); 1021 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 1022 /* ========================================================================= 1023 See the comment at the head of this function concerning the next line, 1024 which was an old fudge for the benefit of OS/2. 1025 dummy = 1; 1026 ========================================================================= */ 1027 do tcode += GET(tcode,1); while (*tcode == OP_ALT); 1028 tcode += 1 + LINK_SIZE; 1029 break; 1030 1031 /* SKIPZERO skips the bracket. */ 1032 1033 case OP_SKIPZERO: 1034 tcode++; 1035 do tcode += GET(tcode,1); while (*tcode == OP_ALT); 1036 tcode += 1 + LINK_SIZE; 1037 break; 1038 1039 /* Single-char * or ? sets the bit and tries the next item */ 1040 1041 case OP_STAR: 1042 case OP_MINSTAR: 1043 case OP_POSSTAR: 1044 case OP_QUERY: 1045 case OP_MINQUERY: 1046 case OP_POSQUERY: 1047 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); 1048 break; 1049 1050 case OP_STARI: 1051 case OP_MINSTARI: 1052 case OP_POSSTARI: 1053 case OP_QUERYI: 1054 case OP_MINQUERYI: 1055 case OP_POSQUERYI: 1056 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); 1057 break; 1058 1059 /* Single-char upto sets the bit and tries the next */ 1060 1061 case OP_UPTO: 1062 case OP_MINUPTO: 1063 case OP_POSUPTO: 1064 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf); 1065 break; 1066 1067 case OP_UPTOI: 1068 case OP_MINUPTOI: 1069 case OP_POSUPTOI: 1070 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf); 1071 break; 1072 1073 /* At least one single char sets the bit and stops */ 1074 1075 case OP_EXACT: 1076 tcode += IMM2_SIZE; 1077 /* Fall through */ 1078 case OP_CHAR: 1079 case OP_PLUS: 1080 case OP_MINPLUS: 1081 case OP_POSPLUS: 1082 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); 1083 try_next = FALSE; 1084 break; 1085 1086 case OP_EXACTI: 1087 tcode += IMM2_SIZE; 1088 /* Fall through */ 1089 case OP_CHARI: 1090 case OP_PLUSI: 1091 case OP_MINPLUSI: 1092 case OP_POSPLUSI: 1093 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); 1094 try_next = FALSE; 1095 break; 1096 1097 /* Special spacing and line-terminating items. These recognize specific 1098 lists of characters. The difference between VSPACE and ANYNL is that the 1099 latter can match the two-character CRLF sequence, but that is not 1100 relevant for finding the first character, so their code here is 1101 identical. */ 1102 1103 case OP_HSPACE: 1104 SET_BIT(CHAR_HT); 1105 SET_BIT(CHAR_SPACE); 1106 #ifdef SUPPORT_UTF 1107 if (utf) 1108 { 1109 #ifdef COMPILE_PCRE8 1110 SET_BIT(0xC2); /* For U+00A0 */ 1111 SET_BIT(0xE1); /* For U+1680, U+180E */ 1112 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1113 SET_BIT(0xE3); /* For U+3000 */ 1114 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1115 SET_BIT(0xA0); 1116 SET_BIT(0xFF); /* For characters > 255 */ 1117 #endif /* COMPILE_PCRE[8|16|32] */ 1118 } 1119 else 1120 #endif /* SUPPORT_UTF */ 1121 { 1122 #ifndef EBCDIC 1123 SET_BIT(0xA0); 1124 #endif /* Not EBCDIC */ 1125 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1126 SET_BIT(0xFF); /* For characters > 255 */ 1127 #endif /* COMPILE_PCRE[16|32] */ 1128 } 1129 try_next = FALSE; 1130 break; 1131 1132 case OP_ANYNL: 1133 case OP_VSPACE: 1134 SET_BIT(CHAR_LF); 1135 SET_BIT(CHAR_VT); 1136 SET_BIT(CHAR_FF); 1137 SET_BIT(CHAR_CR); 1138 #ifdef SUPPORT_UTF 1139 if (utf) 1140 { 1141 #ifdef COMPILE_PCRE8 1142 SET_BIT(0xC2); /* For U+0085 */ 1143 SET_BIT(0xE2); /* For U+2028, U+2029 */ 1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1145 SET_BIT(CHAR_NEL); 1146 SET_BIT(0xFF); /* For characters > 255 */ 1147 #endif /* COMPILE_PCRE[8|16|32] */ 1148 } 1149 else 1150 #endif /* SUPPORT_UTF */ 1151 { 1152 SET_BIT(CHAR_NEL); 1153 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1154 SET_BIT(0xFF); /* For characters > 255 */ 1155 #endif 1156 } 1157 try_next = FALSE; 1158 break; 1159 1160 /* Single character types set the bits and stop. Note that if PCRE_UCP 1161 is set, we do not see these op codes because \d etc are converted to 1162 properties. Therefore, these apply in the case when only characters less 1163 than 256 are recognized to match the types. */ 1164 1165 case OP_NOT_DIGIT: 1166 set_nottype_bits(start_bits, cbit_digit, table_limit, cd); 1167 try_next = FALSE; 1168 break; 1169 1170 case OP_DIGIT: 1171 set_type_bits(start_bits, cbit_digit, table_limit, cd); 1172 try_next = FALSE; 1173 break; 1174 1175 /* The cbit_space table has vertical tab as whitespace; we no longer 1176 have to play fancy tricks because Perl added VT to its whitespace at 1177 release 5.18. PCRE added it at release 8.34. */ 1178 1179 case OP_NOT_WHITESPACE: 1180 set_nottype_bits(start_bits, cbit_space, table_limit, cd); 1181 try_next = FALSE; 1182 break; 1183 1184 case OP_WHITESPACE: 1185 set_type_bits(start_bits, cbit_space, table_limit, cd); 1186 try_next = FALSE; 1187 break; 1188 1189 case OP_NOT_WORDCHAR: 1190 set_nottype_bits(start_bits, cbit_word, table_limit, cd); 1191 try_next = FALSE; 1192 break; 1193 1194 case OP_WORDCHAR: 1195 set_type_bits(start_bits, cbit_word, table_limit, cd); 1196 try_next = FALSE; 1197 break; 1198 1199 /* One or more character type fudges the pointer and restarts, knowing 1200 it will hit a single character type and stop there. */ 1201 1202 case OP_TYPEPLUS: 1203 case OP_TYPEMINPLUS: 1204 case OP_TYPEPOSPLUS: 1205 tcode++; 1206 break; 1207 1208 case OP_TYPEEXACT: 1209 tcode += 1 + IMM2_SIZE; 1210 break; 1211 1212 /* Zero or more repeats of character types set the bits and then 1213 try again. */ 1214 1215 case OP_TYPEUPTO: 1216 case OP_TYPEMINUPTO: 1217 case OP_TYPEPOSUPTO: 1218 tcode += IMM2_SIZE; /* Fall through */ 1219 1220 case OP_TYPESTAR: 1221 case OP_TYPEMINSTAR: 1222 case OP_TYPEPOSSTAR: 1223 case OP_TYPEQUERY: 1224 case OP_TYPEMINQUERY: 1225 case OP_TYPEPOSQUERY: 1226 switch(tcode[1]) 1227 { 1228 default: 1229 case OP_ANY: 1230 case OP_ALLANY: 1231 return SSB_FAIL; 1232 1233 case OP_HSPACE: 1234 SET_BIT(CHAR_HT); 1235 SET_BIT(CHAR_SPACE); 1236 #ifdef SUPPORT_UTF 1237 if (utf) 1238 { 1239 #ifdef COMPILE_PCRE8 1240 SET_BIT(0xC2); /* For U+00A0 */ 1241 SET_BIT(0xE1); /* For U+1680, U+180E */ 1242 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1243 SET_BIT(0xE3); /* For U+3000 */ 1244 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1245 SET_BIT(0xA0); 1246 SET_BIT(0xFF); /* For characters > 255 */ 1247 #endif /* COMPILE_PCRE[8|16|32] */ 1248 } 1249 else 1250 #endif /* SUPPORT_UTF */ 1251 #ifndef EBCDIC 1252 SET_BIT(0xA0); 1253 #endif /* Not EBCDIC */ 1254 break; 1255 1256 case OP_ANYNL: 1257 case OP_VSPACE: 1258 SET_BIT(CHAR_LF); 1259 SET_BIT(CHAR_VT); 1260 SET_BIT(CHAR_FF); 1261 SET_BIT(CHAR_CR); 1262 #ifdef SUPPORT_UTF 1263 if (utf) 1264 { 1265 #ifdef COMPILE_PCRE8 1266 SET_BIT(0xC2); /* For U+0085 */ 1267 SET_BIT(0xE2); /* For U+2028, U+2029 */ 1268 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1269 SET_BIT(CHAR_NEL); 1270 SET_BIT(0xFF); /* For characters > 255 */ 1271 #endif /* COMPILE_PCRE16 */ 1272 } 1273 else 1274 #endif /* SUPPORT_UTF */ 1275 SET_BIT(CHAR_NEL); 1276 break; 1277 1278 case OP_NOT_DIGIT: 1279 set_nottype_bits(start_bits, cbit_digit, table_limit, cd); 1280 break; 1281 1282 case OP_DIGIT: 1283 set_type_bits(start_bits, cbit_digit, table_limit, cd); 1284 break; 1285 1286 /* The cbit_space table has vertical tab as whitespace; we no longer 1287 have to play fancy tricks because Perl added VT to its whitespace at 1288 release 5.18. PCRE added it at release 8.34. */ 1289 1290 case OP_NOT_WHITESPACE: 1291 set_nottype_bits(start_bits, cbit_space, table_limit, cd); 1292 break; 1293 1294 case OP_WHITESPACE: 1295 set_type_bits(start_bits, cbit_space, table_limit, cd); 1296 break; 1297 1298 case OP_NOT_WORDCHAR: 1299 set_nottype_bits(start_bits, cbit_word, table_limit, cd); 1300 break; 1301 1302 case OP_WORDCHAR: 1303 set_type_bits(start_bits, cbit_word, table_limit, cd); 1304 break; 1305 } 1306 1307 tcode += 2; 1308 break; 1309 1310 /* Character class where all the information is in a bit map: set the 1311 bits and either carry on or not, according to the repeat count. If it was 1312 a negative class, and we are operating with UTF-8 characters, any byte 1313 with a value >= 0xc4 is a potentially valid starter because it starts a 1314 character with a value > 255. */ 1315 1316 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 1317 case OP_XCLASS: 1318 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0) 1319 return SSB_FAIL; 1320 /* All bits are set. */ 1321 if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0) 1322 return SSB_FAIL; 1323 #endif 1324 /* Fall through */ 1325 1326 case OP_NCLASS: 1327 #if defined SUPPORT_UTF && defined COMPILE_PCRE8 1328 if (utf) 1329 { 1330 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ 1331 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ 1332 } 1333 #endif 1334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1335 SET_BIT(0xFF); /* For characters > 255 */ 1336 #endif 1337 /* Fall through */ 1338 1339 case OP_CLASS: 1340 { 1341 pcre_uint8 *map; 1342 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 1343 map = NULL; 1344 if (*tcode == OP_XCLASS) 1345 { 1346 if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0) 1347 map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1); 1348 tcode += GET(tcode, 1); 1349 } 1350 else 1351 #endif 1352 { 1353 tcode++; 1354 map = (pcre_uint8 *)tcode; 1355 tcode += 32 / sizeof(pcre_uchar); 1356 } 1357 1358 /* In UTF-8 mode, the bits in a bit map correspond to character 1359 values, not to byte values. However, the bit map we are constructing is 1360 for byte values. So we have to do a conversion for characters whose 1361 value is > 127. In fact, there are only two possible starting bytes for 1362 characters in the range 128 - 255. */ 1363 1364 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 1365 if (map != NULL) 1366 #endif 1367 { 1368 #if defined SUPPORT_UTF && defined COMPILE_PCRE8 1369 if (utf) 1370 { 1371 for (c = 0; c < 16; c++) start_bits[c] |= map[c]; 1372 for (c = 128; c < 256; c++) 1373 { 1374 if ((map[c/8] && (1 << (c&7))) != 0) 1375 { 1376 int d = (c >> 6) | 0xc0; /* Set bit for this starter */ 1377 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ 1378 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ 1379 } 1380 } 1381 } 1382 else 1383 #endif 1384 { 1385 /* In non-UTF-8 mode, the two bit maps are completely compatible. */ 1386 for (c = 0; c < 32; c++) start_bits[c] |= map[c]; 1387 } 1388 } 1389 1390 /* Advance past the bit map, and act on what follows. For a zero 1391 minimum repeat, continue; otherwise stop processing. */ 1392 1393 switch (*tcode) 1394 { 1395 case OP_CRSTAR: 1396 case OP_CRMINSTAR: 1397 case OP_CRQUERY: 1398 case OP_CRMINQUERY: 1399 case OP_CRPOSSTAR: 1400 case OP_CRPOSQUERY: 1401 tcode++; 1402 break; 1403 1404 case OP_CRRANGE: 1405 case OP_CRMINRANGE: 1406 case OP_CRPOSRANGE: 1407 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; 1408 else try_next = FALSE; 1409 break; 1410 1411 default: 1412 try_next = FALSE; 1413 break; 1414 } 1415 } 1416 break; /* End of bitmap class handling */ 1417 1418 } /* End of switch */ 1419 } /* End of try_next loop */ 1420 1421 code += GET(code, 1); /* Advance to next branch */ 1422 } 1423 while (*code == OP_ALT); 1424 return yield; 1425 } 1426 1427 1428 1429 1430 1431 /************************************************* 1432 * Study a compiled expression * 1433 *************************************************/ 1434 1435 /* This function is handed a compiled expression that it must study to produce 1436 information that will speed up the matching. It returns a pcre[16]_extra block 1437 which then gets handed back to pcre_exec(). 1438 1439 Arguments: 1440 re points to the compiled expression 1441 options contains option bits 1442 errorptr points to where to place error messages; 1443 set NULL unless error 1444 1445 Returns: pointer to a pcre[16]_extra block, with study_data filled in and 1446 the appropriate flags set; 1447 NULL on error or if no optimization possible 1448 */ 1449 1450 #if defined COMPILE_PCRE8 1451 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION 1452 pcre_study(const pcre *external_re, int options, const char **errorptr) 1453 #elif defined COMPILE_PCRE16 1454 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION 1455 pcre16_study(const pcre16 *external_re, int options, const char **errorptr) 1456 #elif defined COMPILE_PCRE32 1457 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION 1458 pcre32_study(const pcre32 *external_re, int options, const char **errorptr) 1459 #endif 1460 { 1461 int min; 1462 int count = 0; 1463 BOOL bits_set = FALSE; 1464 pcre_uint8 start_bits[32]; 1465 PUBL(extra) *extra = NULL; 1466 pcre_study_data *study; 1467 const pcre_uint8 *tables; 1468 pcre_uchar *code; 1469 compile_data compile_block; 1470 const REAL_PCRE *re = (const REAL_PCRE *)external_re; 1471 1472 1473 *errorptr = NULL; 1474 1475 if (re == NULL || re->magic_number != MAGIC_NUMBER) 1476 { 1477 *errorptr = "argument is not a compiled regular expression"; 1478 return NULL; 1479 } 1480 1481 if ((re->flags & PCRE_MODE) == 0) 1482 { 1483 #if defined COMPILE_PCRE8 1484 *errorptr = "argument not compiled in 8 bit mode"; 1485 #elif defined COMPILE_PCRE16 1486 *errorptr = "argument not compiled in 16 bit mode"; 1487 #elif defined COMPILE_PCRE32 1488 *errorptr = "argument not compiled in 32 bit mode"; 1489 #endif 1490 return NULL; 1491 } 1492 1493 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) 1494 { 1495 *errorptr = "unknown or incorrect option bit(s) set"; 1496 return NULL; 1497 } 1498 1499 code = (pcre_uchar *)re + re->name_table_offset + 1500 (re->name_count * re->name_entry_size); 1501 1502 /* For an anchored pattern, or an unanchored pattern that has a first char, or 1503 a multiline pattern that matches only at "line starts", there is no point in 1504 seeking a list of starting bytes. */ 1505 1506 if ((re->options & PCRE_ANCHORED) == 0 && 1507 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) 1508 { 1509 int rc; 1510 1511 /* Set the character tables in the block that is passed around */ 1512 1513 tables = re->tables; 1514 1515 #if defined COMPILE_PCRE8 1516 if (tables == NULL) 1517 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1518 (void *)(&tables)); 1519 #elif defined COMPILE_PCRE16 1520 if (tables == NULL) 1521 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1522 (void *)(&tables)); 1523 #elif defined COMPILE_PCRE32 1524 if (tables == NULL) 1525 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, 1526 (void *)(&tables)); 1527 #endif 1528 1529 compile_block.lcc = tables + lcc_offset; 1530 compile_block.fcc = tables + fcc_offset; 1531 compile_block.cbits = tables + cbits_offset; 1532 compile_block.ctypes = tables + ctypes_offset; 1533 1534 /* See if we can find a fixed set of initial characters for the pattern. */ 1535 1536 memset(start_bits, 0, 32 * sizeof(pcre_uint8)); 1537 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0, 1538 &compile_block); 1539 bits_set = rc == SSB_DONE; 1540 if (rc == SSB_UNKNOWN) 1541 { 1542 *errorptr = "internal error: opcode not recognized"; 1543 return NULL; 1544 } 1545 } 1546 1547 /* Find the minimum length of subject string. */ 1548 1549 switch(min = find_minlength(re, code, code, re->options, NULL, &count)) 1550 { 1551 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL; 1552 case -3: *errorptr = "internal error: opcode not recognized"; return NULL; 1553 default: break; 1554 } 1555 1556 /* If a set of starting bytes has been identified, or if the minimum length is 1557 greater than zero, or if JIT optimization has been requested, or if 1558 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a 1559 pcre_study_data block. The study data is put in the latter, which is pointed to 1560 by the former, which may also get additional data set later by the calling 1561 program. At the moment, the size of pcre_study_data is fixed. We nevertheless 1562 save it in a field for returning via the pcre_fullinfo() function so that if it 1563 becomes variable in the future, we don't have to change that code. */ 1564 1565 if (bits_set || min > 0 || (options & ( 1566 #ifdef SUPPORT_JIT 1567 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE | 1568 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE | 1569 #endif 1570 PCRE_STUDY_EXTRA_NEEDED)) != 0) 1571 { 1572 extra = (PUBL(extra) *)(PUBL(malloc)) 1573 (sizeof(PUBL(extra)) + sizeof(pcre_study_data)); 1574 if (extra == NULL) 1575 { 1576 *errorptr = "failed to get memory"; 1577 return NULL; 1578 } 1579 1580 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra))); 1581 extra->flags = PCRE_EXTRA_STUDY_DATA; 1582 extra->study_data = study; 1583 1584 study->size = sizeof(pcre_study_data); 1585 study->flags = 0; 1586 1587 /* Set the start bits always, to avoid unset memory errors if the 1588 study data is written to a file, but set the flag only if any of the bits 1589 are set, to save time looking when none are. */ 1590 1591 if (bits_set) 1592 { 1593 study->flags |= PCRE_STUDY_MAPPED; 1594 memcpy(study->start_bits, start_bits, sizeof(start_bits)); 1595 } 1596 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8)); 1597 1598 #ifdef PCRE_DEBUG 1599 if (bits_set) 1600 { 1601 pcre_uint8 *ptr = start_bits; 1602 int i; 1603 1604 printf("Start bits:\n"); 1605 for (i = 0; i < 32; i++) 1606 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n"); 1607 } 1608 #endif 1609 1610 /* Always set the minlength value in the block, because the JIT compiler 1611 makes use of it. However, don't set the bit unless the length is greater than 1612 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time 1613 checking the zero case. */ 1614 1615 if (min > 0) 1616 { 1617 study->flags |= PCRE_STUDY_MINLEN; 1618 study->minlength = min; 1619 } 1620 else study->minlength = 0; 1621 1622 /* If JIT support was compiled and requested, attempt the JIT compilation. 1623 If no starting bytes were found, and the minimum length is zero, and JIT 1624 compilation fails, abandon the extra block and return NULL, unless 1625 PCRE_STUDY_EXTRA_NEEDED is set. */ 1626 1627 #ifdef SUPPORT_JIT 1628 extra->executable_jit = NULL; 1629 if ((options & PCRE_STUDY_JIT_COMPILE) != 0) 1630 PRIV(jit_compile)(re, extra, JIT_COMPILE); 1631 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0) 1632 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE); 1633 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0) 1634 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE); 1635 1636 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 && 1637 (options & PCRE_STUDY_EXTRA_NEEDED) == 0) 1638 { 1639 #if defined COMPILE_PCRE8 1640 pcre_free_study(extra); 1641 #elif defined COMPILE_PCRE16 1642 pcre16_free_study(extra); 1643 #elif defined COMPILE_PCRE32 1644 pcre32_free_study(extra); 1645 #endif 1646 extra = NULL; 1647 } 1648 #endif 1649 } 1650 1651 return extra; 1652 } 1653 1654 1655 /************************************************* 1656 * Free the study data * 1657 *************************************************/ 1658 1659 /* This function frees the memory that was obtained by pcre_study(). 1660 1661 Argument: a pointer to the pcre[16]_extra block 1662 Returns: nothing 1663 */ 1664 1665 #if defined COMPILE_PCRE8 1666 PCRE_EXP_DEFN void 1667 pcre_free_study(pcre_extra *extra) 1668 #elif defined COMPILE_PCRE16 1669 PCRE_EXP_DEFN void 1670 pcre16_free_study(pcre16_extra *extra) 1671 #elif defined COMPILE_PCRE32 1672 PCRE_EXP_DEFN void 1673 pcre32_free_study(pcre32_extra *extra) 1674 #endif 1675 { 1676 if (extra == NULL) 1677 return; 1678 #ifdef SUPPORT_JIT 1679 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 && 1680 extra->executable_jit != NULL) 1681 PRIV(jit_free)(extra->executable_jit); 1682 #endif 1683 PUBL(free)(extra); 1684 } 1685 1686 /* End of pcre_study.c */ 1687