1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 /* This module contains functions for scanning a compiled pattern and 42 collecting data (e.g. minimum matching length). */ 43 44 45 #ifdef HAVE_CONFIG_H 46 #include "config.h" 47 #endif 48 49 50 #include "pcre2_internal.h" 51 52 53 /* Set a bit in the starting code unit bit map. */ 54 55 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7)) 56 57 /* Returns from set_start_bits() */ 58 59 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; 60 61 62 /************************************************* 63 * Find the minimum subject length for a group * 64 *************************************************/ 65 66 /* Scan a parenthesized group and compute the minimum length of subject that 67 is needed to match it. This is a lower bound; it does not mean there is a 68 string of that length that matches. In UTF mode, the result is in characters 69 rather than code units. The field in a compiled pattern for storing the minimum 70 length is 16-bits long (on the grounds that anything longer than that is 71 pathological), so we give up when we reach that amount. This also means that 72 integer overflow for really crazy patterns cannot happen. 73 74 Arguments: 75 re compiled pattern block 76 code pointer to start of group (the bracket) 77 startcode pointer to start of the whole pattern's code 78 utf UTF flag 79 recurses chain of recurse_check to catch mutual recursion 80 countptr pointer to call count (to catch over complexity) 81 82 Returns: the minimum length 83 -1 \C in UTF-8 mode 84 or (*ACCEPT) 85 or pattern too complicated 86 or back reference to duplicate name/number 87 -2 internal error (missing capturing bracket) 88 -3 internal error (opcode not listed) 89 */ 90 91 static int 92 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, 93 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr) 94 { 95 int length = -1; 96 int prev_cap_recno = -1; 97 int prev_cap_d = 0; 98 int prev_recurse_recno = -1; 99 int prev_recurse_d = 0; 100 uint32_t once_fudge = 0; 101 BOOL had_recurse = FALSE; 102 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0; 103 recurse_check this_recurse; 104 register int branchlength = 0; 105 register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; 106 107 /* If this is a "could be empty" group, its minimum length is 0. */ 108 109 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0; 110 111 /* Skip over capturing bracket number */ 112 113 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE; 114 115 /* A large and/or complex regex can take too long to process. */ 116 117 if ((*countptr)++ > 1000) return -1; 118 119 /* Scan along the opcodes for this branch. If we get to the end of the branch, 120 check the length against that of the other branches. If the accumulated length 121 passes 16-bits, stop. */ 122 123 for (;;) 124 { 125 int d, min, recno; 126 PCRE2_UCHAR *cs, *ce; 127 register PCRE2_UCHAR op = *cc; 128 129 if (branchlength >= UINT16_MAX) return UINT16_MAX; 130 131 switch (op) 132 { 133 case OP_COND: 134 case OP_SCOND: 135 136 /* If there is only one branch in a condition, the implied branch has zero 137 length, so we don't add anything. This covers the DEFINE "condition" 138 automatically. If there are two branches we can treat it the same as any 139 other non-capturing subpattern. */ 140 141 cs = cc + GET(cc, 1); 142 if (*cs != OP_ALT) 143 { 144 cc = cs + 1 + LINK_SIZE; 145 break; 146 } 147 goto PROCESS_NON_CAPTURE; 148 149 /* There's a special case of OP_ONCE, when it is wrapped round an 150 OP_RECURSE. We'd like to process the latter at this level so that 151 remembering the value works for repeated cases. So we do nothing, but 152 set a fudge value to skip over the OP_KET after the recurse. */ 153 154 case OP_ONCE: 155 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) 156 { 157 once_fudge = 1 + LINK_SIZE; 158 cc += 1 + LINK_SIZE; 159 break; 160 } 161 /* Fall through */ 162 163 case OP_ONCE_NC: 164 case OP_BRA: 165 case OP_SBRA: 166 case OP_BRAPOS: 167 case OP_SBRAPOS: 168 PROCESS_NON_CAPTURE: 169 d = find_minlength(re, cc, startcode, utf, recurses, countptr); 170 if (d < 0) return d; 171 branchlength += d; 172 do cc += GET(cc, 1); while (*cc == OP_ALT); 173 cc += 1 + LINK_SIZE; 174 break; 175 176 /* To save time for repeated capturing subpatterns, we remember the 177 length of the previous one. Unfortunately we can't do the same for 178 the unnumbered ones above. Nor can we do this if (?| is present in the 179 pattern because captures with the same number are not then identical. */ 180 181 case OP_CBRA: 182 case OP_SCBRA: 183 case OP_CBRAPOS: 184 case OP_SCBRAPOS: 185 recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE); 186 if (recno != prev_cap_recno) 187 { 188 prev_cap_recno = recno; 189 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr); 190 if (prev_cap_d < 0) return prev_cap_d; 191 } 192 branchlength += prev_cap_d; 193 do cc += GET(cc, 1); while (*cc == OP_ALT); 194 cc += 1 + LINK_SIZE; 195 break; 196 197 /* ACCEPT makes things far too complicated; we have to give up. */ 198 199 case OP_ACCEPT: 200 case OP_ASSERT_ACCEPT: 201 return -1; 202 203 /* Reached end of a branch; if it's a ket it is the end of a nested 204 call. If it's ALT it is an alternation in a nested call. If it is END it's 205 the end of the outer call. All can be handled by the same code. If an 206 ACCEPT was previously encountered, use the length that was in force at that 207 time, and pass back the shortest ACCEPT length. */ 208 209 case OP_ALT: 210 case OP_KET: 211 case OP_KETRMAX: 212 case OP_KETRMIN: 213 case OP_KETRPOS: 214 case OP_END: 215 if (length < 0 || (!had_recurse && branchlength < length)) 216 length = branchlength; 217 if (op != OP_ALT) return length; 218 cc += 1 + LINK_SIZE; 219 branchlength = 0; 220 had_recurse = FALSE; 221 break; 222 223 /* Skip over assertive subpatterns */ 224 225 case OP_ASSERT: 226 case OP_ASSERT_NOT: 227 case OP_ASSERTBACK: 228 case OP_ASSERTBACK_NOT: 229 do cc += GET(cc, 1); while (*cc == OP_ALT); 230 /* Fall through */ 231 232 /* Skip over things that don't match chars */ 233 234 case OP_REVERSE: 235 case OP_CREF: 236 case OP_DNCREF: 237 case OP_RREF: 238 case OP_DNRREF: 239 case OP_FALSE: 240 case OP_TRUE: 241 case OP_CALLOUT: 242 case OP_SOD: 243 case OP_SOM: 244 case OP_EOD: 245 case OP_EODN: 246 case OP_CIRC: 247 case OP_CIRCM: 248 case OP_DOLL: 249 case OP_DOLLM: 250 case OP_NOT_WORD_BOUNDARY: 251 case OP_WORD_BOUNDARY: 252 cc += PRIV(OP_lengths)[*cc]; 253 break; 254 255 case OP_CALLOUT_STR: 256 cc += GET(cc, 1 + 2*LINK_SIZE); 257 break; 258 259 /* Skip over a subpattern that has a {0} or {0,x} quantifier */ 260 261 case OP_BRAZERO: 262 case OP_BRAMINZERO: 263 case OP_BRAPOSZERO: 264 case OP_SKIPZERO: 265 cc += PRIV(OP_lengths)[*cc]; 266 do cc += GET(cc, 1); while (*cc == OP_ALT); 267 cc += 1 + LINK_SIZE; 268 break; 269 270 /* Handle literal characters and + repetitions */ 271 272 case OP_CHAR: 273 case OP_CHARI: 274 case OP_NOT: 275 case OP_NOTI: 276 case OP_PLUS: 277 case OP_PLUSI: 278 case OP_MINPLUS: 279 case OP_MINPLUSI: 280 case OP_POSPLUS: 281 case OP_POSPLUSI: 282 case OP_NOTPLUS: 283 case OP_NOTPLUSI: 284 case OP_NOTMINPLUS: 285 case OP_NOTMINPLUSI: 286 case OP_NOTPOSPLUS: 287 case OP_NOTPOSPLUSI: 288 branchlength++; 289 cc += 2; 290 #ifdef SUPPORT_UNICODE 291 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 292 #endif 293 break; 294 295 case OP_TYPEPLUS: 296 case OP_TYPEMINPLUS: 297 case OP_TYPEPOSPLUS: 298 branchlength++; 299 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; 300 break; 301 302 /* Handle exact repetitions. The count is already in characters, but we 303 may need to skip over a multibyte character in UTF mode. */ 304 305 case OP_EXACT: 306 case OP_EXACTI: 307 case OP_NOTEXACT: 308 case OP_NOTEXACTI: 309 branchlength += GET2(cc,1); 310 cc += 2 + IMM2_SIZE; 311 #ifdef SUPPORT_UNICODE 312 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 313 #endif 314 break; 315 316 case OP_TYPEEXACT: 317 branchlength += GET2(cc,1); 318 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP 319 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 320 break; 321 322 /* Handle single-char non-literal matchers */ 323 324 case OP_PROP: 325 case OP_NOTPROP: 326 cc += 2; 327 /* Fall through */ 328 329 case OP_NOT_DIGIT: 330 case OP_DIGIT: 331 case OP_NOT_WHITESPACE: 332 case OP_WHITESPACE: 333 case OP_NOT_WORDCHAR: 334 case OP_WORDCHAR: 335 case OP_ANY: 336 case OP_ALLANY: 337 case OP_EXTUNI: 338 case OP_HSPACE: 339 case OP_NOT_HSPACE: 340 case OP_VSPACE: 341 case OP_NOT_VSPACE: 342 branchlength++; 343 cc++; 344 break; 345 346 /* "Any newline" might match two characters, but it also might match just 347 one. */ 348 349 case OP_ANYNL: 350 branchlength += 1; 351 cc++; 352 break; 353 354 /* The single-byte matcher means we can't proceed in UTF mode. (In 355 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever 356 appear, but leave the code, just in case.) */ 357 358 case OP_ANYBYTE: 359 #ifdef SUPPORT_UNICODE 360 if (utf) return -1; 361 #endif 362 branchlength++; 363 cc++; 364 break; 365 366 /* For repeated character types, we have to test for \p and \P, which have 367 an extra two bytes of parameters. */ 368 369 case OP_TYPESTAR: 370 case OP_TYPEMINSTAR: 371 case OP_TYPEQUERY: 372 case OP_TYPEMINQUERY: 373 case OP_TYPEPOSSTAR: 374 case OP_TYPEPOSQUERY: 375 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; 376 cc += PRIV(OP_lengths)[op]; 377 break; 378 379 case OP_TYPEUPTO: 380 case OP_TYPEMINUPTO: 381 case OP_TYPEPOSUPTO: 382 if (cc[1 + IMM2_SIZE] == OP_PROP 383 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; 384 cc += PRIV(OP_lengths)[op]; 385 break; 386 387 /* Check a class for variable quantification */ 388 389 case OP_CLASS: 390 case OP_NCLASS: 391 #ifdef SUPPORT_WIDE_CHARS 392 case OP_XCLASS: 393 /* The original code caused an unsigned overflow in 64 bit systems, 394 so now we use a conditional statement. */ 395 if (op == OP_XCLASS) 396 cc += GET(cc, 1); 397 else 398 cc += PRIV(OP_lengths)[OP_CLASS]; 399 #else 400 cc += PRIV(OP_lengths)[OP_CLASS]; 401 #endif 402 403 switch (*cc) 404 { 405 case OP_CRPLUS: 406 case OP_CRMINPLUS: 407 case OP_CRPOSPLUS: 408 branchlength++; 409 /* Fall through */ 410 411 case OP_CRSTAR: 412 case OP_CRMINSTAR: 413 case OP_CRQUERY: 414 case OP_CRMINQUERY: 415 case OP_CRPOSSTAR: 416 case OP_CRPOSQUERY: 417 cc++; 418 break; 419 420 case OP_CRRANGE: 421 case OP_CRMINRANGE: 422 case OP_CRPOSRANGE: 423 branchlength += GET2(cc,1); 424 cc += 1 + 2 * IMM2_SIZE; 425 break; 426 427 default: 428 branchlength++; 429 break; 430 } 431 break; 432 433 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same 434 way: we find the minimum length for the subpattern. A recursion 435 (backreference or subroutine) causes an a flag to be set that causes the 436 length of this branch to be ignored. The logic is that a recursion can only 437 make sense if there is another alternative that stops the recursing. That 438 will provide the minimum length (when no recursion happens). 439 440 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket 441 matches an empty string (by default it causes a matching failure), so in 442 that case we must set the minimum length to zero. */ 443 444 /* Duplicate named pattern back reference. We cannot reliably find a length 445 for this if duplicate numbers are present in the pattern. */ 446 447 case OP_DNREF: 448 case OP_DNREFI: 449 if (dupcapused) return -1; 450 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) 451 { 452 int count = GET2(cc, 1+IMM2_SIZE); 453 PCRE2_UCHAR *slot = 454 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + 455 GET2(cc, 1) * re->name_entry_size; 456 457 d = INT_MAX; 458 459 /* Scan all groups with the same name */ 460 461 while (count-- > 0) 462 { 463 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0)); 464 if (cs == NULL) return -2; 465 do ce += GET(ce, 1); while (*ce == OP_ALT); 466 if (cc > cs && cc < ce) /* Simple recursion */ 467 { 468 d = 0; 469 had_recurse = TRUE; 470 break; 471 } 472 else 473 { 474 recurse_check *r = recurses; 475 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 476 if (r != NULL) /* Mutual recursion */ 477 { 478 d = 0; 479 had_recurse = TRUE; 480 break; 481 } 482 else 483 { 484 int dd; 485 this_recurse.prev = recurses; 486 this_recurse.group = cs; 487 dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr); 488 if (dd < d) d = dd; 489 } 490 } 491 slot += re->name_entry_size; 492 } 493 } 494 else d = 0; 495 cc += 1 + 2*IMM2_SIZE; 496 goto REPEAT_BACK_REFERENCE; 497 498 /* Single back reference. We cannot find a length for this if duplicate 499 numbers are present in the pattern. */ 500 501 case OP_REF: 502 case OP_REFI: 503 if (dupcapused) return -1; 504 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) 505 { 506 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); 507 if (cs == NULL) return -2; 508 do ce += GET(ce, 1); while (*ce == OP_ALT); 509 if (cc > cs && cc < ce) /* Simple recursion */ 510 { 511 d = 0; 512 had_recurse = TRUE; 513 } 514 else 515 { 516 recurse_check *r = recurses; 517 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 518 if (r != NULL) /* Mutual recursion */ 519 { 520 d = 0; 521 had_recurse = TRUE; 522 } 523 else 524 { 525 this_recurse.prev = recurses; 526 this_recurse.group = cs; 527 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr); 528 } 529 } 530 } 531 else d = 0; 532 cc += 1 + IMM2_SIZE; 533 534 /* Handle repeated back references */ 535 536 REPEAT_BACK_REFERENCE: 537 switch (*cc) 538 { 539 case OP_CRSTAR: 540 case OP_CRMINSTAR: 541 case OP_CRQUERY: 542 case OP_CRMINQUERY: 543 case OP_CRPOSSTAR: 544 case OP_CRPOSQUERY: 545 min = 0; 546 cc++; 547 break; 548 549 case OP_CRPLUS: 550 case OP_CRMINPLUS: 551 case OP_CRPOSPLUS: 552 min = 1; 553 cc++; 554 break; 555 556 case OP_CRRANGE: 557 case OP_CRMINRANGE: 558 case OP_CRPOSRANGE: 559 min = GET2(cc, 1); 560 cc += 1 + 2 * IMM2_SIZE; 561 break; 562 563 default: 564 min = 1; 565 break; 566 } 567 568 /* Take care not to overflow: (1) min and d are ints, so check that their 569 product is not greater than INT_MAX. (2) branchlength is limited to 570 UINT16_MAX (checked at the top of the loop). */ 571 572 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d) 573 branchlength = UINT16_MAX; 574 else branchlength += min * d; 575 break; 576 577 /* Recursion always refers to the first occurrence of a subpattern with a 578 given number. Therefore, we can always make use of caching, even when the 579 pattern contains multiple subpatterns with the same number. */ 580 581 case OP_RECURSE: 582 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); 583 recno = GET2(cs, 1+LINK_SIZE); 584 if (recno == prev_recurse_recno) 585 { 586 branchlength += prev_recurse_d; 587 } 588 else 589 { 590 do ce += GET(ce, 1); while (*ce == OP_ALT); 591 if (cc > cs && cc < ce) /* Simple recursion */ 592 had_recurse = TRUE; 593 else 594 { 595 recurse_check *r = recurses; 596 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 597 if (r != NULL) /* Mutual recursion */ 598 had_recurse = TRUE; 599 else 600 { 601 this_recurse.prev = recurses; 602 this_recurse.group = cs; 603 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse, 604 countptr); 605 if (prev_recurse_d < 0) return prev_recurse_d; 606 prev_recurse_recno = recno; 607 branchlength += prev_recurse_d; 608 } 609 } 610 } 611 cc += 1 + LINK_SIZE + once_fudge; 612 once_fudge = 0; 613 break; 614 615 /* Anything else does not or need not match a character. We can get the 616 item's length from the table, but for those that can match zero occurrences 617 of a character, we must take special action for UTF-8 characters. As it 618 happens, the "NOT" versions of these opcodes are used at present only for 619 ASCII characters, so they could be omitted from this list. However, in 620 future that may change, so we include them here so as not to leave a 621 gotcha for a future maintainer. */ 622 623 case OP_UPTO: 624 case OP_UPTOI: 625 case OP_NOTUPTO: 626 case OP_NOTUPTOI: 627 case OP_MINUPTO: 628 case OP_MINUPTOI: 629 case OP_NOTMINUPTO: 630 case OP_NOTMINUPTOI: 631 case OP_POSUPTO: 632 case OP_POSUPTOI: 633 case OP_NOTPOSUPTO: 634 case OP_NOTPOSUPTOI: 635 636 case OP_STAR: 637 case OP_STARI: 638 case OP_NOTSTAR: 639 case OP_NOTSTARI: 640 case OP_MINSTAR: 641 case OP_MINSTARI: 642 case OP_NOTMINSTAR: 643 case OP_NOTMINSTARI: 644 case OP_POSSTAR: 645 case OP_POSSTARI: 646 case OP_NOTPOSSTAR: 647 case OP_NOTPOSSTARI: 648 649 case OP_QUERY: 650 case OP_QUERYI: 651 case OP_NOTQUERY: 652 case OP_NOTQUERYI: 653 case OP_MINQUERY: 654 case OP_MINQUERYI: 655 case OP_NOTMINQUERY: 656 case OP_NOTMINQUERYI: 657 case OP_POSQUERY: 658 case OP_POSQUERYI: 659 case OP_NOTPOSQUERY: 660 case OP_NOTPOSQUERYI: 661 662 cc += PRIV(OP_lengths)[op]; 663 #ifdef SUPPORT_UNICODE 664 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 665 #endif 666 break; 667 668 /* Skip these, but we need to add in the name length. */ 669 670 case OP_MARK: 671 case OP_PRUNE_ARG: 672 case OP_SKIP_ARG: 673 case OP_THEN_ARG: 674 cc += PRIV(OP_lengths)[op] + cc[1]; 675 break; 676 677 /* The remaining opcodes are just skipped over. */ 678 679 case OP_CLOSE: 680 case OP_COMMIT: 681 case OP_FAIL: 682 case OP_PRUNE: 683 case OP_SET_SOM: 684 case OP_SKIP: 685 case OP_THEN: 686 cc += PRIV(OP_lengths)[op]; 687 break; 688 689 /* This should not occur: we list all opcodes explicitly so that when 690 new ones get added they are properly considered. */ 691 692 default: 693 return -3; 694 } 695 } 696 /* Control never gets here */ 697 } 698 699 700 701 /************************************************* 702 * Set a bit and maybe its alternate case * 703 *************************************************/ 704 705 /* Given a character, set its first code unit's bit in the table, and also the 706 corresponding bit for the other version of a letter if we are caseless. 707 708 Arguments: 709 re points to the regex block 710 p points to the first code unit of the character 711 caseless TRUE if caseless 712 utf TRUE for UTF mode 713 714 Returns: pointer after the character 715 */ 716 717 static PCRE2_SPTR 718 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf) 719 { 720 uint32_t c = *p++; /* First code unit */ 721 (void)utf; /* Stop compiler warning when UTF not supported */ 722 723 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for 724 0xff. */ 725 726 #if PCRE2_CODE_UNIT_WIDTH != 8 727 if (c > 0xff) SET_BIT(0xff); else 728 #endif 729 730 SET_BIT(c); 731 732 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find 733 the end of the character, even when caseless. */ 734 735 #ifdef SUPPORT_UNICODE 736 if (utf) 737 { 738 #if PCRE2_CODE_UNIT_WIDTH == 8 739 if (c >= 0xc0) GETUTF8INC(c, p); 740 #elif PCRE2_CODE_UNIT_WIDTH == 16 741 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p); 742 #endif 743 } 744 #endif /* SUPPORT_UNICODE */ 745 746 /* If caseless, handle the other case of the character. */ 747 748 if (caseless) 749 { 750 if (utf) 751 { 752 #if PCRE2_CODE_UNIT_WIDTH == 8 753 PCRE2_UCHAR buff[6]; 754 c = UCD_OTHERCASE(c); 755 (void)PRIV(ord2utf)(c, buff); 756 SET_BIT(buff[0]); 757 #else /* 16-bit or 32-bit mode */ 758 c = UCD_OTHERCASE(c); 759 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); 760 #endif 761 } 762 763 /* Not UTF */ 764 765 else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]); 766 } 767 768 return p; 769 } 770 771 772 773 /************************************************* 774 * Set bits for a positive character type * 775 *************************************************/ 776 777 /* This function sets starting bits for a character type. In UTF-8 mode, we can 778 only do a direct setting for bytes less than 128, as otherwise there can be 779 confusion with bytes in the middle of UTF-8 characters. In a "traditional" 780 environment, the tables will only recognize ASCII characters anyway, but in at 781 least one Windows environment, some higher bytes bits were set in the tables. 782 So we deal with that case by considering the UTF-8 encoding. 783 784 Arguments: 785 re the regex block 786 cbit type the type of character wanted 787 table_limit 32 for non-UTF-8; 16 for UTF-8 788 789 Returns: nothing 790 */ 791 792 static void 793 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) 794 { 795 register uint32_t c; 796 for (c = 0; c < table_limit; c++) 797 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type]; 798 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 799 if (table_limit == 32) return; 800 for (c = 128; c < 256; c++) 801 { 802 if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0) 803 { 804 PCRE2_UCHAR buff[6]; 805 (void)PRIV(ord2utf)(c, buff); 806 SET_BIT(buff[0]); 807 } 808 } 809 #endif /* UTF-8 */ 810 } 811 812 813 /************************************************* 814 * Set bits for a negative character type * 815 *************************************************/ 816 817 /* This function sets starting bits for a negative character type such as \D. 818 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as 819 otherwise there can be confusion with bytes in the middle of UTF-8 characters. 820 Unlike in the positive case, where we can set appropriate starting bits for 821 specific high-valued UTF-8 characters, in this case we have to set the bits for 822 all high-valued characters. The lowest is 0xc2, but we overkill by starting at 823 0xc0 (192) for simplicity. 824 825 Arguments: 826 re the regex block 827 cbit type the type of character wanted 828 table_limit 32 for non-UTF-8; 16 for UTF-8 829 830 Returns: nothing 831 */ 832 833 static void 834 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) 835 { 836 register uint32_t c; 837 for (c = 0; c < table_limit; c++) 838 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]); 839 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 840 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff; 841 #endif 842 } 843 844 845 846 /************************************************* 847 * Create bitmap of starting bytes * 848 *************************************************/ 849 850 /* This function scans a compiled unanchored expression recursively and 851 attempts to build a bitmap of the set of possible starting code units whose 852 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause 853 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode 854 we pass a value of 16 rather than 32 as the final argument. (See comments in 855 those functions for the reason.) 856 857 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as 858 (a*)b where the group provides some optional starting code units but scanning 859 must continue at the outer level to find at least one mandatory code unit. At 860 the outermost level, this function fails unless the result is SSB_DONE. 861 862 Arguments: 863 re points to the compiled regex block 864 code points to an expression 865 utf TRUE if in UTF mode 866 867 Returns: SSB_FAIL => Failed to find any starting code units 868 SSB_DONE => Found mandatory starting code units 869 SSB_CONTINUE => Found optional starting code units 870 SSB_UNKNOWN => Hit an unrecognized opcode 871 */ 872 873 static int 874 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf) 875 { 876 register uint32_t c; 877 int yield = SSB_DONE; 878 879 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 880 int table_limit = utf? 16:32; 881 #else 882 int table_limit = 32; 883 #endif 884 885 do 886 { 887 BOOL try_next = TRUE; 888 PCRE2_SPTR tcode = code + 1 + LINK_SIZE; 889 890 if (*code == OP_CBRA || *code == OP_SCBRA || 891 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; 892 893 while (try_next) /* Loop for items in this branch */ 894 { 895 int rc; 896 uint8_t *classmap = NULL; 897 898 switch(*tcode) 899 { 900 /* If we reach something we don't understand, it means a new opcode has 901 been created that hasn't been added to this function. Hopefully this 902 problem will be discovered during testing. */ 903 904 default: 905 return SSB_UNKNOWN; 906 907 /* Fail for a valid opcode that implies no starting bits. */ 908 909 case OP_ACCEPT: 910 case OP_ASSERT_ACCEPT: 911 case OP_ALLANY: 912 case OP_ANY: 913 case OP_ANYBYTE: 914 case OP_CIRC: 915 case OP_CIRCM: 916 case OP_CLOSE: 917 case OP_COMMIT: 918 case OP_COND: 919 case OP_CREF: 920 case OP_FALSE: 921 case OP_TRUE: 922 case OP_DNCREF: 923 case OP_DNREF: 924 case OP_DNREFI: 925 case OP_DNRREF: 926 case OP_DOLL: 927 case OP_DOLLM: 928 case OP_END: 929 case OP_EOD: 930 case OP_EODN: 931 case OP_EXTUNI: 932 case OP_FAIL: 933 case OP_MARK: 934 case OP_NOT: 935 case OP_NOTEXACT: 936 case OP_NOTEXACTI: 937 case OP_NOTI: 938 case OP_NOTMINPLUS: 939 case OP_NOTMINPLUSI: 940 case OP_NOTMINQUERY: 941 case OP_NOTMINQUERYI: 942 case OP_NOTMINSTAR: 943 case OP_NOTMINSTARI: 944 case OP_NOTMINUPTO: 945 case OP_NOTMINUPTOI: 946 case OP_NOTPLUS: 947 case OP_NOTPLUSI: 948 case OP_NOTPOSPLUS: 949 case OP_NOTPOSPLUSI: 950 case OP_NOTPOSQUERY: 951 case OP_NOTPOSQUERYI: 952 case OP_NOTPOSSTAR: 953 case OP_NOTPOSSTARI: 954 case OP_NOTPOSUPTO: 955 case OP_NOTPOSUPTOI: 956 case OP_NOTPROP: 957 case OP_NOTQUERY: 958 case OP_NOTQUERYI: 959 case OP_NOTSTAR: 960 case OP_NOTSTARI: 961 case OP_NOTUPTO: 962 case OP_NOTUPTOI: 963 case OP_NOT_HSPACE: 964 case OP_NOT_VSPACE: 965 case OP_PRUNE: 966 case OP_PRUNE_ARG: 967 case OP_RECURSE: 968 case OP_REF: 969 case OP_REFI: 970 case OP_REVERSE: 971 case OP_RREF: 972 case OP_SCOND: 973 case OP_SET_SOM: 974 case OP_SKIP: 975 case OP_SKIP_ARG: 976 case OP_SOD: 977 case OP_SOM: 978 case OP_THEN: 979 case OP_THEN_ARG: 980 return SSB_FAIL; 981 982 /* A "real" property test implies no starting bits, but the fake property 983 PT_CLIST identifies a list of characters. These lists are short, as they 984 are used for characters with more than one "other case", so there is no 985 point in recognizing them for OP_NOTPROP. */ 986 987 case OP_PROP: 988 if (tcode[1] != PT_CLIST) return SSB_FAIL; 989 { 990 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2]; 991 while ((c = *p++) < NOTACHAR) 992 { 993 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 994 if (utf) 995 { 996 PCRE2_UCHAR buff[6]; 997 (void)PRIV(ord2utf)(c, buff); 998 c = buff[0]; 999 } 1000 #endif 1001 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); 1002 } 1003 } 1004 try_next = FALSE; 1005 break; 1006 1007 /* We can ignore word boundary tests. */ 1008 1009 case OP_WORD_BOUNDARY: 1010 case OP_NOT_WORD_BOUNDARY: 1011 tcode++; 1012 break; 1013 1014 /* If we hit a bracket or a positive lookahead assertion, recurse to set 1015 bits from within the subpattern. If it can't find anything, we have to 1016 give up. If it finds some mandatory character(s), we are done for this 1017 branch. Otherwise, carry on scanning after the subpattern. */ 1018 1019 case OP_BRA: 1020 case OP_SBRA: 1021 case OP_CBRA: 1022 case OP_SCBRA: 1023 case OP_BRAPOS: 1024 case OP_SBRAPOS: 1025 case OP_CBRAPOS: 1026 case OP_SCBRAPOS: 1027 case OP_ONCE: 1028 case OP_ONCE_NC: 1029 case OP_ASSERT: 1030 rc = set_start_bits(re, tcode, utf); 1031 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 1032 if (rc == SSB_DONE) try_next = FALSE; else 1033 { 1034 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 1035 tcode += 1 + LINK_SIZE; 1036 } 1037 break; 1038 1039 /* If we hit ALT or KET, it means we haven't found anything mandatory in 1040 this branch, though we might have found something optional. For ALT, we 1041 continue with the next alternative, but we have to arrange that the final 1042 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, 1043 return SSB_CONTINUE: if this is the top level, that indicates failure, 1044 but after a nested subpattern, it causes scanning to continue. */ 1045 1046 case OP_ALT: 1047 yield = SSB_CONTINUE; 1048 try_next = FALSE; 1049 break; 1050 1051 case OP_KET: 1052 case OP_KETRMAX: 1053 case OP_KETRMIN: 1054 case OP_KETRPOS: 1055 return SSB_CONTINUE; 1056 1057 /* Skip over callout */ 1058 1059 case OP_CALLOUT: 1060 tcode += PRIV(OP_lengths)[OP_CALLOUT]; 1061 break; 1062 1063 case OP_CALLOUT_STR: 1064 tcode += GET(tcode, 1 + 2*LINK_SIZE); 1065 break; 1066 1067 /* Skip over lookbehind and negative lookahead assertions */ 1068 1069 case OP_ASSERT_NOT: 1070 case OP_ASSERTBACK: 1071 case OP_ASSERTBACK_NOT: 1072 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 1073 tcode += 1 + LINK_SIZE; 1074 break; 1075 1076 /* BRAZERO does the bracket, but carries on. */ 1077 1078 case OP_BRAZERO: 1079 case OP_BRAMINZERO: 1080 case OP_BRAPOSZERO: 1081 rc = set_start_bits(re, ++tcode, utf); 1082 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 1083 do tcode += GET(tcode,1); while (*tcode == OP_ALT); 1084 tcode += 1 + LINK_SIZE; 1085 break; 1086 1087 /* SKIPZERO skips the bracket. */ 1088 1089 case OP_SKIPZERO: 1090 tcode++; 1091 do tcode += GET(tcode,1); while (*tcode == OP_ALT); 1092 tcode += 1 + LINK_SIZE; 1093 break; 1094 1095 /* Single-char * or ? sets the bit and tries the next item */ 1096 1097 case OP_STAR: 1098 case OP_MINSTAR: 1099 case OP_POSSTAR: 1100 case OP_QUERY: 1101 case OP_MINQUERY: 1102 case OP_POSQUERY: 1103 tcode = set_table_bit(re, tcode + 1, FALSE, utf); 1104 break; 1105 1106 case OP_STARI: 1107 case OP_MINSTARI: 1108 case OP_POSSTARI: 1109 case OP_QUERYI: 1110 case OP_MINQUERYI: 1111 case OP_POSQUERYI: 1112 tcode = set_table_bit(re, tcode + 1, TRUE, utf); 1113 break; 1114 1115 /* Single-char upto sets the bit and tries the next */ 1116 1117 case OP_UPTO: 1118 case OP_MINUPTO: 1119 case OP_POSUPTO: 1120 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf); 1121 break; 1122 1123 case OP_UPTOI: 1124 case OP_MINUPTOI: 1125 case OP_POSUPTOI: 1126 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf); 1127 break; 1128 1129 /* At least one single char sets the bit and stops */ 1130 1131 case OP_EXACT: 1132 tcode += IMM2_SIZE; 1133 /* Fall through */ 1134 case OP_CHAR: 1135 case OP_PLUS: 1136 case OP_MINPLUS: 1137 case OP_POSPLUS: 1138 (void)set_table_bit(re, tcode + 1, FALSE, utf); 1139 try_next = FALSE; 1140 break; 1141 1142 case OP_EXACTI: 1143 tcode += IMM2_SIZE; 1144 /* Fall through */ 1145 case OP_CHARI: 1146 case OP_PLUSI: 1147 case OP_MINPLUSI: 1148 case OP_POSPLUSI: 1149 (void)set_table_bit(re, tcode + 1, TRUE, utf); 1150 try_next = FALSE; 1151 break; 1152 1153 /* Special spacing and line-terminating items. These recognize specific 1154 lists of characters. The difference between VSPACE and ANYNL is that the 1155 latter can match the two-character CRLF sequence, but that is not 1156 relevant for finding the first character, so their code here is 1157 identical. */ 1158 1159 case OP_HSPACE: 1160 SET_BIT(CHAR_HT); 1161 SET_BIT(CHAR_SPACE); 1162 1163 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1164 the bits for 0xA0 and for code units >= 255, independently of UTF. */ 1165 1166 #if PCRE2_CODE_UNIT_WIDTH != 8 1167 SET_BIT(0xA0); 1168 SET_BIT(0xFF); 1169 #else 1170 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1171 units of horizontal space characters. */ 1172 1173 #ifdef SUPPORT_UNICODE 1174 if (utf) 1175 { 1176 SET_BIT(0xC2); /* For U+00A0 */ 1177 SET_BIT(0xE1); /* For U+1680, U+180E */ 1178 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1179 SET_BIT(0xE3); /* For U+3000 */ 1180 } 1181 else 1182 #endif 1183 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless 1184 the code is EBCDIC. */ 1185 { 1186 #ifndef EBCDIC 1187 SET_BIT(0xA0); 1188 #endif /* Not EBCDIC */ 1189 } 1190 #endif /* 8-bit support */ 1191 1192 try_next = FALSE; 1193 break; 1194 1195 case OP_ANYNL: 1196 case OP_VSPACE: 1197 SET_BIT(CHAR_LF); 1198 SET_BIT(CHAR_VT); 1199 SET_BIT(CHAR_FF); 1200 SET_BIT(CHAR_CR); 1201 1202 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1203 the bits for NEL and for code units >= 255, independently of UTF. */ 1204 1205 #if PCRE2_CODE_UNIT_WIDTH != 8 1206 SET_BIT(CHAR_NEL); 1207 SET_BIT(0xFF); 1208 #else 1209 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1210 units of vertical space characters. */ 1211 1212 #ifdef SUPPORT_UNICODE 1213 if (utf) 1214 { 1215 SET_BIT(0xC2); /* For U+0085 (NEL) */ 1216 SET_BIT(0xE2); /* For U+2028, U+2029 */ 1217 } 1218 else 1219 #endif 1220 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ 1221 { 1222 SET_BIT(CHAR_NEL); 1223 } 1224 #endif /* 8-bit support */ 1225 1226 try_next = FALSE; 1227 break; 1228 1229 /* Single character types set the bits and stop. Note that if PCRE2_UCP 1230 is set, we do not see these op codes because \d etc are converted to 1231 properties. Therefore, these apply in the case when only characters less 1232 than 256 are recognized to match the types. */ 1233 1234 case OP_NOT_DIGIT: 1235 set_nottype_bits(re, cbit_digit, table_limit); 1236 try_next = FALSE; 1237 break; 1238 1239 case OP_DIGIT: 1240 set_type_bits(re, cbit_digit, table_limit); 1241 try_next = FALSE; 1242 break; 1243 1244 case OP_NOT_WHITESPACE: 1245 set_nottype_bits(re, cbit_space, table_limit); 1246 try_next = FALSE; 1247 break; 1248 1249 case OP_WHITESPACE: 1250 set_type_bits(re, cbit_space, table_limit); 1251 try_next = FALSE; 1252 break; 1253 1254 case OP_NOT_WORDCHAR: 1255 set_nottype_bits(re, cbit_word, table_limit); 1256 try_next = FALSE; 1257 break; 1258 1259 case OP_WORDCHAR: 1260 set_type_bits(re, cbit_word, table_limit); 1261 try_next = FALSE; 1262 break; 1263 1264 /* One or more character type fudges the pointer and restarts, knowing 1265 it will hit a single character type and stop there. */ 1266 1267 case OP_TYPEPLUS: 1268 case OP_TYPEMINPLUS: 1269 case OP_TYPEPOSPLUS: 1270 tcode++; 1271 break; 1272 1273 case OP_TYPEEXACT: 1274 tcode += 1 + IMM2_SIZE; 1275 break; 1276 1277 /* Zero or more repeats of character types set the bits and then 1278 try again. */ 1279 1280 case OP_TYPEUPTO: 1281 case OP_TYPEMINUPTO: 1282 case OP_TYPEPOSUPTO: 1283 tcode += IMM2_SIZE; /* Fall through */ 1284 1285 case OP_TYPESTAR: 1286 case OP_TYPEMINSTAR: 1287 case OP_TYPEPOSSTAR: 1288 case OP_TYPEQUERY: 1289 case OP_TYPEMINQUERY: 1290 case OP_TYPEPOSQUERY: 1291 switch(tcode[1]) 1292 { 1293 default: 1294 case OP_ANY: 1295 case OP_ALLANY: 1296 return SSB_FAIL; 1297 1298 case OP_HSPACE: 1299 SET_BIT(CHAR_HT); 1300 SET_BIT(CHAR_SPACE); 1301 1302 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1303 the bits for 0xA0 and for code units >= 255, independently of UTF. */ 1304 1305 #if PCRE2_CODE_UNIT_WIDTH != 8 1306 SET_BIT(0xA0); 1307 SET_BIT(0xFF); 1308 #else 1309 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1310 units of horizontal space characters. */ 1311 1312 #ifdef SUPPORT_UNICODE 1313 if (utf) 1314 { 1315 SET_BIT(0xC2); /* For U+00A0 */ 1316 SET_BIT(0xE1); /* For U+1680, U+180E */ 1317 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1318 SET_BIT(0xE3); /* For U+3000 */ 1319 } 1320 else 1321 #endif 1322 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless 1323 the code is EBCDIC. */ 1324 { 1325 #ifndef EBCDIC 1326 SET_BIT(0xA0); 1327 #endif /* Not EBCDIC */ 1328 } 1329 #endif /* 8-bit support */ 1330 break; 1331 1332 case OP_ANYNL: 1333 case OP_VSPACE: 1334 SET_BIT(CHAR_LF); 1335 SET_BIT(CHAR_VT); 1336 SET_BIT(CHAR_FF); 1337 SET_BIT(CHAR_CR); 1338 1339 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1340 the bits for NEL and for code units >= 255, independently of UTF. */ 1341 1342 #if PCRE2_CODE_UNIT_WIDTH != 8 1343 SET_BIT(CHAR_NEL); 1344 SET_BIT(0xFF); 1345 #else 1346 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1347 units of vertical space characters. */ 1348 1349 #ifdef SUPPORT_UNICODE 1350 if (utf) 1351 { 1352 SET_BIT(0xC2); /* For U+0085 (NEL) */ 1353 SET_BIT(0xE2); /* For U+2028, U+2029 */ 1354 } 1355 else 1356 #endif 1357 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ 1358 { 1359 SET_BIT(CHAR_NEL); 1360 } 1361 #endif /* 8-bit support */ 1362 break; 1363 1364 case OP_NOT_DIGIT: 1365 set_nottype_bits(re, cbit_digit, table_limit); 1366 break; 1367 1368 case OP_DIGIT: 1369 set_type_bits(re, cbit_digit, table_limit); 1370 break; 1371 1372 case OP_NOT_WHITESPACE: 1373 set_nottype_bits(re, cbit_space, table_limit); 1374 break; 1375 1376 case OP_WHITESPACE: 1377 set_type_bits(re, cbit_space, table_limit); 1378 break; 1379 1380 case OP_NOT_WORDCHAR: 1381 set_nottype_bits(re, cbit_word, table_limit); 1382 break; 1383 1384 case OP_WORDCHAR: 1385 set_type_bits(re, cbit_word, table_limit); 1386 break; 1387 } 1388 1389 tcode += 2; 1390 break; 1391 1392 /* Extended class: if there are any property checks, or if this is a 1393 negative XCLASS without a map, give up. If there are no property checks, 1394 there must be wide characters on the XCLASS list, because otherwise an 1395 XCLASS would not have been created. This means that code points >= 255 1396 are always potential starters. */ 1397 1398 #ifdef SUPPORT_WIDE_CHARS 1399 case OP_XCLASS: 1400 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 || 1401 (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT) 1402 return SSB_FAIL; 1403 1404 /* We have a positive XCLASS or a negative one without a map. Set up the 1405 map pointer if there is one, and fall through. */ 1406 1407 classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL : 1408 (uint8_t *)(tcode + 1 + LINK_SIZE + 1); 1409 #endif 1410 1411 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are 1412 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter 1413 because it starts a character with a value > 255. In 8-bit non-UTF mode, 1414 there is no difference between CLASS and NCLASS. In all other wide 1415 character modes, set the 0xFF bit to indicate code units >= 255. */ 1416 1417 case OP_NCLASS: 1418 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 1419 if (utf) 1420 { 1421 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ 1422 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ 1423 } 1424 #elif PCRE2_CODE_UNIT_WIDTH != 8 1425 SET_BIT(0xFF); /* For characters >= 255 */ 1426 #endif 1427 /* Fall through */ 1428 1429 /* Enter here for a positive non-XCLASS. If we have fallen through from 1430 an XCLASS, classmap will already be set; just advance the code pointer. 1431 Otherwise, set up classmap for a a non-XCLASS and advance past it. */ 1432 1433 case OP_CLASS: 1434 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else 1435 { 1436 classmap = (uint8_t *)(++tcode); 1437 tcode += 32 / sizeof(PCRE2_UCHAR); 1438 } 1439 1440 /* When wide characters are supported, classmap may be NULL. In UTF-8 1441 (sic) mode, the bits in a class bit map correspond to character values, 1442 not to byte values. However, the bit map we are constructing is for byte 1443 values. So we have to do a conversion for characters whose code point is 1444 greater than 127. In fact, there are only two possible starting bytes for 1445 characters in the range 128 - 255. */ 1446 1447 if (classmap != NULL) 1448 { 1449 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 1450 if (utf) 1451 { 1452 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c]; 1453 for (c = 128; c < 256; c++) 1454 { 1455 if ((classmap[c/8] & (1 << (c&7))) != 0) 1456 { 1457 int d = (c >> 6) | 0xc0; /* Set bit for this starter */ 1458 re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */ 1459 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ 1460 } 1461 } 1462 } 1463 else 1464 #endif 1465 /* In all modes except UTF-8, the two bit maps are compatible. */ 1466 1467 { 1468 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c]; 1469 } 1470 } 1471 1472 /* Act on what follows the class. For a zero minimum repeat, continue; 1473 otherwise stop processing. */ 1474 1475 switch (*tcode) 1476 { 1477 case OP_CRSTAR: 1478 case OP_CRMINSTAR: 1479 case OP_CRQUERY: 1480 case OP_CRMINQUERY: 1481 case OP_CRPOSSTAR: 1482 case OP_CRPOSQUERY: 1483 tcode++; 1484 break; 1485 1486 case OP_CRRANGE: 1487 case OP_CRMINRANGE: 1488 case OP_CRPOSRANGE: 1489 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; 1490 else try_next = FALSE; 1491 break; 1492 1493 default: 1494 try_next = FALSE; 1495 break; 1496 } 1497 break; /* End of class handling case */ 1498 } /* End of switch for opcodes */ 1499 } /* End of try_next loop */ 1500 1501 code += GET(code, 1); /* Advance to next branch */ 1502 } 1503 while (*code == OP_ALT); 1504 1505 return yield; 1506 } 1507 1508 1509 1510 /************************************************* 1511 * Study a compiled expression * 1512 *************************************************/ 1513 1514 /* This function is handed a compiled expression that it must study to produce 1515 information that will speed up the matching. 1516 1517 Argument: points to the compiled expression 1518 Returns: 0 normally; non-zero should never normally occur 1519 1 unknown opcode in set_start_bits 1520 2 missing capturing bracket 1521 3 unknown opcode in find_minlength 1522 */ 1523 1524 int 1525 PRIV(study)(pcre2_real_code *re) 1526 { 1527 int min; 1528 int count = 0; 1529 PCRE2_UCHAR *code; 1530 BOOL utf = (re->overall_options & PCRE2_UTF) != 0; 1531 1532 /* Find start of compiled code */ 1533 1534 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + 1535 re->name_entry_size * re->name_count; 1536 1537 /* For an anchored pattern, or an unanchored pattern that has a first code 1538 unit, or a multiline pattern that matches only at "line start", there is no 1539 point in seeking a list of starting code units. */ 1540 1541 if ((re->overall_options & PCRE2_ANCHORED) == 0 && 1542 (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) 1543 { 1544 int rc = set_start_bits(re, code, utf); 1545 if (rc == SSB_UNKNOWN) return 1; 1546 if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET; 1547 } 1548 1549 /* Find the minimum length of subject string. If it can match an empty string, 1550 the minimum length is already known. */ 1551 1552 if ((re->flags & PCRE2_MATCH_EMPTY) == 0) 1553 { 1554 switch(min = find_minlength(re, code, code, utf, NULL, &count)) 1555 { 1556 case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */ 1557 break; /* Leave minlength unchanged (will be zero) */ 1558 1559 case -2: 1560 return 2; /* missing capturing bracket */ 1561 1562 case -3: 1563 return 3; /* unrecognized opcode */ 1564 1565 default: 1566 if (min > UINT16_MAX) min = UINT16_MAX; 1567 re->minlength = min; 1568 break; 1569 } 1570 } 1571 1572 return 0; 1573 } 1574 1575 /* End of pcre2_study.c */ 1576