1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016-2018 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 /* This module contains functions for scanning a compiled pattern and 42 collecting data (e.g. minimum matching length). */ 43 44 45 #ifdef HAVE_CONFIG_H 46 #include "config.h" 47 #endif 48 49 #include "pcre2_internal.h" 50 51 /* The maximum remembered capturing brackets minimum. */ 52 53 #define MAX_CACHE_BACKREF 128 54 55 /* Set a bit in the starting code unit bit map. */ 56 57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7)) 58 59 /* Returns from set_start_bits() */ 60 61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; 62 63 64 /************************************************* 65 * Find the minimum subject length for a group * 66 *************************************************/ 67 68 /* Scan a parenthesized group and compute the minimum length of subject that 69 is needed to match it. This is a lower bound; it does not mean there is a 70 string of that length that matches. In UTF mode, the result is in characters 71 rather than code units. The field in a compiled pattern for storing the minimum 72 length is 16-bits long (on the grounds that anything longer than that is 73 pathological), so we give up when we reach that amount. This also means that 74 integer overflow for really crazy patterns cannot happen. 75 76 Backreference minimum lengths are cached to speed up multiple references. This 77 function is called only when the highest back reference in the pattern is less 78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the 79 caching vector. The zeroth element contains the number of the highest set 80 value. 81 82 Arguments: 83 re compiled pattern block 84 code pointer to start of group (the bracket) 85 startcode pointer to start of the whole pattern's code 86 utf UTF flag 87 recurses chain of recurse_check to catch mutual recursion 88 countptr pointer to call count (to catch over complexity) 89 backref_cache vector for caching back references. 90 91 Returns: the minimum length 92 -1 \C in UTF-8 mode 93 or (*ACCEPT) 94 or pattern too complicated 95 or back reference to duplicate name/number 96 -2 internal error (missing capturing bracket) 97 -3 internal error (opcode not listed) 98 */ 99 100 static int 101 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, 102 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr, 103 int *backref_cache) 104 { 105 int length = -1; 106 int prev_cap_recno = -1; 107 int prev_cap_d = 0; 108 int prev_recurse_recno = -1; 109 int prev_recurse_d = 0; 110 uint32_t once_fudge = 0; 111 BOOL had_recurse = FALSE; 112 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0; 113 recurse_check this_recurse; 114 int branchlength = 0; 115 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; 116 117 /* If this is a "could be empty" group, its minimum length is 0. */ 118 119 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0; 120 121 /* Skip over capturing bracket number */ 122 123 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE; 124 125 /* A large and/or complex regex can take too long to process. */ 126 127 if ((*countptr)++ > 1000) return -1; 128 129 /* Scan along the opcodes for this branch. If we get to the end of the branch, 130 check the length against that of the other branches. If the accumulated length 131 passes 16-bits, stop. */ 132 133 for (;;) 134 { 135 int d, min, recno; 136 PCRE2_UCHAR *cs, *ce; 137 PCRE2_UCHAR op = *cc; 138 139 if (branchlength >= UINT16_MAX) return UINT16_MAX; 140 141 switch (op) 142 { 143 case OP_COND: 144 case OP_SCOND: 145 146 /* If there is only one branch in a condition, the implied branch has zero 147 length, so we don't add anything. This covers the DEFINE "condition" 148 automatically. If there are two branches we can treat it the same as any 149 other non-capturing subpattern. */ 150 151 cs = cc + GET(cc, 1); 152 if (*cs != OP_ALT) 153 { 154 cc = cs + 1 + LINK_SIZE; 155 break; 156 } 157 goto PROCESS_NON_CAPTURE; 158 159 case OP_BRA: 160 /* There's a special case of OP_BRA, when it is wrapped round a repeated 161 OP_RECURSE. We'd like to process the latter at this level so that 162 remembering the value works for repeated cases. So we do nothing, but 163 set a fudge value to skip over the OP_KET after the recurse. */ 164 165 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) 166 { 167 once_fudge = 1 + LINK_SIZE; 168 cc += 1 + LINK_SIZE; 169 break; 170 } 171 /* Fall through */ 172 173 case OP_ONCE: 174 case OP_SBRA: 175 case OP_BRAPOS: 176 case OP_SBRAPOS: 177 PROCESS_NON_CAPTURE: 178 d = find_minlength(re, cc, startcode, utf, recurses, countptr, 179 backref_cache); 180 if (d < 0) return d; 181 branchlength += d; 182 do cc += GET(cc, 1); while (*cc == OP_ALT); 183 cc += 1 + LINK_SIZE; 184 break; 185 186 /* To save time for repeated capturing subpatterns, we remember the 187 length of the previous one. Unfortunately we can't do the same for 188 the unnumbered ones above. Nor can we do this if (?| is present in the 189 pattern because captures with the same number are not then identical. */ 190 191 case OP_CBRA: 192 case OP_SCBRA: 193 case OP_CBRAPOS: 194 case OP_SCBRAPOS: 195 recno = (int)GET2(cc, 1+LINK_SIZE); 196 if (dupcapused || recno != prev_cap_recno) 197 { 198 prev_cap_recno = recno; 199 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr, 200 backref_cache); 201 if (prev_cap_d < 0) return prev_cap_d; 202 } 203 branchlength += prev_cap_d; 204 do cc += GET(cc, 1); while (*cc == OP_ALT); 205 cc += 1 + LINK_SIZE; 206 break; 207 208 /* ACCEPT makes things far too complicated; we have to give up. */ 209 210 case OP_ACCEPT: 211 case OP_ASSERT_ACCEPT: 212 return -1; 213 214 /* Reached end of a branch; if it's a ket it is the end of a nested 215 call. If it's ALT it is an alternation in a nested call. If it is END it's 216 the end of the outer call. All can be handled by the same code. If an 217 ACCEPT was previously encountered, use the length that was in force at that 218 time, and pass back the shortest ACCEPT length. */ 219 220 case OP_ALT: 221 case OP_KET: 222 case OP_KETRMAX: 223 case OP_KETRMIN: 224 case OP_KETRPOS: 225 case OP_END: 226 if (length < 0 || (!had_recurse && branchlength < length)) 227 length = branchlength; 228 if (op != OP_ALT) return length; 229 cc += 1 + LINK_SIZE; 230 branchlength = 0; 231 had_recurse = FALSE; 232 break; 233 234 /* Skip over assertive subpatterns */ 235 236 case OP_ASSERT: 237 case OP_ASSERT_NOT: 238 case OP_ASSERTBACK: 239 case OP_ASSERTBACK_NOT: 240 do cc += GET(cc, 1); while (*cc == OP_ALT); 241 /* Fall through */ 242 243 /* Skip over things that don't match chars */ 244 245 case OP_REVERSE: 246 case OP_CREF: 247 case OP_DNCREF: 248 case OP_RREF: 249 case OP_DNRREF: 250 case OP_FALSE: 251 case OP_TRUE: 252 case OP_CALLOUT: 253 case OP_SOD: 254 case OP_SOM: 255 case OP_EOD: 256 case OP_EODN: 257 case OP_CIRC: 258 case OP_CIRCM: 259 case OP_DOLL: 260 case OP_DOLLM: 261 case OP_NOT_WORD_BOUNDARY: 262 case OP_WORD_BOUNDARY: 263 cc += PRIV(OP_lengths)[*cc]; 264 break; 265 266 case OP_CALLOUT_STR: 267 cc += GET(cc, 1 + 2*LINK_SIZE); 268 break; 269 270 /* Skip over a subpattern that has a {0} or {0,x} quantifier */ 271 272 case OP_BRAZERO: 273 case OP_BRAMINZERO: 274 case OP_BRAPOSZERO: 275 case OP_SKIPZERO: 276 cc += PRIV(OP_lengths)[*cc]; 277 do cc += GET(cc, 1); while (*cc == OP_ALT); 278 cc += 1 + LINK_SIZE; 279 break; 280 281 /* Handle literal characters and + repetitions */ 282 283 case OP_CHAR: 284 case OP_CHARI: 285 case OP_NOT: 286 case OP_NOTI: 287 case OP_PLUS: 288 case OP_PLUSI: 289 case OP_MINPLUS: 290 case OP_MINPLUSI: 291 case OP_POSPLUS: 292 case OP_POSPLUSI: 293 case OP_NOTPLUS: 294 case OP_NOTPLUSI: 295 case OP_NOTMINPLUS: 296 case OP_NOTMINPLUSI: 297 case OP_NOTPOSPLUS: 298 case OP_NOTPOSPLUSI: 299 branchlength++; 300 cc += 2; 301 #ifdef SUPPORT_UNICODE 302 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 303 #endif 304 break; 305 306 case OP_TYPEPLUS: 307 case OP_TYPEMINPLUS: 308 case OP_TYPEPOSPLUS: 309 branchlength++; 310 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; 311 break; 312 313 /* Handle exact repetitions. The count is already in characters, but we 314 may need to skip over a multibyte character in UTF mode. */ 315 316 case OP_EXACT: 317 case OP_EXACTI: 318 case OP_NOTEXACT: 319 case OP_NOTEXACTI: 320 branchlength += GET2(cc,1); 321 cc += 2 + IMM2_SIZE; 322 #ifdef SUPPORT_UNICODE 323 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 324 #endif 325 break; 326 327 case OP_TYPEEXACT: 328 branchlength += GET2(cc,1); 329 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP 330 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 331 break; 332 333 /* Handle single-char non-literal matchers */ 334 335 case OP_PROP: 336 case OP_NOTPROP: 337 cc += 2; 338 /* Fall through */ 339 340 case OP_NOT_DIGIT: 341 case OP_DIGIT: 342 case OP_NOT_WHITESPACE: 343 case OP_WHITESPACE: 344 case OP_NOT_WORDCHAR: 345 case OP_WORDCHAR: 346 case OP_ANY: 347 case OP_ALLANY: 348 case OP_EXTUNI: 349 case OP_HSPACE: 350 case OP_NOT_HSPACE: 351 case OP_VSPACE: 352 case OP_NOT_VSPACE: 353 branchlength++; 354 cc++; 355 break; 356 357 /* "Any newline" might match two characters, but it also might match just 358 one. */ 359 360 case OP_ANYNL: 361 branchlength += 1; 362 cc++; 363 break; 364 365 /* The single-byte matcher means we can't proceed in UTF mode. (In 366 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever 367 appear, but leave the code, just in case.) */ 368 369 case OP_ANYBYTE: 370 #ifdef SUPPORT_UNICODE 371 if (utf) return -1; 372 #endif 373 branchlength++; 374 cc++; 375 break; 376 377 /* For repeated character types, we have to test for \p and \P, which have 378 an extra two bytes of parameters. */ 379 380 case OP_TYPESTAR: 381 case OP_TYPEMINSTAR: 382 case OP_TYPEQUERY: 383 case OP_TYPEMINQUERY: 384 case OP_TYPEPOSSTAR: 385 case OP_TYPEPOSQUERY: 386 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; 387 cc += PRIV(OP_lengths)[op]; 388 break; 389 390 case OP_TYPEUPTO: 391 case OP_TYPEMINUPTO: 392 case OP_TYPEPOSUPTO: 393 if (cc[1 + IMM2_SIZE] == OP_PROP 394 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; 395 cc += PRIV(OP_lengths)[op]; 396 break; 397 398 /* Check a class for variable quantification */ 399 400 case OP_CLASS: 401 case OP_NCLASS: 402 #ifdef SUPPORT_WIDE_CHARS 403 case OP_XCLASS: 404 /* The original code caused an unsigned overflow in 64 bit systems, 405 so now we use a conditional statement. */ 406 if (op == OP_XCLASS) 407 cc += GET(cc, 1); 408 else 409 cc += PRIV(OP_lengths)[OP_CLASS]; 410 #else 411 cc += PRIV(OP_lengths)[OP_CLASS]; 412 #endif 413 414 switch (*cc) 415 { 416 case OP_CRPLUS: 417 case OP_CRMINPLUS: 418 case OP_CRPOSPLUS: 419 branchlength++; 420 /* Fall through */ 421 422 case OP_CRSTAR: 423 case OP_CRMINSTAR: 424 case OP_CRQUERY: 425 case OP_CRMINQUERY: 426 case OP_CRPOSSTAR: 427 case OP_CRPOSQUERY: 428 cc++; 429 break; 430 431 case OP_CRRANGE: 432 case OP_CRMINRANGE: 433 case OP_CRPOSRANGE: 434 branchlength += GET2(cc,1); 435 cc += 1 + 2 * IMM2_SIZE; 436 break; 437 438 default: 439 branchlength++; 440 break; 441 } 442 break; 443 444 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same 445 way: we find the minimum length for the subpattern. A recursion 446 (backreference or subroutine) causes an a flag to be set that causes the 447 length of this branch to be ignored. The logic is that a recursion can only 448 make sense if there is another alternative that stops the recursing. That 449 will provide the minimum length (when no recursion happens). 450 451 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket 452 matches an empty string (by default it causes a matching failure), so in 453 that case we must set the minimum length to zero. */ 454 455 /* Duplicate named pattern back reference. We cannot reliably find a length 456 for this if duplicate numbers are present in the pattern. */ 457 458 case OP_DNREF: 459 case OP_DNREFI: 460 if (dupcapused) return -1; 461 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) 462 { 463 int count = GET2(cc, 1+IMM2_SIZE); 464 PCRE2_UCHAR *slot = 465 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + 466 GET2(cc, 1) * re->name_entry_size; 467 468 d = INT_MAX; 469 470 /* Scan all groups with the same name; find the shortest. */ 471 472 while (count-- > 0) 473 { 474 int dd, i; 475 recno = GET2(slot, 0); 476 477 if (recno <= backref_cache[0] && backref_cache[recno] >= 0) 478 dd = backref_cache[recno]; 479 else 480 { 481 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); 482 if (cs == NULL) return -2; 483 do ce += GET(ce, 1); while (*ce == OP_ALT); 484 if (cc > cs && cc < ce) /* Simple recursion */ 485 { 486 dd = 0; 487 had_recurse = TRUE; 488 } 489 else 490 { 491 recurse_check *r = recurses; 492 for (r = recurses; r != NULL; r = r->prev) 493 if (r->group == cs) break; 494 if (r != NULL) /* Mutual recursion */ 495 { 496 dd = 0; 497 had_recurse = TRUE; 498 } 499 else 500 { 501 this_recurse.prev = recurses; 502 this_recurse.group = cs; 503 dd = find_minlength(re, cs, startcode, utf, &this_recurse, 504 countptr, backref_cache); 505 if (dd < 0) return dd; 506 } 507 } 508 509 backref_cache[recno] = dd; 510 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; 511 backref_cache[0] = recno; 512 } 513 514 if (dd < d) d = dd; 515 if (d <= 0) break; /* No point looking at any more */ 516 slot += re->name_entry_size; 517 } 518 } 519 else d = 0; 520 cc += 1 + 2*IMM2_SIZE; 521 goto REPEAT_BACK_REFERENCE; 522 523 /* Single back reference. We cannot find a length for this if duplicate 524 numbers are present in the pattern. */ 525 526 case OP_REF: 527 case OP_REFI: 528 if (dupcapused) return -1; 529 recno = GET2(cc, 1); 530 if (recno <= backref_cache[0] && backref_cache[recno] >= 0) 531 d = backref_cache[recno]; 532 else 533 { 534 int i; 535 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) 536 { 537 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); 538 if (cs == NULL) return -2; 539 do ce += GET(ce, 1); while (*ce == OP_ALT); 540 if (cc > cs && cc < ce) /* Simple recursion */ 541 { 542 d = 0; 543 had_recurse = TRUE; 544 } 545 else 546 { 547 recurse_check *r = recurses; 548 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 549 if (r != NULL) /* Mutual recursion */ 550 { 551 d = 0; 552 had_recurse = TRUE; 553 } 554 else 555 { 556 this_recurse.prev = recurses; 557 this_recurse.group = cs; 558 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, 559 backref_cache); 560 if (d < 0) return d; 561 } 562 } 563 } 564 else d = 0; 565 566 backref_cache[recno] = d; 567 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; 568 backref_cache[0] = recno; 569 } 570 571 cc += 1 + IMM2_SIZE; 572 573 /* Handle repeated back references */ 574 575 REPEAT_BACK_REFERENCE: 576 switch (*cc) 577 { 578 case OP_CRSTAR: 579 case OP_CRMINSTAR: 580 case OP_CRQUERY: 581 case OP_CRMINQUERY: 582 case OP_CRPOSSTAR: 583 case OP_CRPOSQUERY: 584 min = 0; 585 cc++; 586 break; 587 588 case OP_CRPLUS: 589 case OP_CRMINPLUS: 590 case OP_CRPOSPLUS: 591 min = 1; 592 cc++; 593 break; 594 595 case OP_CRRANGE: 596 case OP_CRMINRANGE: 597 case OP_CRPOSRANGE: 598 min = GET2(cc, 1); 599 cc += 1 + 2 * IMM2_SIZE; 600 break; 601 602 default: 603 min = 1; 604 break; 605 } 606 607 /* Take care not to overflow: (1) min and d are ints, so check that their 608 product is not greater than INT_MAX. (2) branchlength is limited to 609 UINT16_MAX (checked at the top of the loop). */ 610 611 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d) 612 branchlength = UINT16_MAX; 613 else branchlength += min * d; 614 break; 615 616 /* Recursion always refers to the first occurrence of a subpattern with a 617 given number. Therefore, we can always make use of caching, even when the 618 pattern contains multiple subpatterns with the same number. */ 619 620 case OP_RECURSE: 621 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); 622 recno = GET2(cs, 1+LINK_SIZE); 623 if (recno == prev_recurse_recno) 624 { 625 branchlength += prev_recurse_d; 626 } 627 else 628 { 629 do ce += GET(ce, 1); while (*ce == OP_ALT); 630 if (cc > cs && cc < ce) /* Simple recursion */ 631 had_recurse = TRUE; 632 else 633 { 634 recurse_check *r = recurses; 635 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 636 if (r != NULL) /* Mutual recursion */ 637 had_recurse = TRUE; 638 else 639 { 640 this_recurse.prev = recurses; 641 this_recurse.group = cs; 642 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse, 643 countptr, backref_cache); 644 if (prev_recurse_d < 0) return prev_recurse_d; 645 prev_recurse_recno = recno; 646 branchlength += prev_recurse_d; 647 } 648 } 649 } 650 cc += 1 + LINK_SIZE + once_fudge; 651 once_fudge = 0; 652 break; 653 654 /* Anything else does not or need not match a character. We can get the 655 item's length from the table, but for those that can match zero occurrences 656 of a character, we must take special action for UTF-8 characters. As it 657 happens, the "NOT" versions of these opcodes are used at present only for 658 ASCII characters, so they could be omitted from this list. However, in 659 future that may change, so we include them here so as not to leave a 660 gotcha for a future maintainer. */ 661 662 case OP_UPTO: 663 case OP_UPTOI: 664 case OP_NOTUPTO: 665 case OP_NOTUPTOI: 666 case OP_MINUPTO: 667 case OP_MINUPTOI: 668 case OP_NOTMINUPTO: 669 case OP_NOTMINUPTOI: 670 case OP_POSUPTO: 671 case OP_POSUPTOI: 672 case OP_NOTPOSUPTO: 673 case OP_NOTPOSUPTOI: 674 675 case OP_STAR: 676 case OP_STARI: 677 case OP_NOTSTAR: 678 case OP_NOTSTARI: 679 case OP_MINSTAR: 680 case OP_MINSTARI: 681 case OP_NOTMINSTAR: 682 case OP_NOTMINSTARI: 683 case OP_POSSTAR: 684 case OP_POSSTARI: 685 case OP_NOTPOSSTAR: 686 case OP_NOTPOSSTARI: 687 688 case OP_QUERY: 689 case OP_QUERYI: 690 case OP_NOTQUERY: 691 case OP_NOTQUERYI: 692 case OP_MINQUERY: 693 case OP_MINQUERYI: 694 case OP_NOTMINQUERY: 695 case OP_NOTMINQUERYI: 696 case OP_POSQUERY: 697 case OP_POSQUERYI: 698 case OP_NOTPOSQUERY: 699 case OP_NOTPOSQUERYI: 700 701 cc += PRIV(OP_lengths)[op]; 702 #ifdef SUPPORT_UNICODE 703 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 704 #endif 705 break; 706 707 /* Skip these, but we need to add in the name length. */ 708 709 case OP_MARK: 710 case OP_COMMIT_ARG: 711 case OP_PRUNE_ARG: 712 case OP_SKIP_ARG: 713 case OP_THEN_ARG: 714 cc += PRIV(OP_lengths)[op] + cc[1]; 715 break; 716 717 /* The remaining opcodes are just skipped over. */ 718 719 case OP_CLOSE: 720 case OP_COMMIT: 721 case OP_FAIL: 722 case OP_PRUNE: 723 case OP_SET_SOM: 724 case OP_SKIP: 725 case OP_THEN: 726 cc += PRIV(OP_lengths)[op]; 727 break; 728 729 /* This should not occur: we list all opcodes explicitly so that when 730 new ones get added they are properly considered. */ 731 732 default: 733 return -3; 734 } 735 } 736 /* Control never gets here */ 737 } 738 739 740 741 /************************************************* 742 * Set a bit and maybe its alternate case * 743 *************************************************/ 744 745 /* Given a character, set its first code unit's bit in the table, and also the 746 corresponding bit for the other version of a letter if we are caseless. 747 748 Arguments: 749 re points to the regex block 750 p points to the first code unit of the character 751 caseless TRUE if caseless 752 utf TRUE for UTF mode 753 754 Returns: pointer after the character 755 */ 756 757 static PCRE2_SPTR 758 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf) 759 { 760 uint32_t c = *p++; /* First code unit */ 761 (void)utf; /* Stop compiler warning when UTF not supported */ 762 763 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for 764 0xff. */ 765 766 #if PCRE2_CODE_UNIT_WIDTH != 8 767 if (c > 0xff) SET_BIT(0xff); else 768 #endif 769 770 SET_BIT(c); 771 772 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find 773 the end of the character, even when caseless. */ 774 775 #ifdef SUPPORT_UNICODE 776 if (utf) 777 { 778 #if PCRE2_CODE_UNIT_WIDTH == 8 779 if (c >= 0xc0) GETUTF8INC(c, p); 780 #elif PCRE2_CODE_UNIT_WIDTH == 16 781 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p); 782 #endif 783 } 784 #endif /* SUPPORT_UNICODE */ 785 786 /* If caseless, handle the other case of the character. */ 787 788 if (caseless) 789 { 790 #ifdef SUPPORT_UNICODE 791 if (utf) 792 { 793 #if PCRE2_CODE_UNIT_WIDTH == 8 794 PCRE2_UCHAR buff[6]; 795 c = UCD_OTHERCASE(c); 796 (void)PRIV(ord2utf)(c, buff); 797 SET_BIT(buff[0]); 798 #else /* 16-bit or 32-bit mode */ 799 c = UCD_OTHERCASE(c); 800 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); 801 #endif 802 } 803 else 804 #endif /* SUPPORT_UNICODE */ 805 806 /* Not UTF */ 807 808 if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]); 809 } 810 811 return p; 812 } 813 814 815 816 /************************************************* 817 * Set bits for a positive character type * 818 *************************************************/ 819 820 /* This function sets starting bits for a character type. In UTF-8 mode, we can 821 only do a direct setting for bytes less than 128, as otherwise there can be 822 confusion with bytes in the middle of UTF-8 characters. In a "traditional" 823 environment, the tables will only recognize ASCII characters anyway, but in at 824 least one Windows environment, some higher bytes bits were set in the tables. 825 So we deal with that case by considering the UTF-8 encoding. 826 827 Arguments: 828 re the regex block 829 cbit type the type of character wanted 830 table_limit 32 for non-UTF-8; 16 for UTF-8 831 832 Returns: nothing 833 */ 834 835 static void 836 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) 837 { 838 uint32_t c; 839 for (c = 0; c < table_limit; c++) 840 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type]; 841 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 842 if (table_limit == 32) return; 843 for (c = 128; c < 256; c++) 844 { 845 if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0) 846 { 847 PCRE2_UCHAR buff[6]; 848 (void)PRIV(ord2utf)(c, buff); 849 SET_BIT(buff[0]); 850 } 851 } 852 #endif /* UTF-8 */ 853 } 854 855 856 /************************************************* 857 * Set bits for a negative character type * 858 *************************************************/ 859 860 /* This function sets starting bits for a negative character type such as \D. 861 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as 862 otherwise there can be confusion with bytes in the middle of UTF-8 characters. 863 Unlike in the positive case, where we can set appropriate starting bits for 864 specific high-valued UTF-8 characters, in this case we have to set the bits for 865 all high-valued characters. The lowest is 0xc2, but we overkill by starting at 866 0xc0 (192) for simplicity. 867 868 Arguments: 869 re the regex block 870 cbit type the type of character wanted 871 table_limit 32 for non-UTF-8; 16 for UTF-8 872 873 Returns: nothing 874 */ 875 876 static void 877 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) 878 { 879 uint32_t c; 880 for (c = 0; c < table_limit; c++) 881 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]); 882 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 883 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff; 884 #endif 885 } 886 887 888 889 /************************************************* 890 * Create bitmap of starting bytes * 891 *************************************************/ 892 893 /* This function scans a compiled unanchored expression recursively and 894 attempts to build a bitmap of the set of possible starting code units whose 895 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause 896 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode 897 we pass a value of 16 rather than 32 as the final argument. (See comments in 898 those functions for the reason.) 899 900 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as 901 (a*)b where the group provides some optional starting code units but scanning 902 must continue at the outer level to find at least one mandatory code unit. At 903 the outermost level, this function fails unless the result is SSB_DONE. 904 905 Arguments: 906 re points to the compiled regex block 907 code points to an expression 908 utf TRUE if in UTF mode 909 910 Returns: SSB_FAIL => Failed to find any starting code units 911 SSB_DONE => Found mandatory starting code units 912 SSB_CONTINUE => Found optional starting code units 913 SSB_UNKNOWN => Hit an unrecognized opcode 914 */ 915 916 static int 917 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf) 918 { 919 uint32_t c; 920 int yield = SSB_DONE; 921 922 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 923 int table_limit = utf? 16:32; 924 #else 925 int table_limit = 32; 926 #endif 927 928 do 929 { 930 BOOL try_next = TRUE; 931 PCRE2_SPTR tcode = code + 1 + LINK_SIZE; 932 933 if (*code == OP_CBRA || *code == OP_SCBRA || 934 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE; 935 936 while (try_next) /* Loop for items in this branch */ 937 { 938 int rc; 939 uint8_t *classmap = NULL; 940 941 switch(*tcode) 942 { 943 /* If we reach something we don't understand, it means a new opcode has 944 been created that hasn't been added to this function. Hopefully this 945 problem will be discovered during testing. */ 946 947 default: 948 return SSB_UNKNOWN; 949 950 /* Fail for a valid opcode that implies no starting bits. */ 951 952 case OP_ACCEPT: 953 case OP_ASSERT_ACCEPT: 954 case OP_ALLANY: 955 case OP_ANY: 956 case OP_ANYBYTE: 957 case OP_CIRCM: 958 case OP_CLOSE: 959 case OP_COMMIT: 960 case OP_COMMIT_ARG: 961 case OP_COND: 962 case OP_CREF: 963 case OP_FALSE: 964 case OP_TRUE: 965 case OP_DNCREF: 966 case OP_DNREF: 967 case OP_DNREFI: 968 case OP_DNRREF: 969 case OP_DOLL: 970 case OP_DOLLM: 971 case OP_END: 972 case OP_EOD: 973 case OP_EODN: 974 case OP_EXTUNI: 975 case OP_FAIL: 976 case OP_MARK: 977 case OP_NOT: 978 case OP_NOTEXACT: 979 case OP_NOTEXACTI: 980 case OP_NOTI: 981 case OP_NOTMINPLUS: 982 case OP_NOTMINPLUSI: 983 case OP_NOTMINQUERY: 984 case OP_NOTMINQUERYI: 985 case OP_NOTMINSTAR: 986 case OP_NOTMINSTARI: 987 case OP_NOTMINUPTO: 988 case OP_NOTMINUPTOI: 989 case OP_NOTPLUS: 990 case OP_NOTPLUSI: 991 case OP_NOTPOSPLUS: 992 case OP_NOTPOSPLUSI: 993 case OP_NOTPOSQUERY: 994 case OP_NOTPOSQUERYI: 995 case OP_NOTPOSSTAR: 996 case OP_NOTPOSSTARI: 997 case OP_NOTPOSUPTO: 998 case OP_NOTPOSUPTOI: 999 case OP_NOTPROP: 1000 case OP_NOTQUERY: 1001 case OP_NOTQUERYI: 1002 case OP_NOTSTAR: 1003 case OP_NOTSTARI: 1004 case OP_NOTUPTO: 1005 case OP_NOTUPTOI: 1006 case OP_NOT_HSPACE: 1007 case OP_NOT_VSPACE: 1008 case OP_PRUNE: 1009 case OP_PRUNE_ARG: 1010 case OP_RECURSE: 1011 case OP_REF: 1012 case OP_REFI: 1013 case OP_REVERSE: 1014 case OP_RREF: 1015 case OP_SCOND: 1016 case OP_SET_SOM: 1017 case OP_SKIP: 1018 case OP_SKIP_ARG: 1019 case OP_SOD: 1020 case OP_SOM: 1021 case OP_THEN: 1022 case OP_THEN_ARG: 1023 return SSB_FAIL; 1024 1025 /* OP_CIRC happens only at the start of an anchored branch (multiline ^ 1026 uses OP_CIRCM). Skip over it. */ 1027 1028 case OP_CIRC: 1029 tcode += PRIV(OP_lengths)[OP_CIRC]; 1030 break; 1031 1032 /* A "real" property test implies no starting bits, but the fake property 1033 PT_CLIST identifies a list of characters. These lists are short, as they 1034 are used for characters with more than one "other case", so there is no 1035 point in recognizing them for OP_NOTPROP. */ 1036 1037 case OP_PROP: 1038 if (tcode[1] != PT_CLIST) return SSB_FAIL; 1039 { 1040 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2]; 1041 while ((c = *p++) < NOTACHAR) 1042 { 1043 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 1044 if (utf) 1045 { 1046 PCRE2_UCHAR buff[6]; 1047 (void)PRIV(ord2utf)(c, buff); 1048 c = buff[0]; 1049 } 1050 #endif 1051 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); 1052 } 1053 } 1054 try_next = FALSE; 1055 break; 1056 1057 /* We can ignore word boundary tests. */ 1058 1059 case OP_WORD_BOUNDARY: 1060 case OP_NOT_WORD_BOUNDARY: 1061 tcode++; 1062 break; 1063 1064 /* If we hit a bracket or a positive lookahead assertion, recurse to set 1065 bits from within the subpattern. If it can't find anything, we have to 1066 give up. If it finds some mandatory character(s), we are done for this 1067 branch. Otherwise, carry on scanning after the subpattern. */ 1068 1069 case OP_BRA: 1070 case OP_SBRA: 1071 case OP_CBRA: 1072 case OP_SCBRA: 1073 case OP_BRAPOS: 1074 case OP_SBRAPOS: 1075 case OP_CBRAPOS: 1076 case OP_SCBRAPOS: 1077 case OP_ONCE: 1078 case OP_ASSERT: 1079 rc = set_start_bits(re, tcode, utf); 1080 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 1081 if (rc == SSB_DONE) try_next = FALSE; else 1082 { 1083 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 1084 tcode += 1 + LINK_SIZE; 1085 } 1086 break; 1087 1088 /* If we hit ALT or KET, it means we haven't found anything mandatory in 1089 this branch, though we might have found something optional. For ALT, we 1090 continue with the next alternative, but we have to arrange that the final 1091 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, 1092 return SSB_CONTINUE: if this is the top level, that indicates failure, 1093 but after a nested subpattern, it causes scanning to continue. */ 1094 1095 case OP_ALT: 1096 yield = SSB_CONTINUE; 1097 try_next = FALSE; 1098 break; 1099 1100 case OP_KET: 1101 case OP_KETRMAX: 1102 case OP_KETRMIN: 1103 case OP_KETRPOS: 1104 return SSB_CONTINUE; 1105 1106 /* Skip over callout */ 1107 1108 case OP_CALLOUT: 1109 tcode += PRIV(OP_lengths)[OP_CALLOUT]; 1110 break; 1111 1112 case OP_CALLOUT_STR: 1113 tcode += GET(tcode, 1 + 2*LINK_SIZE); 1114 break; 1115 1116 /* Skip over lookbehind and negative lookahead assertions */ 1117 1118 case OP_ASSERT_NOT: 1119 case OP_ASSERTBACK: 1120 case OP_ASSERTBACK_NOT: 1121 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 1122 tcode += 1 + LINK_SIZE; 1123 break; 1124 1125 /* BRAZERO does the bracket, but carries on. */ 1126 1127 case OP_BRAZERO: 1128 case OP_BRAMINZERO: 1129 case OP_BRAPOSZERO: 1130 rc = set_start_bits(re, ++tcode, utf); 1131 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; 1132 do tcode += GET(tcode,1); while (*tcode == OP_ALT); 1133 tcode += 1 + LINK_SIZE; 1134 break; 1135 1136 /* SKIPZERO skips the bracket. */ 1137 1138 case OP_SKIPZERO: 1139 tcode++; 1140 do tcode += GET(tcode,1); while (*tcode == OP_ALT); 1141 tcode += 1 + LINK_SIZE; 1142 break; 1143 1144 /* Single-char * or ? sets the bit and tries the next item */ 1145 1146 case OP_STAR: 1147 case OP_MINSTAR: 1148 case OP_POSSTAR: 1149 case OP_QUERY: 1150 case OP_MINQUERY: 1151 case OP_POSQUERY: 1152 tcode = set_table_bit(re, tcode + 1, FALSE, utf); 1153 break; 1154 1155 case OP_STARI: 1156 case OP_MINSTARI: 1157 case OP_POSSTARI: 1158 case OP_QUERYI: 1159 case OP_MINQUERYI: 1160 case OP_POSQUERYI: 1161 tcode = set_table_bit(re, tcode + 1, TRUE, utf); 1162 break; 1163 1164 /* Single-char upto sets the bit and tries the next */ 1165 1166 case OP_UPTO: 1167 case OP_MINUPTO: 1168 case OP_POSUPTO: 1169 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf); 1170 break; 1171 1172 case OP_UPTOI: 1173 case OP_MINUPTOI: 1174 case OP_POSUPTOI: 1175 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf); 1176 break; 1177 1178 /* At least one single char sets the bit and stops */ 1179 1180 case OP_EXACT: 1181 tcode += IMM2_SIZE; 1182 /* Fall through */ 1183 case OP_CHAR: 1184 case OP_PLUS: 1185 case OP_MINPLUS: 1186 case OP_POSPLUS: 1187 (void)set_table_bit(re, tcode + 1, FALSE, utf); 1188 try_next = FALSE; 1189 break; 1190 1191 case OP_EXACTI: 1192 tcode += IMM2_SIZE; 1193 /* Fall through */ 1194 case OP_CHARI: 1195 case OP_PLUSI: 1196 case OP_MINPLUSI: 1197 case OP_POSPLUSI: 1198 (void)set_table_bit(re, tcode + 1, TRUE, utf); 1199 try_next = FALSE; 1200 break; 1201 1202 /* Special spacing and line-terminating items. These recognize specific 1203 lists of characters. The difference between VSPACE and ANYNL is that the 1204 latter can match the two-character CRLF sequence, but that is not 1205 relevant for finding the first character, so their code here is 1206 identical. */ 1207 1208 case OP_HSPACE: 1209 SET_BIT(CHAR_HT); 1210 SET_BIT(CHAR_SPACE); 1211 1212 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1213 the bits for 0xA0 and for code units >= 255, independently of UTF. */ 1214 1215 #if PCRE2_CODE_UNIT_WIDTH != 8 1216 SET_BIT(0xA0); 1217 SET_BIT(0xFF); 1218 #else 1219 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1220 units of horizontal space characters. */ 1221 1222 #ifdef SUPPORT_UNICODE 1223 if (utf) 1224 { 1225 SET_BIT(0xC2); /* For U+00A0 */ 1226 SET_BIT(0xE1); /* For U+1680, U+180E */ 1227 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1228 SET_BIT(0xE3); /* For U+3000 */ 1229 } 1230 else 1231 #endif 1232 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless 1233 the code is EBCDIC. */ 1234 { 1235 #ifndef EBCDIC 1236 SET_BIT(0xA0); 1237 #endif /* Not EBCDIC */ 1238 } 1239 #endif /* 8-bit support */ 1240 1241 try_next = FALSE; 1242 break; 1243 1244 case OP_ANYNL: 1245 case OP_VSPACE: 1246 SET_BIT(CHAR_LF); 1247 SET_BIT(CHAR_VT); 1248 SET_BIT(CHAR_FF); 1249 SET_BIT(CHAR_CR); 1250 1251 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1252 the bits for NEL and for code units >= 255, independently of UTF. */ 1253 1254 #if PCRE2_CODE_UNIT_WIDTH != 8 1255 SET_BIT(CHAR_NEL); 1256 SET_BIT(0xFF); 1257 #else 1258 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1259 units of vertical space characters. */ 1260 1261 #ifdef SUPPORT_UNICODE 1262 if (utf) 1263 { 1264 SET_BIT(0xC2); /* For U+0085 (NEL) */ 1265 SET_BIT(0xE2); /* For U+2028, U+2029 */ 1266 } 1267 else 1268 #endif 1269 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ 1270 { 1271 SET_BIT(CHAR_NEL); 1272 } 1273 #endif /* 8-bit support */ 1274 1275 try_next = FALSE; 1276 break; 1277 1278 /* Single character types set the bits and stop. Note that if PCRE2_UCP 1279 is set, we do not see these opcodes because \d etc are converted to 1280 properties. Therefore, these apply in the case when only characters less 1281 than 256 are recognized to match the types. */ 1282 1283 case OP_NOT_DIGIT: 1284 set_nottype_bits(re, cbit_digit, table_limit); 1285 try_next = FALSE; 1286 break; 1287 1288 case OP_DIGIT: 1289 set_type_bits(re, cbit_digit, table_limit); 1290 try_next = FALSE; 1291 break; 1292 1293 case OP_NOT_WHITESPACE: 1294 set_nottype_bits(re, cbit_space, table_limit); 1295 try_next = FALSE; 1296 break; 1297 1298 case OP_WHITESPACE: 1299 set_type_bits(re, cbit_space, table_limit); 1300 try_next = FALSE; 1301 break; 1302 1303 case OP_NOT_WORDCHAR: 1304 set_nottype_bits(re, cbit_word, table_limit); 1305 try_next = FALSE; 1306 break; 1307 1308 case OP_WORDCHAR: 1309 set_type_bits(re, cbit_word, table_limit); 1310 try_next = FALSE; 1311 break; 1312 1313 /* One or more character type fudges the pointer and restarts, knowing 1314 it will hit a single character type and stop there. */ 1315 1316 case OP_TYPEPLUS: 1317 case OP_TYPEMINPLUS: 1318 case OP_TYPEPOSPLUS: 1319 tcode++; 1320 break; 1321 1322 case OP_TYPEEXACT: 1323 tcode += 1 + IMM2_SIZE; 1324 break; 1325 1326 /* Zero or more repeats of character types set the bits and then 1327 try again. */ 1328 1329 case OP_TYPEUPTO: 1330 case OP_TYPEMINUPTO: 1331 case OP_TYPEPOSUPTO: 1332 tcode += IMM2_SIZE; /* Fall through */ 1333 1334 case OP_TYPESTAR: 1335 case OP_TYPEMINSTAR: 1336 case OP_TYPEPOSSTAR: 1337 case OP_TYPEQUERY: 1338 case OP_TYPEMINQUERY: 1339 case OP_TYPEPOSQUERY: 1340 switch(tcode[1]) 1341 { 1342 default: 1343 case OP_ANY: 1344 case OP_ALLANY: 1345 return SSB_FAIL; 1346 1347 case OP_HSPACE: 1348 SET_BIT(CHAR_HT); 1349 SET_BIT(CHAR_SPACE); 1350 1351 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1352 the bits for 0xA0 and for code units >= 255, independently of UTF. */ 1353 1354 #if PCRE2_CODE_UNIT_WIDTH != 8 1355 SET_BIT(0xA0); 1356 SET_BIT(0xFF); 1357 #else 1358 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1359 units of horizontal space characters. */ 1360 1361 #ifdef SUPPORT_UNICODE 1362 if (utf) 1363 { 1364 SET_BIT(0xC2); /* For U+00A0 */ 1365 SET_BIT(0xE1); /* For U+1680, U+180E */ 1366 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ 1367 SET_BIT(0xE3); /* For U+3000 */ 1368 } 1369 else 1370 #endif 1371 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless 1372 the code is EBCDIC. */ 1373 { 1374 #ifndef EBCDIC 1375 SET_BIT(0xA0); 1376 #endif /* Not EBCDIC */ 1377 } 1378 #endif /* 8-bit support */ 1379 break; 1380 1381 case OP_ANYNL: 1382 case OP_VSPACE: 1383 SET_BIT(CHAR_LF); 1384 SET_BIT(CHAR_VT); 1385 SET_BIT(CHAR_FF); 1386 SET_BIT(CHAR_CR); 1387 1388 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set 1389 the bits for NEL and for code units >= 255, independently of UTF. */ 1390 1391 #if PCRE2_CODE_UNIT_WIDTH != 8 1392 SET_BIT(CHAR_NEL); 1393 SET_BIT(0xFF); 1394 #else 1395 /* For the 8-bit library in UTF-8 mode, set the bits for the first code 1396 units of vertical space characters. */ 1397 1398 #ifdef SUPPORT_UNICODE 1399 if (utf) 1400 { 1401 SET_BIT(0xC2); /* For U+0085 (NEL) */ 1402 SET_BIT(0xE2); /* For U+2028, U+2029 */ 1403 } 1404 else 1405 #endif 1406 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */ 1407 { 1408 SET_BIT(CHAR_NEL); 1409 } 1410 #endif /* 8-bit support */ 1411 break; 1412 1413 case OP_NOT_DIGIT: 1414 set_nottype_bits(re, cbit_digit, table_limit); 1415 break; 1416 1417 case OP_DIGIT: 1418 set_type_bits(re, cbit_digit, table_limit); 1419 break; 1420 1421 case OP_NOT_WHITESPACE: 1422 set_nottype_bits(re, cbit_space, table_limit); 1423 break; 1424 1425 case OP_WHITESPACE: 1426 set_type_bits(re, cbit_space, table_limit); 1427 break; 1428 1429 case OP_NOT_WORDCHAR: 1430 set_nottype_bits(re, cbit_word, table_limit); 1431 break; 1432 1433 case OP_WORDCHAR: 1434 set_type_bits(re, cbit_word, table_limit); 1435 break; 1436 } 1437 1438 tcode += 2; 1439 break; 1440 1441 /* Extended class: if there are any property checks, or if this is a 1442 negative XCLASS without a map, give up. If there are no property checks, 1443 there must be wide characters on the XCLASS list, because otherwise an 1444 XCLASS would not have been created. This means that code points >= 255 1445 are always potential starters. */ 1446 1447 #ifdef SUPPORT_WIDE_CHARS 1448 case OP_XCLASS: 1449 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 || 1450 (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT) 1451 return SSB_FAIL; 1452 1453 /* We have a positive XCLASS or a negative one without a map. Set up the 1454 map pointer if there is one, and fall through. */ 1455 1456 classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL : 1457 (uint8_t *)(tcode + 1 + LINK_SIZE + 1); 1458 #endif 1459 /* It seems that the fall through comment must be outside the #ifdef if 1460 it is to avoid the gcc compiler warning. */ 1461 1462 /* Fall through */ 1463 1464 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are 1465 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter 1466 because it starts a character with a value > 255. In 8-bit non-UTF mode, 1467 there is no difference between CLASS and NCLASS. In all other wide 1468 character modes, set the 0xFF bit to indicate code units >= 255. */ 1469 1470 case OP_NCLASS: 1471 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 1472 if (utf) 1473 { 1474 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ 1475 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ 1476 } 1477 #elif PCRE2_CODE_UNIT_WIDTH != 8 1478 SET_BIT(0xFF); /* For characters >= 255 */ 1479 #endif 1480 /* Fall through */ 1481 1482 /* Enter here for a positive non-XCLASS. If we have fallen through from 1483 an XCLASS, classmap will already be set; just advance the code pointer. 1484 Otherwise, set up classmap for a a non-XCLASS and advance past it. */ 1485 1486 case OP_CLASS: 1487 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else 1488 { 1489 classmap = (uint8_t *)(++tcode); 1490 tcode += 32 / sizeof(PCRE2_UCHAR); 1491 } 1492 1493 /* When wide characters are supported, classmap may be NULL. In UTF-8 1494 (sic) mode, the bits in a class bit map correspond to character values, 1495 not to byte values. However, the bit map we are constructing is for byte 1496 values. So we have to do a conversion for characters whose code point is 1497 greater than 127. In fact, there are only two possible starting bytes for 1498 characters in the range 128 - 255. */ 1499 1500 if (classmap != NULL) 1501 { 1502 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 1503 if (utf) 1504 { 1505 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c]; 1506 for (c = 128; c < 256; c++) 1507 { 1508 if ((classmap[c/8] & (1 << (c&7))) != 0) 1509 { 1510 int d = (c >> 6) | 0xc0; /* Set bit for this starter */ 1511 re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */ 1512 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ 1513 } 1514 } 1515 } 1516 else 1517 #endif 1518 /* In all modes except UTF-8, the two bit maps are compatible. */ 1519 1520 { 1521 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c]; 1522 } 1523 } 1524 1525 /* Act on what follows the class. For a zero minimum repeat, continue; 1526 otherwise stop processing. */ 1527 1528 switch (*tcode) 1529 { 1530 case OP_CRSTAR: 1531 case OP_CRMINSTAR: 1532 case OP_CRQUERY: 1533 case OP_CRMINQUERY: 1534 case OP_CRPOSSTAR: 1535 case OP_CRPOSQUERY: 1536 tcode++; 1537 break; 1538 1539 case OP_CRRANGE: 1540 case OP_CRMINRANGE: 1541 case OP_CRPOSRANGE: 1542 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; 1543 else try_next = FALSE; 1544 break; 1545 1546 default: 1547 try_next = FALSE; 1548 break; 1549 } 1550 break; /* End of class handling case */ 1551 } /* End of switch for opcodes */ 1552 } /* End of try_next loop */ 1553 1554 code += GET(code, 1); /* Advance to next branch */ 1555 } 1556 while (*code == OP_ALT); 1557 1558 return yield; 1559 } 1560 1561 1562 1563 /************************************************* 1564 * Study a compiled expression * 1565 *************************************************/ 1566 1567 /* This function is handed a compiled expression that it must study to produce 1568 information that will speed up the matching. 1569 1570 Argument: points to the compiled expression 1571 Returns: 0 normally; non-zero should never normally occur 1572 1 unknown opcode in set_start_bits 1573 2 missing capturing bracket 1574 3 unknown opcode in find_minlength 1575 */ 1576 1577 int 1578 PRIV(study)(pcre2_real_code *re) 1579 { 1580 int min; 1581 int count = 0; 1582 PCRE2_UCHAR *code; 1583 BOOL utf = (re->overall_options & PCRE2_UTF) != 0; 1584 1585 /* Find start of compiled code */ 1586 1587 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + 1588 re->name_entry_size * re->name_count; 1589 1590 /* For a pattern that has a first code unit, or a multiline pattern that 1591 matches only at "line start", there is no point in seeking a list of starting 1592 code units. */ 1593 1594 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) 1595 { 1596 int rc = set_start_bits(re, code, utf); 1597 if (rc == SSB_UNKNOWN) return 1; 1598 if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET; 1599 } 1600 1601 /* Find the minimum length of subject string. If the pattern can match an empty 1602 string, the minimum length is already known. If there are more back references 1603 than the size of the vector we are going to cache them in, do nothing. A 1604 pattern that complicated will probably take a long time to analyze and may in 1605 any case turn out to be too complicated. Note that back reference minima are 1606 held as 16-bit numbers. */ 1607 1608 if ((re->flags & PCRE2_MATCH_EMPTY) == 0 && 1609 re->top_backref <= MAX_CACHE_BACKREF) 1610 { 1611 int backref_cache[MAX_CACHE_BACKREF+1]; 1612 backref_cache[0] = 0; /* Highest one that is set */ 1613 min = find_minlength(re, code, code, utf, NULL, &count, backref_cache); 1614 switch(min) 1615 { 1616 case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */ 1617 break; /* Leave minlength unchanged (will be zero) */ 1618 1619 case -2: 1620 return 2; /* missing capturing bracket */ 1621 1622 case -3: 1623 return 3; /* unrecognized opcode */ 1624 1625 default: 1626 if (min > UINT16_MAX) min = UINT16_MAX; 1627 re->minlength = min; 1628 break; 1629 } 1630 } 1631 1632 return 0; 1633 } 1634 1635 /* End of pcre2_study.c */ 1636