1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 /* This module contains functions that scan a compiled pattern and change 42 repeats into possessive repeats where possible. */ 43 44 45 #ifdef HAVE_CONFIG_H 46 #include "config.h" 47 #endif 48 49 50 #include "pcre2_internal.h" 51 52 53 /************************************************* 54 * Tables for auto-possessification * 55 *************************************************/ 56 57 /* This table is used to check whether auto-possessification is possible 58 between adjacent character-type opcodes. The left-hand (repeated) opcode is 59 used to select the row, and the right-hand opcode is use to select the column. 60 A value of 1 means that auto-possessification is OK. For example, the second 61 value in the first row means that \D+\d can be turned into \D++\d. 62 63 The Unicode property types (\P and \p) have to be present to fill out the table 64 because of what their opcode values are, but the table values should always be 65 zero because property types are handled separately in the code. The last four 66 columns apply to items that cannot be repeated, so there is no need to have 67 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is 68 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ 69 70 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) 71 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) 72 73 static const uint8_t autoposstab[APTROWS][APTCOLS] = { 74 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */ 75 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */ 76 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */ 77 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */ 78 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */ 79 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */ 80 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */ 81 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */ 82 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */ 83 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */ 84 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */ 85 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */ 86 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */ 87 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */ 88 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */ 89 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */ 90 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */ 91 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ 92 }; 93 94 #ifdef SUPPORT_UNICODE 95 /* This table is used to check whether auto-possessification is possible 96 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The 97 left-hand (repeated) opcode is used to select the row, and the right-hand 98 opcode is used to select the column. The values are as follows: 99 100 0 Always return FALSE (never auto-possessify) 101 1 Character groups are distinct (possessify if both are OP_PROP) 102 2 Check character categories in the same group (general or particular) 103 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) 104 105 4 Check left general category vs right particular category 106 5 Check right general category vs left particular category 107 108 6 Left alphanum vs right general category 109 7 Left space vs right general category 110 8 Left word vs right general category 111 112 9 Right alphanum vs left general category 113 10 Right space vs left general category 114 11 Right word vs left general category 115 116 12 Left alphanum vs right particular category 117 13 Left space vs right particular category 118 14 Left word vs right particular category 119 120 15 Right alphanum vs left particular category 121 16 Right space vs left particular category 122 17 Right word vs left particular category 123 */ 124 125 static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { 126 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */ 127 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */ 128 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */ 129 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */ 130 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */ 131 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ 132 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */ 133 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */ 134 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */ 135 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */ 136 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ 137 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */ 138 }; 139 140 /* This table is used to check whether auto-possessification is possible 141 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one 142 specifies a general category and the other specifies a particular category. The 143 row is selected by the general category and the column by the particular 144 category. The value is 1 if the particular category is not part of the general 145 category. */ 146 147 static const uint8_t catposstab[7][30] = { 148 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ 149 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ 150 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ 151 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ 152 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ 153 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ 154 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ 155 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ 156 }; 157 158 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against 159 a general or particular category. The properties in each row are those 160 that apply to the character set in question. Duplication means that a little 161 unnecessary work is done when checking, but this keeps things much simpler 162 because they can all use the same code. For more details see the comment where 163 this table is used. 164 165 Note: SPACE and PXSPACE used to be different because Perl excluded VT from 166 "space", but from Perl 5.18 it's included, so both categories are treated the 167 same here. */ 168 169 static const uint8_t posspropstab[3][4] = { 170 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ 171 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ 172 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ 173 }; 174 #endif /* SUPPORT_UNICODE */ 175 176 177 178 #ifdef SUPPORT_UNICODE 179 /************************************************* 180 * Check a character and a property * 181 *************************************************/ 182 183 /* This function is called by compare_opcodes() when a property item is 184 adjacent to a fixed character. 185 186 Arguments: 187 c the character 188 ptype the property type 189 pdata the data for the type 190 negated TRUE if it's a negated property (\P or \p{^) 191 192 Returns: TRUE if auto-possessifying is OK 193 */ 194 195 static BOOL 196 check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, 197 BOOL negated) 198 { 199 const uint32_t *p; 200 const ucd_record *prop = GET_UCD(c); 201 202 switch(ptype) 203 { 204 case PT_LAMP: 205 return (prop->chartype == ucp_Lu || 206 prop->chartype == ucp_Ll || 207 prop->chartype == ucp_Lt) == negated; 208 209 case PT_GC: 210 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; 211 212 case PT_PC: 213 return (pdata == prop->chartype) == negated; 214 215 case PT_SC: 216 return (pdata == prop->script) == negated; 217 218 /* These are specials */ 219 220 case PT_ALNUM: 221 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || 222 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; 223 224 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which 225 means that Perl space and POSIX space are now identical. PCRE was changed 226 at release 8.34. */ 227 228 case PT_SPACE: /* Perl space */ 229 case PT_PXSPACE: /* POSIX space */ 230 switch(c) 231 { 232 HSPACE_CASES: 233 VSPACE_CASES: 234 return negated; 235 236 default: 237 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; 238 } 239 break; /* Control never reaches here */ 240 241 case PT_WORD: 242 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || 243 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 244 c == CHAR_UNDERSCORE) == negated; 245 246 case PT_CLIST: 247 p = PRIV(ucd_caseless_sets) + prop->caseset; 248 for (;;) 249 { 250 if (c < *p) return !negated; 251 if (c == *p++) return negated; 252 } 253 break; /* Control never reaches here */ 254 } 255 256 return FALSE; 257 } 258 #endif /* SUPPORT_UNICODE */ 259 260 261 262 /************************************************* 263 * Base opcode of repeated opcodes * 264 *************************************************/ 265 266 /* Returns the base opcode for repeated single character type opcodes. If the 267 opcode is not a repeated character type, it returns with the original value. 268 269 Arguments: c opcode 270 Returns: base opcode for the type 271 */ 272 273 static PCRE2_UCHAR 274 get_repeat_base(PCRE2_UCHAR c) 275 { 276 return (c > OP_TYPEPOSUPTO)? c : 277 (c >= OP_TYPESTAR)? OP_TYPESTAR : 278 (c >= OP_NOTSTARI)? OP_NOTSTARI : 279 (c >= OP_NOTSTAR)? OP_NOTSTAR : 280 (c >= OP_STARI)? OP_STARI : 281 OP_STAR; 282 } 283 284 285 /************************************************* 286 * Fill the character property list * 287 *************************************************/ 288 289 /* Checks whether the code points to an opcode that can take part in auto- 290 possessification, and if so, fills a list with its properties. 291 292 Arguments: 293 code points to start of expression 294 utf TRUE if in UTF mode 295 fcc points to the case-flipping table 296 list points to output list 297 list[0] will be filled with the opcode 298 list[1] will be non-zero if this opcode 299 can match an empty character string 300 list[2..7] depends on the opcode 301 302 Returns: points to the start of the next opcode if *code is accepted 303 NULL if *code is not accepted 304 */ 305 306 static PCRE2_SPTR 307 get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc, 308 uint32_t *list) 309 { 310 PCRE2_UCHAR c = *code; 311 PCRE2_UCHAR base; 312 PCRE2_SPTR end; 313 uint32_t chr; 314 315 #ifdef SUPPORT_UNICODE 316 uint32_t *clist_dest; 317 const uint32_t *clist_src; 318 #else 319 (void)utf; /* Suppress "unused parameter" compiler warning */ 320 #endif 321 322 list[0] = c; 323 list[1] = FALSE; 324 code++; 325 326 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) 327 { 328 base = get_repeat_base(c); 329 c -= (base - OP_STAR); 330 331 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) 332 code += IMM2_SIZE; 333 334 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && 335 c != OP_POSPLUS); 336 337 switch(base) 338 { 339 case OP_STAR: 340 list[0] = OP_CHAR; 341 break; 342 343 case OP_STARI: 344 list[0] = OP_CHARI; 345 break; 346 347 case OP_NOTSTAR: 348 list[0] = OP_NOT; 349 break; 350 351 case OP_NOTSTARI: 352 list[0] = OP_NOTI; 353 break; 354 355 case OP_TYPESTAR: 356 list[0] = *code; 357 code++; 358 break; 359 } 360 c = list[0]; 361 } 362 363 switch(c) 364 { 365 case OP_NOT_DIGIT: 366 case OP_DIGIT: 367 case OP_NOT_WHITESPACE: 368 case OP_WHITESPACE: 369 case OP_NOT_WORDCHAR: 370 case OP_WORDCHAR: 371 case OP_ANY: 372 case OP_ALLANY: 373 case OP_ANYNL: 374 case OP_NOT_HSPACE: 375 case OP_HSPACE: 376 case OP_NOT_VSPACE: 377 case OP_VSPACE: 378 case OP_EXTUNI: 379 case OP_EODN: 380 case OP_EOD: 381 case OP_DOLL: 382 case OP_DOLLM: 383 return code; 384 385 case OP_CHAR: 386 case OP_NOT: 387 GETCHARINCTEST(chr, code); 388 list[2] = chr; 389 list[3] = NOTACHAR; 390 return code; 391 392 case OP_CHARI: 393 case OP_NOTI: 394 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; 395 GETCHARINCTEST(chr, code); 396 list[2] = chr; 397 398 #ifdef SUPPORT_UNICODE 399 if (chr < 128 || (chr < 256 && !utf)) 400 list[3] = fcc[chr]; 401 else 402 list[3] = UCD_OTHERCASE(chr); 403 #elif defined SUPPORT_WIDE_CHARS 404 list[3] = (chr < 256) ? fcc[chr] : chr; 405 #else 406 list[3] = fcc[chr]; 407 #endif 408 409 /* The othercase might be the same value. */ 410 411 if (chr == list[3]) 412 list[3] = NOTACHAR; 413 else 414 list[4] = NOTACHAR; 415 return code; 416 417 #ifdef SUPPORT_UNICODE 418 case OP_PROP: 419 case OP_NOTPROP: 420 if (code[0] != PT_CLIST) 421 { 422 list[2] = code[0]; 423 list[3] = code[1]; 424 return code + 2; 425 } 426 427 /* Convert only if we have enough space. */ 428 429 clist_src = PRIV(ucd_caseless_sets) + code[1]; 430 clist_dest = list + 2; 431 code += 2; 432 433 do { 434 if (clist_dest >= list + 8) 435 { 436 /* Early return if there is not enough space. This should never 437 happen, since all clists are shorter than 5 character now. */ 438 list[2] = code[0]; 439 list[3] = code[1]; 440 return code; 441 } 442 *clist_dest++ = *clist_src; 443 } 444 while(*clist_src++ != NOTACHAR); 445 446 /* All characters are stored. The terminating NOTACHAR is copied from the 447 clist itself. */ 448 449 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; 450 return code; 451 #endif 452 453 case OP_NCLASS: 454 case OP_CLASS: 455 #ifdef SUPPORT_WIDE_CHARS 456 case OP_XCLASS: 457 if (c == OP_XCLASS) 458 end = code + GET(code, 0) - 1; 459 else 460 #endif 461 end = code + 32 / sizeof(PCRE2_UCHAR); 462 463 switch(*end) 464 { 465 case OP_CRSTAR: 466 case OP_CRMINSTAR: 467 case OP_CRQUERY: 468 case OP_CRMINQUERY: 469 case OP_CRPOSSTAR: 470 case OP_CRPOSQUERY: 471 list[1] = TRUE; 472 end++; 473 break; 474 475 case OP_CRPLUS: 476 case OP_CRMINPLUS: 477 case OP_CRPOSPLUS: 478 end++; 479 break; 480 481 case OP_CRRANGE: 482 case OP_CRMINRANGE: 483 case OP_CRPOSRANGE: 484 list[1] = (GET2(end, 1) == 0); 485 end += 1 + 2 * IMM2_SIZE; 486 break; 487 } 488 list[2] = (uint32_t)(end - code); 489 return end; 490 } 491 return NULL; /* Opcode not accepted */ 492 } 493 494 495 496 /************************************************* 497 * Scan further character sets for match * 498 *************************************************/ 499 500 /* Checks whether the base and the current opcode have a common character, in 501 which case the base cannot be possessified. 502 503 Arguments: 504 code points to the byte code 505 utf TRUE in UTF mode 506 cb compile data block 507 base_list the data list of the base opcode 508 base_end the end of the data list 509 rec_limit points to recursion depth counter 510 511 Returns: TRUE if the auto-possessification is possible 512 */ 513 514 static BOOL 515 compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb, 516 const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) 517 { 518 PCRE2_UCHAR c; 519 uint32_t list[8]; 520 const uint32_t *chr_ptr; 521 const uint32_t *ochr_ptr; 522 const uint32_t *list_ptr; 523 PCRE2_SPTR next_code; 524 #ifdef SUPPORT_WIDE_CHARS 525 PCRE2_SPTR xclass_flags; 526 #endif 527 const uint8_t *class_bitset; 528 const uint8_t *set1, *set2, *set_end; 529 uint32_t chr; 530 BOOL accepted, invert_bits; 531 BOOL entered_a_group = FALSE; 532 533 if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ 534 535 /* Note: the base_list[1] contains whether the current opcode has a greedy 536 (represented by a non-zero value) quantifier. This is a different from 537 other character type lists, which store here that the character iterator 538 matches to an empty string (also represented by a non-zero value). */ 539 540 for(;;) 541 { 542 /* All operations move the code pointer forward. 543 Therefore infinite recursions are not possible. */ 544 545 c = *code; 546 547 /* Skip over callouts */ 548 549 if (c == OP_CALLOUT) 550 { 551 code += PRIV(OP_lengths)[c]; 552 continue; 553 } 554 555 if (c == OP_CALLOUT_STR) 556 { 557 code += GET(code, 1 + 2*LINK_SIZE); 558 continue; 559 } 560 561 if (c == OP_ALT) 562 { 563 do code += GET(code, 1); while (*code == OP_ALT); 564 c = *code; 565 } 566 567 switch(c) 568 { 569 case OP_END: 570 case OP_KETRPOS: 571 /* TRUE only in greedy case. The non-greedy case could be replaced by 572 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT 573 uses more memory, which we cannot get at this stage.) */ 574 575 return base_list[1] != 0; 576 577 case OP_KET: 578 /* If the bracket is capturing, and referenced by an OP_RECURSE, or 579 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case 580 cannot be converted to a possessive form. */ 581 582 if (base_list[1] == 0) return FALSE; 583 584 switch(*(code - GET(code, 1))) 585 { 586 case OP_ASSERT: 587 case OP_ASSERT_NOT: 588 case OP_ASSERTBACK: 589 case OP_ASSERTBACK_NOT: 590 case OP_ONCE: 591 case OP_ONCE_NC: 592 /* Atomic sub-patterns and assertions can always auto-possessify their 593 last iterator. However, if the group was entered as a result of checking 594 a previous iterator, this is not possible. */ 595 596 return !entered_a_group; 597 } 598 599 code += PRIV(OP_lengths)[c]; 600 continue; 601 602 case OP_ONCE: 603 case OP_ONCE_NC: 604 case OP_BRA: 605 case OP_CBRA: 606 next_code = code + GET(code, 1); 607 code += PRIV(OP_lengths)[c]; 608 609 while (*next_code == OP_ALT) 610 { 611 if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit)) 612 return FALSE; 613 code = next_code + 1 + LINK_SIZE; 614 next_code += GET(next_code, 1); 615 } 616 617 entered_a_group = TRUE; 618 continue; 619 620 case OP_BRAZERO: 621 case OP_BRAMINZERO: 622 623 next_code = code + 1; 624 if (*next_code != OP_BRA && *next_code != OP_CBRA 625 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE; 626 627 do next_code += GET(next_code, 1); while (*next_code == OP_ALT); 628 629 /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ 630 631 next_code += 1 + LINK_SIZE; 632 if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit)) 633 return FALSE; 634 635 code += PRIV(OP_lengths)[c]; 636 continue; 637 638 default: 639 break; 640 } 641 642 /* Check for a supported opcode, and load its properties. */ 643 644 code = get_chr_property_list(code, utf, cb->fcc, list); 645 if (code == NULL) return FALSE; /* Unsupported */ 646 647 /* If either opcode is a small character list, set pointers for comparing 648 characters from that list with another list, or with a property. */ 649 650 if (base_list[0] == OP_CHAR) 651 { 652 chr_ptr = base_list + 2; 653 list_ptr = list; 654 } 655 else if (list[0] == OP_CHAR) 656 { 657 chr_ptr = list + 2; 658 list_ptr = base_list; 659 } 660 661 /* Character bitsets can also be compared to certain opcodes. */ 662 663 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS 664 #if PCRE2_CODE_UNIT_WIDTH == 8 665 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */ 666 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) 667 #endif 668 ) 669 { 670 #if PCRE2_CODE_UNIT_WIDTH == 8 671 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) 672 #else 673 if (base_list[0] == OP_CLASS) 674 #endif 675 { 676 set1 = (uint8_t *)(base_end - base_list[2]); 677 list_ptr = list; 678 } 679 else 680 { 681 set1 = (uint8_t *)(code - list[2]); 682 list_ptr = base_list; 683 } 684 685 invert_bits = FALSE; 686 switch(list_ptr[0]) 687 { 688 case OP_CLASS: 689 case OP_NCLASS: 690 set2 = (uint8_t *) 691 ((list_ptr == list ? code : base_end) - list_ptr[2]); 692 break; 693 694 #ifdef SUPPORT_WIDE_CHARS 695 case OP_XCLASS: 696 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE; 697 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; 698 if ((*xclass_flags & XCL_MAP) == 0) 699 { 700 /* No bits are set for characters < 256. */ 701 if (list[1] == 0) return TRUE; 702 /* Might be an empty repeat. */ 703 continue; 704 } 705 set2 = (uint8_t *)(xclass_flags + 1); 706 break; 707 #endif 708 709 case OP_NOT_DIGIT: 710 invert_bits = TRUE; 711 /* Fall through */ 712 case OP_DIGIT: 713 set2 = (uint8_t *)(cb->cbits + cbit_digit); 714 break; 715 716 case OP_NOT_WHITESPACE: 717 invert_bits = TRUE; 718 /* Fall through */ 719 case OP_WHITESPACE: 720 set2 = (uint8_t *)(cb->cbits + cbit_space); 721 break; 722 723 case OP_NOT_WORDCHAR: 724 invert_bits = TRUE; 725 /* Fall through */ 726 case OP_WORDCHAR: 727 set2 = (uint8_t *)(cb->cbits + cbit_word); 728 break; 729 730 default: 731 return FALSE; 732 } 733 734 /* Because the bit sets are unaligned bytes, we need to perform byte 735 comparison here. */ 736 737 set_end = set1 + 32; 738 if (invert_bits) 739 { 740 do 741 { 742 if ((*set1++ & ~(*set2++)) != 0) return FALSE; 743 } 744 while (set1 < set_end); 745 } 746 else 747 { 748 do 749 { 750 if ((*set1++ & *set2++) != 0) return FALSE; 751 } 752 while (set1 < set_end); 753 } 754 755 if (list[1] == 0) return TRUE; 756 /* Might be an empty repeat. */ 757 continue; 758 } 759 760 /* Some property combinations also acceptable. Unicode property opcodes are 761 processed specially; the rest can be handled with a lookup table. */ 762 763 else 764 { 765 uint32_t leftop, rightop; 766 767 leftop = base_list[0]; 768 rightop = list[0]; 769 770 #ifdef SUPPORT_UNICODE 771 accepted = FALSE; /* Always set in non-unicode case. */ 772 if (leftop == OP_PROP || leftop == OP_NOTPROP) 773 { 774 if (rightop == OP_EOD) 775 accepted = TRUE; 776 else if (rightop == OP_PROP || rightop == OP_NOTPROP) 777 { 778 int n; 779 const uint8_t *p; 780 BOOL same = leftop == rightop; 781 BOOL lisprop = leftop == OP_PROP; 782 BOOL risprop = rightop == OP_PROP; 783 BOOL bothprop = lisprop && risprop; 784 785 /* There's a table that specifies how each combination is to be 786 processed: 787 0 Always return FALSE (never auto-possessify) 788 1 Character groups are distinct (possessify if both are OP_PROP) 789 2 Check character categories in the same group (general or particular) 790 3 Return TRUE if the two opcodes are not the same 791 ... see comments below 792 */ 793 794 n = propposstab[base_list[2]][list[2]]; 795 switch(n) 796 { 797 case 0: break; 798 case 1: accepted = bothprop; break; 799 case 2: accepted = (base_list[3] == list[3]) != same; break; 800 case 3: accepted = !same; break; 801 802 case 4: /* Left general category, right particular category */ 803 accepted = risprop && catposstab[base_list[3]][list[3]] == same; 804 break; 805 806 case 5: /* Right general category, left particular category */ 807 accepted = lisprop && catposstab[list[3]][base_list[3]] == same; 808 break; 809 810 /* This code is logically tricky. Think hard before fiddling with it. 811 The posspropstab table has four entries per row. Each row relates to 812 one of PCRE's special properties such as ALNUM or SPACE or WORD. 813 Only WORD actually needs all four entries, but using repeats for the 814 others means they can all use the same code below. 815 816 The first two entries in each row are Unicode general categories, and 817 apply always, because all the characters they include are part of the 818 PCRE character set. The third and fourth entries are a general and a 819 particular category, respectively, that include one or more relevant 820 characters. One or the other is used, depending on whether the check 821 is for a general or a particular category. However, in both cases the 822 category contains more characters than the specials that are defined 823 for the property being tested against. Therefore, it cannot be used 824 in a NOTPROP case. 825 826 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. 827 Underscore is covered by ucp_P or ucp_Po. */ 828 829 case 6: /* Left alphanum vs right general category */ 830 case 7: /* Left space vs right general category */ 831 case 8: /* Left word vs right general category */ 832 p = posspropstab[n-6]; 833 accepted = risprop && lisprop == 834 (list[3] != p[0] && 835 list[3] != p[1] && 836 (list[3] != p[2] || !lisprop)); 837 break; 838 839 case 9: /* Right alphanum vs left general category */ 840 case 10: /* Right space vs left general category */ 841 case 11: /* Right word vs left general category */ 842 p = posspropstab[n-9]; 843 accepted = lisprop && risprop == 844 (base_list[3] != p[0] && 845 base_list[3] != p[1] && 846 (base_list[3] != p[2] || !risprop)); 847 break; 848 849 case 12: /* Left alphanum vs right particular category */ 850 case 13: /* Left space vs right particular category */ 851 case 14: /* Left word vs right particular category */ 852 p = posspropstab[n-12]; 853 accepted = risprop && lisprop == 854 (catposstab[p[0]][list[3]] && 855 catposstab[p[1]][list[3]] && 856 (list[3] != p[3] || !lisprop)); 857 break; 858 859 case 15: /* Right alphanum vs left particular category */ 860 case 16: /* Right space vs left particular category */ 861 case 17: /* Right word vs left particular category */ 862 p = posspropstab[n-15]; 863 accepted = lisprop && risprop == 864 (catposstab[p[0]][base_list[3]] && 865 catposstab[p[1]][base_list[3]] && 866 (base_list[3] != p[3] || !risprop)); 867 break; 868 } 869 } 870 } 871 872 else 873 #endif /* SUPPORT_UNICODE */ 874 875 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && 876 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && 877 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; 878 879 if (!accepted) return FALSE; 880 881 if (list[1] == 0) return TRUE; 882 /* Might be an empty repeat. */ 883 continue; 884 } 885 886 /* Control reaches here only if one of the items is a small character list. 887 All characters are checked against the other side. */ 888 889 do 890 { 891 chr = *chr_ptr; 892 893 switch(list_ptr[0]) 894 { 895 case OP_CHAR: 896 ochr_ptr = list_ptr + 2; 897 do 898 { 899 if (chr == *ochr_ptr) return FALSE; 900 ochr_ptr++; 901 } 902 while(*ochr_ptr != NOTACHAR); 903 break; 904 905 case OP_NOT: 906 ochr_ptr = list_ptr + 2; 907 do 908 { 909 if (chr == *ochr_ptr) 910 break; 911 ochr_ptr++; 912 } 913 while(*ochr_ptr != NOTACHAR); 914 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ 915 break; 916 917 /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* 918 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ 919 920 case OP_DIGIT: 921 if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; 922 break; 923 924 case OP_NOT_DIGIT: 925 if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; 926 break; 927 928 case OP_WHITESPACE: 929 if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; 930 break; 931 932 case OP_NOT_WHITESPACE: 933 if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; 934 break; 935 936 case OP_WORDCHAR: 937 if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; 938 break; 939 940 case OP_NOT_WORDCHAR: 941 if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; 942 break; 943 944 case OP_HSPACE: 945 switch(chr) 946 { 947 HSPACE_CASES: return FALSE; 948 default: break; 949 } 950 break; 951 952 case OP_NOT_HSPACE: 953 switch(chr) 954 { 955 HSPACE_CASES: break; 956 default: return FALSE; 957 } 958 break; 959 960 case OP_ANYNL: 961 case OP_VSPACE: 962 switch(chr) 963 { 964 VSPACE_CASES: return FALSE; 965 default: break; 966 } 967 break; 968 969 case OP_NOT_VSPACE: 970 switch(chr) 971 { 972 VSPACE_CASES: break; 973 default: return FALSE; 974 } 975 break; 976 977 case OP_DOLL: 978 case OP_EODN: 979 switch (chr) 980 { 981 case CHAR_CR: 982 case CHAR_LF: 983 case CHAR_VT: 984 case CHAR_FF: 985 case CHAR_NEL: 986 #ifndef EBCDIC 987 case 0x2028: 988 case 0x2029: 989 #endif /* Not EBCDIC */ 990 return FALSE; 991 } 992 break; 993 994 case OP_EOD: /* Can always possessify before \z */ 995 break; 996 997 #ifdef SUPPORT_UNICODE 998 case OP_PROP: 999 case OP_NOTPROP: 1000 if (!check_char_prop(chr, list_ptr[2], list_ptr[3], 1001 list_ptr[0] == OP_NOTPROP)) 1002 return FALSE; 1003 break; 1004 #endif 1005 1006 case OP_NCLASS: 1007 if (chr > 255) return FALSE; 1008 /* Fall through */ 1009 1010 case OP_CLASS: 1011 if (chr > 255) break; 1012 class_bitset = (uint8_t *) 1013 ((list_ptr == list ? code : base_end) - list_ptr[2]); 1014 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE; 1015 break; 1016 1017 #ifdef SUPPORT_WIDE_CHARS 1018 case OP_XCLASS: 1019 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - 1020 list_ptr[2] + LINK_SIZE, utf)) return FALSE; 1021 break; 1022 #endif 1023 1024 default: 1025 return FALSE; 1026 } 1027 1028 chr_ptr++; 1029 } 1030 while(*chr_ptr != NOTACHAR); 1031 1032 /* At least one character must be matched from this opcode. */ 1033 1034 if (list[1] == 0) return TRUE; 1035 } 1036 1037 /* Control never reaches here. There used to be a fail-save return FALSE; here, 1038 but some compilers complain about an unreachable statement. */ 1039 } 1040 1041 1042 1043 /************************************************* 1044 * Scan compiled regex for auto-possession * 1045 *************************************************/ 1046 1047 /* Replaces single character iterations with their possessive alternatives 1048 if appropriate. This function modifies the compiled opcode! Hitting a 1049 non-existant opcode may indicate a bug in PCRE2, but it can also be caused if a 1050 bad UTF string was compiled with PCRE2_NO_UTF_CHECK. 1051 1052 Arguments: 1053 code points to start of the byte code 1054 utf TRUE in UTF mode 1055 cb compile data block 1056 1057 Returns: 0 for success 1058 -1 if a non-existant opcode is encountered 1059 */ 1060 1061 int 1062 PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb) 1063 { 1064 register PCRE2_UCHAR c; 1065 PCRE2_SPTR end; 1066 PCRE2_UCHAR *repeat_opcode; 1067 uint32_t list[8]; 1068 int rec_limit; 1069 1070 for (;;) 1071 { 1072 c = *code; 1073 1074 if (c > OP_TABLE_LENGTH) return -1; /* Something gone wrong */ 1075 1076 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) 1077 { 1078 c -= get_repeat_base(c) - OP_STAR; 1079 end = (c <= OP_MINUPTO) ? 1080 get_chr_property_list(code, utf, cb->fcc, list) : NULL; 1081 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; 1082 1083 rec_limit = 1000; 1084 if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit)) 1085 { 1086 switch(c) 1087 { 1088 case OP_STAR: 1089 *code += OP_POSSTAR - OP_STAR; 1090 break; 1091 1092 case OP_MINSTAR: 1093 *code += OP_POSSTAR - OP_MINSTAR; 1094 break; 1095 1096 case OP_PLUS: 1097 *code += OP_POSPLUS - OP_PLUS; 1098 break; 1099 1100 case OP_MINPLUS: 1101 *code += OP_POSPLUS - OP_MINPLUS; 1102 break; 1103 1104 case OP_QUERY: 1105 *code += OP_POSQUERY - OP_QUERY; 1106 break; 1107 1108 case OP_MINQUERY: 1109 *code += OP_POSQUERY - OP_MINQUERY; 1110 break; 1111 1112 case OP_UPTO: 1113 *code += OP_POSUPTO - OP_UPTO; 1114 break; 1115 1116 case OP_MINUPTO: 1117 *code += OP_POSUPTO - OP_MINUPTO; 1118 break; 1119 } 1120 } 1121 c = *code; 1122 } 1123 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) 1124 { 1125 #ifdef SUPPORT_WIDE_CHARS 1126 if (c == OP_XCLASS) 1127 repeat_opcode = code + GET(code, 1); 1128 else 1129 #endif 1130 repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); 1131 1132 c = *repeat_opcode; 1133 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) 1134 { 1135 /* end must not be NULL. */ 1136 end = get_chr_property_list(code, utf, cb->fcc, list); 1137 1138 list[1] = (c & 1) == 0; 1139 1140 rec_limit = 1000; 1141 if (compare_opcodes(end, utf, cb, list, end, &rec_limit)) 1142 { 1143 switch (c) 1144 { 1145 case OP_CRSTAR: 1146 case OP_CRMINSTAR: 1147 *repeat_opcode = OP_CRPOSSTAR; 1148 break; 1149 1150 case OP_CRPLUS: 1151 case OP_CRMINPLUS: 1152 *repeat_opcode = OP_CRPOSPLUS; 1153 break; 1154 1155 case OP_CRQUERY: 1156 case OP_CRMINQUERY: 1157 *repeat_opcode = OP_CRPOSQUERY; 1158 break; 1159 1160 case OP_CRRANGE: 1161 case OP_CRMINRANGE: 1162 *repeat_opcode = OP_CRPOSRANGE; 1163 break; 1164 } 1165 } 1166 } 1167 c = *code; 1168 } 1169 1170 switch(c) 1171 { 1172 case OP_END: 1173 return 0; 1174 1175 case OP_TYPESTAR: 1176 case OP_TYPEMINSTAR: 1177 case OP_TYPEPLUS: 1178 case OP_TYPEMINPLUS: 1179 case OP_TYPEQUERY: 1180 case OP_TYPEMINQUERY: 1181 case OP_TYPEPOSSTAR: 1182 case OP_TYPEPOSPLUS: 1183 case OP_TYPEPOSQUERY: 1184 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 1185 break; 1186 1187 case OP_TYPEUPTO: 1188 case OP_TYPEMINUPTO: 1189 case OP_TYPEEXACT: 1190 case OP_TYPEPOSUPTO: 1191 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 1192 code += 2; 1193 break; 1194 1195 case OP_CALLOUT_STR: 1196 code += GET(code, 1 + 2*LINK_SIZE); 1197 break; 1198 1199 #ifdef SUPPORT_WIDE_CHARS 1200 case OP_XCLASS: 1201 code += GET(code, 1); 1202 break; 1203 #endif 1204 1205 case OP_MARK: 1206 case OP_PRUNE_ARG: 1207 case OP_SKIP_ARG: 1208 case OP_THEN_ARG: 1209 code += code[1]; 1210 break; 1211 } 1212 1213 /* Add in the fixed length from the table */ 1214 1215 code += PRIV(OP_lengths)[c]; 1216 1217 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be 1218 followed by a multi-byte character. The length in the table is a minimum, so 1219 we have to arrange to skip the extra code units. */ 1220 1221 #ifdef MAYBE_UTF_MULTI 1222 if (utf) switch(c) 1223 { 1224 case OP_CHAR: 1225 case OP_CHARI: 1226 case OP_NOT: 1227 case OP_NOTI: 1228 case OP_STAR: 1229 case OP_MINSTAR: 1230 case OP_PLUS: 1231 case OP_MINPLUS: 1232 case OP_QUERY: 1233 case OP_MINQUERY: 1234 case OP_UPTO: 1235 case OP_MINUPTO: 1236 case OP_EXACT: 1237 case OP_POSSTAR: 1238 case OP_POSPLUS: 1239 case OP_POSQUERY: 1240 case OP_POSUPTO: 1241 case OP_STARI: 1242 case OP_MINSTARI: 1243 case OP_PLUSI: 1244 case OP_MINPLUSI: 1245 case OP_QUERYI: 1246 case OP_MINQUERYI: 1247 case OP_UPTOI: 1248 case OP_MINUPTOI: 1249 case OP_EXACTI: 1250 case OP_POSSTARI: 1251 case OP_POSPLUSI: 1252 case OP_POSQUERYI: 1253 case OP_POSUPTOI: 1254 case OP_NOTSTAR: 1255 case OP_NOTMINSTAR: 1256 case OP_NOTPLUS: 1257 case OP_NOTMINPLUS: 1258 case OP_NOTQUERY: 1259 case OP_NOTMINQUERY: 1260 case OP_NOTUPTO: 1261 case OP_NOTMINUPTO: 1262 case OP_NOTEXACT: 1263 case OP_NOTPOSSTAR: 1264 case OP_NOTPOSPLUS: 1265 case OP_NOTPOSQUERY: 1266 case OP_NOTPOSUPTO: 1267 case OP_NOTSTARI: 1268 case OP_NOTMINSTARI: 1269 case OP_NOTPLUSI: 1270 case OP_NOTMINPLUSI: 1271 case OP_NOTQUERYI: 1272 case OP_NOTMINQUERYI: 1273 case OP_NOTUPTOI: 1274 case OP_NOTMINUPTOI: 1275 case OP_NOTEXACTI: 1276 case OP_NOTPOSSTARI: 1277 case OP_NOTPOSPLUSI: 1278 case OP_NOTPOSQUERYI: 1279 case OP_NOTPOSUPTOI: 1280 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); 1281 break; 1282 } 1283 #else 1284 (void)(utf); /* Keep compiler happy by referencing function argument */ 1285 #endif /* SUPPORT_WIDE_CHARS */ 1286 } 1287 } 1288 1289 /* End of pcre2_auto_possess.c */ 1290