1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 42 /* This module contains the external function pcre2_dfa_match(), which is an 43 alternative matching function that uses a sort of DFA algorithm (not a true 44 FSM). This is NOT Perl-compatible, but it has advantages in certain 45 applications. */ 46 47 48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved 49 the performance of his patterns greatly. I could not use it as it stood, as it 50 was not thread safe, and made assumptions about pattern sizes. Also, it caused 51 test 7 to loop, and test 9 to crash with a segfault. 52 53 The issue is the check for duplicate states, which is done by a simple linear 54 search up the state list. (Grep for "duplicate" below to find the code.) For 55 many patterns, there will never be many states active at one time, so a simple 56 linear search is fine. In patterns that have many active states, it might be a 57 bottleneck. The suggested code used an indexing scheme to remember which states 58 had previously been used for each character, and avoided the linear search when 59 it knew there was no chance of a duplicate. This was implemented when adding 60 states to the state lists. 61 62 I wrote some thread-safe, not-limited code to try something similar at the time 63 of checking for duplicates (instead of when adding states), using index vectors 64 on the stack. 
It did give a 13% improvement with one specially constructed 65 pattern for certain subject strings, but on other strings and on many of the 66 simpler patterns in the test suite it did worse. The major problem, I think, 67 was the extra time to initialize the index. This had to be done for each call 68 of internal_dfa_match(). (The supplied patch used a static vector, initialized 69 only once - I suspect this was the cause of the problems with the tests.) 70 71 Overall, I concluded that the gains in some cases did not outweigh the losses 72 in others, so I abandoned this code. */ 73 74 75 #ifdef HAVE_CONFIG_H 76 #include "config.h" 77 #endif 78 79 #define NLBLOCK mb /* Block containing newline information */ 80 #define PSSTART start_subject /* Field containing processed string start */ 81 #define PSEND end_subject /* Field containing processed string end */ 82 83 #include "pcre2_internal.h" 84 85 #define PUBLIC_DFA_MATCH_OPTIONS \ 86 (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ 87 PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ 88 PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART) 89 90 91 /************************************************* 92 * Code parameters and static tables * 93 *************************************************/ 94 95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes 96 into others, under special conditions. A gap of 20 between the blocks should be 97 enough. The resulting opcodes don't have to be less than 256 because they are 98 never stored, so we push them well clear of the normal opcodes. */ 99 100 #define OP_PROP_EXTRA 300 101 #define OP_EXTUNI_EXTRA 320 102 #define OP_ANYNL_EXTRA 340 103 #define OP_HSPACE_EXTRA 360 104 #define OP_VSPACE_EXTRA 380 105 106 107 /* This table identifies those opcodes that are followed immediately by a 108 character that is to be tested in some way. This makes it possible to 109 centralize the loading of these characters. 
In the case of Type * etc, the 110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a 111 small value. Non-zero values in the table are the offsets from the opcode where 112 the character is to be found. ***NOTE*** If the start of this table is 113 modified, the three tables that follow must also be modified. */ 114 115 static const uint8_t coptable[] = { 116 0, /* End */ 117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ 119 0, 0, 0, /* Any, AllAny, Anybyte */ 120 0, 0, /* \P, \p */ 121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 122 0, /* \X */ 123 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 124 1, /* Char */ 125 1, /* Chari */ 126 1, /* not */ 127 1, /* noti */ 128 /* Positive single-char repeats */ 129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ 131 1+IMM2_SIZE, /* exact */ 132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ 133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ 135 1+IMM2_SIZE, /* exact I */ 136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ 137 /* Negative single-char repeats - only for chars < 256 */ 138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ 140 1+IMM2_SIZE, /* NOT exact */ 141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ 142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ 144 1+IMM2_SIZE, /* NOT exact I */ 145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ 146 /* Positive type repeats */ 147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ 149 1+IMM2_SIZE, /* Type exact */ 150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ 151 /* Character class & ref repeats */ 152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? 
*/ 153 0, 0, /* CRRANGE, CRMINRANGE */ 154 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */ 155 0, /* CLASS */ 156 0, /* NCLASS */ 157 0, /* XCLASS - variable length */ 158 0, /* REF */ 159 0, /* REFI */ 160 0, /* DNREF */ 161 0, /* DNREFI */ 162 0, /* RECURSE */ 163 0, /* CALLOUT */ 164 0, /* CALLOUT_STR */ 165 0, /* Alt */ 166 0, /* Ket */ 167 0, /* KetRmax */ 168 0, /* KetRmin */ 169 0, /* KetRpos */ 170 0, /* Reverse */ 171 0, /* Assert */ 172 0, /* Assert not */ 173 0, /* Assert behind */ 174 0, /* Assert behind not */ 175 0, 0, /* ONCE, ONCE_NC */ 176 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 177 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 178 0, 0, /* CREF, DNCREF */ 179 0, 0, /* RREF, DNRREF */ 180 0, 0, /* FALSE, TRUE */ 181 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 182 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 183 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 184 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 185 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ 186 }; 187 188 /* This table identifies those opcodes that inspect a character. It is used to 189 remember the fact that a character could have been inspected when the end of 190 the subject is reached. ***NOTE*** If the start of this table is modified, the 191 two tables that follow must also be modified. */ 192 193 static const uint8_t poptable[] = { 194 0, /* End */ 195 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ 196 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ 197 1, 1, 1, /* Any, AllAny, Anybyte */ 198 1, 1, /* \P, \p */ 199 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ 200 1, /* \X */ 201 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 202 1, /* Char */ 203 1, /* Chari */ 204 1, /* not */ 205 1, /* noti */ 206 /* Positive single-char repeats */ 207 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ 208 1, 1, 1, /* upto, minupto, exact */ 209 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ 210 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 211 1, 1, 1, /* upto I, minupto I, exact I */ 212 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */ 213 /* Negative single-char repeats - only for chars < 256 */ 214 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 215 1, 1, 1, /* NOT upto, minupto, exact */ 216 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ 217 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 218 1, 1, 1, /* NOT upto I, minupto I, exact I */ 219 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */ 220 /* Positive type repeats */ 221 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 222 1, 1, 1, /* Type upto, minupto, exact */ 223 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ 224 /* Character class & ref repeats */ 225 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 226 1, 1, /* CRRANGE, CRMINRANGE */ 227 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */ 228 1, /* CLASS */ 229 1, /* NCLASS */ 230 1, /* XCLASS - variable length */ 231 0, /* REF */ 232 0, /* REFI */ 233 0, /* DNREF */ 234 0, /* DNREFI */ 235 0, /* RECURSE */ 236 0, /* CALLOUT */ 237 0, /* CALLOUT_STR */ 238 0, /* Alt */ 239 0, /* Ket */ 240 0, /* KetRmax */ 241 0, /* KetRmin */ 242 0, /* KetRpos */ 243 0, /* Reverse */ 244 0, /* Assert */ 245 0, /* Assert not */ 246 0, /* Assert behind */ 247 0, /* Assert behind not */ 248 0, 0, /* ONCE, ONCE_NC */ 249 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 250 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 251 0, 0, /* CREF, DNCREF */ 252 0, 0, /* RREF, DNRREF */ 253 0, 0, /* FALSE, TRUE */ 254 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 255 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 256 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 257 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 258 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ 259 }; 260 261 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, 262 and \w */ 263 264 static const uint8_t 
toptable1[] = { 265 0, 0, 0, 0, 0, 0, 266 ctype_digit, ctype_digit, 267 ctype_space, ctype_space, 268 ctype_word, ctype_word, 269 0, 0 /* OP_ANY, OP_ALLANY */ 270 }; 271 272 static const uint8_t toptable2[] = { 273 0, 0, 0, 0, 0, 0, 274 ctype_digit, 0, 275 ctype_space, 0, 276 ctype_word, 0, 277 1, 1 /* OP_ANY, OP_ALLANY */ 278 }; 279 280 281 /* Structure for holding data about a particular state, which is in effect the 282 current data for an active path through the match tree. It must consist 283 entirely of ints because the working vector we are passed, and which we put 284 these structures in, is a vector of ints. */ 285 286 typedef struct stateblock { 287 int offset; /* Offset to opcode (-ve has meaning) */ 288 int count; /* Count for repeats */ 289 int data; /* Some use extra data */ 290 } stateblock; 291 292 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) 293 294 295 296 /************************************************* 297 * Match a Regular Expression - DFA engine * 298 *************************************************/ 299 300 /* This internal function applies a compiled pattern to a subject string, 301 starting at a given point, using a DFA engine. This function is called from the 302 external one, possibly multiple times if the pattern is not anchored. The 303 function calls itself recursively for some kinds of subpattern. 
304 305 Arguments: 306 mb the match_data block with fixed information 307 this_start_code the opening bracket of this subexpression's code 308 current_subject where we currently are in the subject string 309 start_offset start offset in the subject string 310 offsets vector to contain the matching string offsets 311 offsetcount size of same 312 workspace vector of workspace 313 wscount size of same 314 rlevel function call recursion level 315 316 Returns: > 0 => number of match offset pairs placed in offsets 317 = 0 => offsets overflowed; longest matches are present 318 -1 => failed to match 319 < -1 => some kind of unexpected problem 320 321 The following macros are used for adding states to the two state vectors (one 322 for the current character, one for the following character). */ 323 324 #define ADD_ACTIVE(x,y) \ 325 if (active_count++ < wscount) \ 326 { \ 327 next_active_state->offset = (x); \ 328 next_active_state->count = (y); \ 329 next_active_state++; \ 330 } \ 331 else return PCRE2_ERROR_DFA_WSSIZE 332 333 #define ADD_ACTIVE_DATA(x,y,z) \ 334 if (active_count++ < wscount) \ 335 { \ 336 next_active_state->offset = (x); \ 337 next_active_state->count = (y); \ 338 next_active_state->data = (z); \ 339 next_active_state++; \ 340 } \ 341 else return PCRE2_ERROR_DFA_WSSIZE 342 343 #define ADD_NEW(x,y) \ 344 if (new_count++ < wscount) \ 345 { \ 346 next_new_state->offset = (x); \ 347 next_new_state->count = (y); \ 348 next_new_state++; \ 349 } \ 350 else return PCRE2_ERROR_DFA_WSSIZE 351 352 #define ADD_NEW_DATA(x,y,z) \ 353 if (new_count++ < wscount) \ 354 { \ 355 next_new_state->offset = (x); \ 356 next_new_state->count = (y); \ 357 next_new_state->data = (z); \ 358 next_new_state++; \ 359 } \ 360 else return PCRE2_ERROR_DFA_WSSIZE 361 362 /* And now, here is the code */ 363 364 static int 365 internal_dfa_match( 366 dfa_match_block *mb, 367 PCRE2_SPTR this_start_code, 368 PCRE2_SPTR current_subject, 369 PCRE2_SIZE start_offset, 370 PCRE2_SIZE *offsets, 371 
uint32_t offsetcount, 372 int *workspace, 373 int wscount, 374 int rlevel) 375 { 376 stateblock *active_states, *new_states, *temp_states; 377 stateblock *next_active_state, *next_new_state; 378 379 const uint8_t *ctypes, *lcc, *fcc; 380 PCRE2_SPTR ptr; 381 PCRE2_SPTR end_code; 382 PCRE2_SPTR first_op; 383 384 dfa_recursion_info new_recursive; 385 386 int active_count, new_count, match_count; 387 388 /* Some fields in the mb block are frequently referenced, so we load them into 389 independent variables in the hope that this will perform better. */ 390 391 PCRE2_SPTR start_subject = mb->start_subject; 392 PCRE2_SPTR end_subject = mb->end_subject; 393 PCRE2_SPTR start_code = mb->start_code; 394 395 #ifdef SUPPORT_UNICODE 396 BOOL utf = (mb->poptions & PCRE2_UTF) != 0; 397 #else 398 BOOL utf = FALSE; 399 #endif 400 401 BOOL reset_could_continue = FALSE; 402 403 rlevel++; 404 offsetcount &= (uint32_t)(-2); /* Round down */ 405 406 wscount -= 2; 407 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / 408 (2 * INTS_PER_STATEBLOCK); 409 410 ctypes = mb->tables + ctypes_offset; 411 lcc = mb->tables + lcc_offset; 412 fcc = mb->tables + fcc_offset; 413 414 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */ 415 416 active_states = (stateblock *)(workspace + 2); 417 next_new_state = new_states = active_states + wscount; 418 new_count = 0; 419 420 first_op = this_start_code + 1 + LINK_SIZE + 421 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 422 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 423 ? IMM2_SIZE:0); 424 425 /* The first thing in any (sub) pattern is a bracket of some sort. Push all 426 the alternative states onto the list, and find out where the end is. This 427 makes is possible to use this function recursively, when we want to stop at a 428 matching internal ket rather than at the end. 429 430 If the first opcode in the first alternative is OP_REVERSE, we are dealing with 431 a backward assertion. 
In that case, we have to find out the maximum amount to 432 move back, and set up each alternative appropriately. */ 433 434 if (*first_op == OP_REVERSE) 435 { 436 size_t max_back = 0; 437 size_t gone_back; 438 439 end_code = this_start_code; 440 do 441 { 442 size_t back = (size_t)GET(end_code, 2+LINK_SIZE); 443 if (back > max_back) max_back = back; 444 end_code += GET(end_code, 1); 445 } 446 while (*end_code == OP_ALT); 447 448 /* If we can't go back the amount required for the longest lookbehind 449 pattern, go back as far as we can; some alternatives may still be viable. */ 450 451 #ifdef SUPPORT_UNICODE 452 /* In character mode we have to step back character by character */ 453 454 if (utf) 455 { 456 for (gone_back = 0; gone_back < max_back; gone_back++) 457 { 458 if (current_subject <= start_subject) break; 459 current_subject--; 460 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--); 461 } 462 } 463 else 464 #endif 465 466 /* In byte-mode we can do this quickly. */ 467 468 { 469 size_t current_offset = (size_t)(current_subject - start_subject); 470 gone_back = (current_offset < max_back)? current_offset : max_back; 471 current_subject -= gone_back; 472 } 473 474 /* Save the earliest consulted character */ 475 476 if (current_subject < mb->start_used_ptr) 477 mb->start_used_ptr = current_subject; 478 479 /* Now we can process the individual branches. */ 480 481 end_code = this_start_code; 482 do 483 { 484 size_t back = (size_t)GET(end_code, 2+LINK_SIZE); 485 if (back <= gone_back) 486 { 487 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE); 488 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back)); 489 } 490 end_code += GET(end_code, 1); 491 } 492 while (*end_code == OP_ALT); 493 } 494 495 /* This is the code for a "normal" subpattern (not a backward assertion). The 496 start of a whole pattern is always one of these. 
If we are at the top level, 497 we may be asked to restart matching from the same point that we reached for a 498 previous partial match. We still have to scan through the top-level branches to 499 find the end state. */ 500 501 else 502 { 503 end_code = this_start_code; 504 505 /* Restarting */ 506 507 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0) 508 { 509 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); 510 new_count = workspace[1]; 511 if (!workspace[0]) 512 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock)); 513 } 514 515 /* Not restarting */ 516 517 else 518 { 519 int length = 1 + LINK_SIZE + 520 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 521 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 522 ? IMM2_SIZE:0); 523 do 524 { 525 ADD_NEW((int)(end_code - start_code + length), 0); 526 end_code += GET(end_code, 1); 527 length = 1 + LINK_SIZE; 528 } 529 while (*end_code == OP_ALT); 530 } 531 } 532 533 workspace[0] = 0; /* Bit indicating which vector is current */ 534 535 /* Loop for scanning the subject */ 536 537 ptr = current_subject; 538 for (;;) 539 { 540 int i, j; 541 int clen, dlen; 542 uint32_t c, d; 543 int forced_fail = 0; 544 BOOL partial_newline = FALSE; 545 BOOL could_continue = reset_could_continue; 546 reset_could_continue = FALSE; 547 548 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr; 549 550 /* Make the new state list into the active state list and empty the 551 new state list. 
*/ 552 553 temp_states = active_states; 554 active_states = new_states; 555 new_states = temp_states; 556 active_count = new_count; 557 new_count = 0; 558 559 workspace[0] ^= 1; /* Remember for the restarting feature */ 560 workspace[1] = active_count; 561 562 /* Set the pointers for adding new states */ 563 564 next_active_state = active_states + active_count; 565 next_new_state = new_states; 566 567 /* Load the current character from the subject outside the loop, as many 568 different states may want to look at it, and we assume that at least one 569 will. */ 570 571 if (ptr < end_subject) 572 { 573 clen = 1; /* Number of data items in the character */ 574 #ifdef SUPPORT_UNICODE 575 GETCHARLENTEST(c, ptr, clen); 576 #else 577 c = *ptr; 578 #endif /* SUPPORT_UNICODE */ 579 } 580 else 581 { 582 clen = 0; /* This indicates the end of the subject */ 583 c = NOTACHAR; /* This value should never actually be used */ 584 } 585 586 /* Scan up the active states and act on each one. The result of an action 587 may be to add more states to the currently active list (e.g. on hitting a 588 parenthesis) or it may be to put states on the new list, for considering 589 when we move the character pointer on. */ 590 591 for (i = 0; i < active_count; i++) 592 { 593 stateblock *current_state = active_states + i; 594 BOOL caseless = FALSE; 595 PCRE2_SPTR code; 596 uint32_t codevalue; 597 int state_offset = current_state->offset; 598 int rrc; 599 int count; 600 601 /* A negative offset is a special case meaning "hold off going to this 602 (negated) state until the number of characters in the data field have 603 been skipped". If the could_continue flag was passed over from a previous 604 state, arrange for it to passed on. 
*/ 605 606 if (state_offset < 0) 607 { 608 if (current_state->data > 0) 609 { 610 ADD_NEW_DATA(state_offset, current_state->count, 611 current_state->data - 1); 612 if (could_continue) reset_could_continue = TRUE; 613 continue; 614 } 615 else 616 { 617 current_state->offset = state_offset = -state_offset; 618 } 619 } 620 621 /* Check for a duplicate state with the same count, and skip if found. 622 See the note at the head of this module about the possibility of improving 623 performance here. */ 624 625 for (j = 0; j < i; j++) 626 { 627 if (active_states[j].offset == state_offset && 628 active_states[j].count == current_state->count) 629 goto NEXT_ACTIVE_STATE; 630 } 631 632 /* The state offset is the offset to the opcode */ 633 634 code = start_code + state_offset; 635 codevalue = *code; 636 637 /* If this opcode inspects a character, but we are at the end of the 638 subject, remember the fact for use when testing for a partial match. */ 639 640 if (clen == 0 && poptable[codevalue] != 0) 641 could_continue = TRUE; 642 643 /* If this opcode is followed by an inline character, load it. It is 644 tempting to test for the presence of a subject character here, but that 645 is wrong, because sometimes zero repetitions of the subject are 646 permitted. 647 648 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an 649 argument that is not a data character - but is always one byte long because 650 the values are small. We have to take special action to deal with \P, \p, 651 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert 652 these ones to new opcodes. 
*/ 653 654 if (coptable[codevalue] > 0) 655 { 656 dlen = 1; 657 #ifdef SUPPORT_UNICODE 658 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else 659 #endif /* SUPPORT_UNICODE */ 660 d = code[coptable[codevalue]]; 661 if (codevalue >= OP_TYPESTAR) 662 { 663 switch(d) 664 { 665 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM; 666 case OP_NOTPROP: 667 case OP_PROP: codevalue += OP_PROP_EXTRA; break; 668 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; 669 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; 670 case OP_NOT_HSPACE: 671 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; 672 case OP_NOT_VSPACE: 673 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; 674 default: break; 675 } 676 } 677 } 678 else 679 { 680 dlen = 0; /* Not strictly necessary, but compilers moan */ 681 d = NOTACHAR; /* if these variables are not set. */ 682 } 683 684 685 /* Now process the individual opcodes */ 686 687 switch (codevalue) 688 { 689 /* ========================================================================== */ 690 /* These cases are never obeyed. This is a fudge that causes a compile- 691 time error if the vectors coptable or poptable, which are indexed by 692 opcode, are not the correct length. It seems to be the only way to do 693 such a check at compile time, as the sizeof() operator does not work 694 in the C preprocessor. */ 695 696 case OP_TABLE_LENGTH: 697 case OP_TABLE_LENGTH + 698 ((sizeof(coptable) == OP_TABLE_LENGTH) && 699 (sizeof(poptable) == OP_TABLE_LENGTH)): 700 break; 701 702 /* ========================================================================== */ 703 /* Reached a closing bracket. If not at the end of the pattern, carry 704 on with the next opcode. For repeating opcodes, also add the repeat 705 state. Note that KETRPOS will always be encountered at the end of the 706 subpattern, because the possessive subpattern repeats are always handled 707 using recursive calls. Thus, it never adds any new states. 
708 709 At the end of the (sub)pattern, unless we have an empty string and 710 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the 711 start of the subject, save the match data, shifting up all previous 712 matches so we always have the longest first. */ 713 714 case OP_KET: 715 case OP_KETRMIN: 716 case OP_KETRMAX: 717 case OP_KETRPOS: 718 if (code != end_code) 719 { 720 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); 721 if (codevalue != OP_KET) 722 { 723 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0); 724 } 725 } 726 else 727 { 728 if (ptr > current_subject || 729 ((mb->moptions & PCRE2_NOTEMPTY) == 0 && 730 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 || 731 current_subject > start_subject + mb->start_offset))) 732 { 733 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; 734 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount) 735 match_count = 0; 736 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2; 737 if (count > 0) memmove(offsets + 2, offsets, 738 (size_t)count * sizeof(PCRE2_SIZE)); 739 if (offsetcount >= 2) 740 { 741 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject); 742 offsets[1] = (PCRE2_SIZE)(ptr - start_subject); 743 } 744 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count; 745 } 746 } 747 break; 748 749 /* ========================================================================== */ 750 /* These opcodes add to the current list of states without looking 751 at the current character. 
*/ 752 753 /*-----------------------------------------------------------------*/ 754 case OP_ALT: 755 do { code += GET(code, 1); } while (*code == OP_ALT); 756 ADD_ACTIVE((int)(code - start_code), 0); 757 break; 758 759 /*-----------------------------------------------------------------*/ 760 case OP_BRA: 761 case OP_SBRA: 762 do 763 { 764 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 765 code += GET(code, 1); 766 } 767 while (*code == OP_ALT); 768 break; 769 770 /*-----------------------------------------------------------------*/ 771 case OP_CBRA: 772 case OP_SCBRA: 773 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); 774 code += GET(code, 1); 775 while (*code == OP_ALT) 776 { 777 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 778 code += GET(code, 1); 779 } 780 break; 781 782 /*-----------------------------------------------------------------*/ 783 case OP_BRAZERO: 784 case OP_BRAMINZERO: 785 ADD_ACTIVE(state_offset + 1, 0); 786 code += 1 + GET(code, 2); 787 while (*code == OP_ALT) code += GET(code, 1); 788 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 789 break; 790 791 /*-----------------------------------------------------------------*/ 792 case OP_SKIPZERO: 793 code += 1 + GET(code, 2); 794 while (*code == OP_ALT) code += GET(code, 1); 795 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 796 break; 797 798 /*-----------------------------------------------------------------*/ 799 case OP_CIRC: 800 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) 801 { ADD_ACTIVE(state_offset + 1, 0); } 802 break; 803 804 /*-----------------------------------------------------------------*/ 805 case OP_CIRCM: 806 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) || 807 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 ) 808 && WAS_NEWLINE(ptr))) 809 { ADD_ACTIVE(state_offset + 1, 0); } 810 break; 811 812 
/*-----------------------------------------------------------------*/ 813 case OP_EOD: 814 if (ptr >= end_subject) 815 { 816 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 817 could_continue = TRUE; 818 else { ADD_ACTIVE(state_offset + 1, 0); } 819 } 820 break; 821 822 /*-----------------------------------------------------------------*/ 823 case OP_SOD: 824 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } 825 break; 826 827 /*-----------------------------------------------------------------*/ 828 case OP_SOM: 829 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } 830 break; 831 832 833 /* ========================================================================== */ 834 /* These opcodes inspect the next subject character, and sometimes 835 the previous one as well, but do not have an argument. The variable 836 clen contains the length of the current character and is zero if we are 837 at the end of the subject. */ 838 839 /*-----------------------------------------------------------------*/ 840 case OP_ANY: 841 if (clen > 0 && !IS_NEWLINE(ptr)) 842 { 843 if (ptr + 1 >= mb->end_subject && 844 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 845 NLBLOCK->nltype == NLTYPE_FIXED && 846 NLBLOCK->nllen == 2 && 847 c == NLBLOCK->nl[0]) 848 { 849 could_continue = partial_newline = TRUE; 850 } 851 else 852 { 853 ADD_NEW(state_offset + 1, 0); 854 } 855 } 856 break; 857 858 /*-----------------------------------------------------------------*/ 859 case OP_ALLANY: 860 if (clen > 0) 861 { ADD_NEW(state_offset + 1, 0); } 862 break; 863 864 /*-----------------------------------------------------------------*/ 865 case OP_EODN: 866 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 867 could_continue = TRUE; 868 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen)) 869 { ADD_ACTIVE(state_offset + 1, 0); } 870 break; 871 872 /*-----------------------------------------------------------------*/ 873 case OP_DOLL: 874 if 
((mb->moptions & PCRE2_NOTEOL) == 0) 875 { 876 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 877 could_continue = TRUE; 878 else if (clen == 0 || 879 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && 880 (ptr == end_subject - mb->nllen) 881 )) 882 { ADD_ACTIVE(state_offset + 1, 0); } 883 else if (ptr + 1 >= mb->end_subject && 884 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && 885 NLBLOCK->nltype == NLTYPE_FIXED && 886 NLBLOCK->nllen == 2 && 887 c == NLBLOCK->nl[0]) 888 { 889 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 890 { 891 reset_could_continue = TRUE; 892 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 893 } 894 else could_continue = partial_newline = TRUE; 895 } 896 } 897 break; 898 899 /*-----------------------------------------------------------------*/ 900 case OP_DOLLM: 901 if ((mb->moptions & PCRE2_NOTEOL) == 0) 902 { 903 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 904 could_continue = TRUE; 905 else if (clen == 0 || 906 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) 907 { ADD_ACTIVE(state_offset + 1, 0); } 908 else if (ptr + 1 >= mb->end_subject && 909 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && 910 NLBLOCK->nltype == NLTYPE_FIXED && 911 NLBLOCK->nllen == 2 && 912 c == NLBLOCK->nl[0]) 913 { 914 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 915 { 916 reset_could_continue = TRUE; 917 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 918 } 919 else could_continue = partial_newline = TRUE; 920 } 921 } 922 else if (IS_NEWLINE(ptr)) 923 { ADD_ACTIVE(state_offset + 1, 0); } 924 break; 925 926 /*-----------------------------------------------------------------*/ 927 928 case OP_DIGIT: 929 case OP_WHITESPACE: 930 case OP_WORDCHAR: 931 if (clen > 0 && c < 256 && 932 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) 933 { ADD_NEW(state_offset + 1, 0); } 934 break; 935 936 /*-----------------------------------------------------------------*/ 937 case OP_NOT_DIGIT: 
938 case OP_NOT_WHITESPACE: 939 case OP_NOT_WORDCHAR: 940 if (clen > 0 && (c >= 256 || 941 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) 942 { ADD_NEW(state_offset + 1, 0); } 943 break; 944 945 /*-----------------------------------------------------------------*/ 946 case OP_WORD_BOUNDARY: 947 case OP_NOT_WORD_BOUNDARY: 948 { 949 int left_word, right_word; 950 951 if (ptr > start_subject) 952 { 953 PCRE2_SPTR temp = ptr - 1; 954 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp; 955 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 956 if (utf) { BACKCHAR(temp); } 957 #endif 958 GETCHARTEST(d, temp); 959 #ifdef SUPPORT_UNICODE 960 if ((mb->poptions & PCRE2_UCP) != 0) 961 { 962 if (d == '_') left_word = TRUE; else 963 { 964 uint32_t cat = UCD_CATEGORY(d); 965 left_word = (cat == ucp_L || cat == ucp_N); 966 } 967 } 968 else 969 #endif 970 left_word = d < 256 && (ctypes[d] & ctype_word) != 0; 971 } 972 else left_word = FALSE; 973 974 if (clen > 0) 975 { 976 if (ptr >= mb->last_used_ptr) 977 { 978 PCRE2_SPTR temp = ptr + 1; 979 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 980 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); } 981 #endif 982 mb->last_used_ptr = temp; 983 } 984 #ifdef SUPPORT_UNICODE 985 if ((mb->poptions & PCRE2_UCP) != 0) 986 { 987 if (c == '_') right_word = TRUE; else 988 { 989 uint32_t cat = UCD_CATEGORY(c); 990 right_word = (cat == ucp_L || cat == ucp_N); 991 } 992 } 993 else 994 #endif 995 right_word = c < 256 && (ctypes[c] & ctype_word) != 0; 996 } 997 else right_word = FALSE; 998 999 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) 1000 { ADD_ACTIVE(state_offset + 1, 0); } 1001 } 1002 break; 1003 1004 1005 /*-----------------------------------------------------------------*/ 1006 /* Check the next character by Unicode property. We will get here only 1007 if the support is in the binary; otherwise a compile-time error occurs. 
1008 */ 1009 1010 #ifdef SUPPORT_UNICODE 1011 case OP_PROP: 1012 case OP_NOTPROP: 1013 if (clen > 0) 1014 { 1015 BOOL OK; 1016 const uint32_t *cp; 1017 const ucd_record * prop = GET_UCD(c); 1018 switch(code[1]) 1019 { 1020 case PT_ANY: 1021 OK = TRUE; 1022 break; 1023 1024 case PT_LAMP: 1025 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1026 prop->chartype == ucp_Lt; 1027 break; 1028 1029 case PT_GC: 1030 OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; 1031 break; 1032 1033 case PT_PC: 1034 OK = prop->chartype == code[2]; 1035 break; 1036 1037 case PT_SC: 1038 OK = prop->script == code[2]; 1039 break; 1040 1041 /* These are specials for combination cases. */ 1042 1043 case PT_ALNUM: 1044 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1045 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1046 break; 1047 1048 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1049 which means that Perl space and POSIX space are now identical. PCRE 1050 was changed at release 8.34. */ 1051 1052 case PT_SPACE: /* Perl space */ 1053 case PT_PXSPACE: /* POSIX space */ 1054 switch(c) 1055 { 1056 HSPACE_CASES: 1057 VSPACE_CASES: 1058 OK = TRUE; 1059 break; 1060 1061 default: 1062 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1063 break; 1064 } 1065 break; 1066 1067 case PT_WORD: 1068 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1069 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1070 c == CHAR_UNDERSCORE; 1071 break; 1072 1073 case PT_CLIST: 1074 cp = PRIV(ucd_caseless_sets) + code[2]; 1075 for (;;) 1076 { 1077 if (c < *cp) { OK = FALSE; break; } 1078 if (c == *cp++) { OK = TRUE; break; } 1079 } 1080 break; 1081 1082 case PT_UCNC: 1083 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1084 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1085 c >= 0xe000; 1086 break; 1087 1088 /* Should never occur, but keep compilers from grumbling. 
*/ 1089 1090 default: 1091 OK = codevalue != OP_PROP; 1092 break; 1093 } 1094 1095 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } 1096 } 1097 break; 1098 #endif 1099 1100 1101 1102 /* ========================================================================== */ 1103 /* These opcodes likewise inspect the subject character, but have an 1104 argument that is not a data character. It is one of these opcodes: 1105 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, 1106 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */ 1107 1108 case OP_TYPEPLUS: 1109 case OP_TYPEMINPLUS: 1110 case OP_TYPEPOSPLUS: 1111 count = current_state->count; /* Already matched */ 1112 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1113 if (clen > 0) 1114 { 1115 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1116 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1117 NLBLOCK->nltype == NLTYPE_FIXED && 1118 NLBLOCK->nllen == 2 && 1119 c == NLBLOCK->nl[0]) 1120 { 1121 could_continue = partial_newline = TRUE; 1122 } 1123 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1124 (c < 256 && 1125 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1126 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1127 { 1128 if (count > 0 && codevalue == OP_TYPEPOSPLUS) 1129 { 1130 active_count--; /* Remove non-match possibility */ 1131 next_active_state--; 1132 } 1133 count++; 1134 ADD_NEW(state_offset, count); 1135 } 1136 } 1137 break; 1138 1139 /*-----------------------------------------------------------------*/ 1140 case OP_TYPEQUERY: 1141 case OP_TYPEMINQUERY: 1142 case OP_TYPEPOSQUERY: 1143 ADD_ACTIVE(state_offset + 2, 0); 1144 if (clen > 0) 1145 { 1146 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1147 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1148 NLBLOCK->nltype == NLTYPE_FIXED && 1149 NLBLOCK->nllen == 2 && 1150 c == NLBLOCK->nl[0]) 1151 { 1152 could_continue = partial_newline = TRUE; 1153 } 1154 else if ((c >= 256 && d != 
OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1155 (c < 256 && 1156 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1157 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1158 { 1159 if (codevalue == OP_TYPEPOSQUERY) 1160 { 1161 active_count--; /* Remove non-match possibility */ 1162 next_active_state--; 1163 } 1164 ADD_NEW(state_offset + 2, 0); 1165 } 1166 } 1167 break; 1168 1169 /*-----------------------------------------------------------------*/ 1170 case OP_TYPESTAR: 1171 case OP_TYPEMINSTAR: 1172 case OP_TYPEPOSSTAR: 1173 ADD_ACTIVE(state_offset + 2, 0); 1174 if (clen > 0) 1175 { 1176 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1177 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1178 NLBLOCK->nltype == NLTYPE_FIXED && 1179 NLBLOCK->nllen == 2 && 1180 c == NLBLOCK->nl[0]) 1181 { 1182 could_continue = partial_newline = TRUE; 1183 } 1184 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1185 (c < 256 && 1186 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1187 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1188 { 1189 if (codevalue == OP_TYPEPOSSTAR) 1190 { 1191 active_count--; /* Remove non-match possibility */ 1192 next_active_state--; 1193 } 1194 ADD_NEW(state_offset, 0); 1195 } 1196 } 1197 break; 1198 1199 /*-----------------------------------------------------------------*/ 1200 case OP_TYPEEXACT: 1201 count = current_state->count; /* Number already matched */ 1202 if (clen > 0) 1203 { 1204 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1205 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1206 NLBLOCK->nltype == NLTYPE_FIXED && 1207 NLBLOCK->nllen == 2 && 1208 c == NLBLOCK->nl[0]) 1209 { 1210 could_continue = partial_newline = TRUE; 1211 } 1212 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1213 (c < 256 && 1214 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1215 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1216 { 1217 if (++count >= (int)GET2(code, 1)) 1218 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 
1, 0); } 1219 else 1220 { ADD_NEW(state_offset, count); } 1221 } 1222 } 1223 break; 1224 1225 /*-----------------------------------------------------------------*/ 1226 case OP_TYPEUPTO: 1227 case OP_TYPEMINUPTO: 1228 case OP_TYPEPOSUPTO: 1229 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); 1230 count = current_state->count; /* Number already matched */ 1231 if (clen > 0) 1232 { 1233 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1234 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1235 NLBLOCK->nltype == NLTYPE_FIXED && 1236 NLBLOCK->nllen == 2 && 1237 c == NLBLOCK->nl[0]) 1238 { 1239 could_continue = partial_newline = TRUE; 1240 } 1241 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1242 (c < 256 && 1243 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1244 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1245 { 1246 if (codevalue == OP_TYPEPOSUPTO) 1247 { 1248 active_count--; /* Remove non-match possibility */ 1249 next_active_state--; 1250 } 1251 if (++count >= (int)GET2(code, 1)) 1252 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } 1253 else 1254 { ADD_NEW(state_offset, count); } 1255 } 1256 } 1257 break; 1258 1259 /* ========================================================================== */ 1260 /* These are virtual opcodes that are used when something like 1261 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its 1262 argument. It keeps the code above fast for the other cases. The argument 1263 is in the d variable. 
*/ 1264 1265 #ifdef SUPPORT_UNICODE 1266 case OP_PROP_EXTRA + OP_TYPEPLUS: 1267 case OP_PROP_EXTRA + OP_TYPEMINPLUS: 1268 case OP_PROP_EXTRA + OP_TYPEPOSPLUS: 1269 count = current_state->count; /* Already matched */ 1270 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } 1271 if (clen > 0) 1272 { 1273 BOOL OK; 1274 const uint32_t *cp; 1275 const ucd_record * prop = GET_UCD(c); 1276 switch(code[2]) 1277 { 1278 case PT_ANY: 1279 OK = TRUE; 1280 break; 1281 1282 case PT_LAMP: 1283 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1284 prop->chartype == ucp_Lt; 1285 break; 1286 1287 case PT_GC: 1288 OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1289 break; 1290 1291 case PT_PC: 1292 OK = prop->chartype == code[3]; 1293 break; 1294 1295 case PT_SC: 1296 OK = prop->script == code[3]; 1297 break; 1298 1299 /* These are specials for combination cases. */ 1300 1301 case PT_ALNUM: 1302 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1303 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1304 break; 1305 1306 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1307 which means that Perl space and POSIX space are now identical. PCRE 1308 was changed at release 8.34. 
*/ 1309 1310 case PT_SPACE: /* Perl space */ 1311 case PT_PXSPACE: /* POSIX space */ 1312 switch(c) 1313 { 1314 HSPACE_CASES: 1315 VSPACE_CASES: 1316 OK = TRUE; 1317 break; 1318 1319 default: 1320 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1321 break; 1322 } 1323 break; 1324 1325 case PT_WORD: 1326 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1327 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1328 c == CHAR_UNDERSCORE; 1329 break; 1330 1331 case PT_CLIST: 1332 cp = PRIV(ucd_caseless_sets) + code[3]; 1333 for (;;) 1334 { 1335 if (c < *cp) { OK = FALSE; break; } 1336 if (c == *cp++) { OK = TRUE; break; } 1337 } 1338 break; 1339 1340 case PT_UCNC: 1341 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1342 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1343 c >= 0xe000; 1344 break; 1345 1346 /* Should never occur, but keep compilers from grumbling. */ 1347 1348 default: 1349 OK = codevalue != OP_PROP; 1350 break; 1351 } 1352 1353 if (OK == (d == OP_PROP)) 1354 { 1355 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) 1356 { 1357 active_count--; /* Remove non-match possibility */ 1358 next_active_state--; 1359 } 1360 count++; 1361 ADD_NEW(state_offset, count); 1362 } 1363 } 1364 break; 1365 1366 /*-----------------------------------------------------------------*/ 1367 case OP_EXTUNI_EXTRA + OP_TYPEPLUS: 1368 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: 1369 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: 1370 count = current_state->count; /* Already matched */ 1371 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1372 if (clen > 0) 1373 { 1374 uint32_t lgb, rgb; 1375 PCRE2_SPTR nptr = ptr + clen; 1376 int ncount = 0; 1377 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) 1378 { 1379 active_count--; /* Remove non-match possibility */ 1380 next_active_state--; 1381 } 1382 lgb = UCD_GRAPHBREAK(c); 1383 while (nptr < end_subject) 1384 { 1385 dlen = 1; 1386 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1387 rgb = 
UCD_GRAPHBREAK(d); 1388 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; 1389 ncount++; 1390 lgb = rgb; 1391 nptr += dlen; 1392 } 1393 count++; 1394 ADD_NEW_DATA(-state_offset, count, ncount); 1395 } 1396 break; 1397 #endif 1398 1399 /*-----------------------------------------------------------------*/ 1400 case OP_ANYNL_EXTRA + OP_TYPEPLUS: 1401 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: 1402 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: 1403 count = current_state->count; /* Already matched */ 1404 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1405 if (clen > 0) 1406 { 1407 int ncount = 0; 1408 switch (c) 1409 { 1410 case CHAR_VT: 1411 case CHAR_FF: 1412 case CHAR_NEL: 1413 #ifndef EBCDIC 1414 case 0x2028: 1415 case 0x2029: 1416 #endif /* Not EBCDIC */ 1417 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 1418 goto ANYNL01; 1419 1420 case CHAR_CR: 1421 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; 1422 /* Fall through */ 1423 1424 ANYNL01: 1425 case CHAR_LF: 1426 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) 1427 { 1428 active_count--; /* Remove non-match possibility */ 1429 next_active_state--; 1430 } 1431 count++; 1432 ADD_NEW_DATA(-state_offset, count, ncount); 1433 break; 1434 1435 default: 1436 break; 1437 } 1438 } 1439 break; 1440 1441 /*-----------------------------------------------------------------*/ 1442 case OP_VSPACE_EXTRA + OP_TYPEPLUS: 1443 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: 1444 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: 1445 count = current_state->count; /* Already matched */ 1446 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1447 if (clen > 0) 1448 { 1449 BOOL OK; 1450 switch (c) 1451 { 1452 VSPACE_CASES: 1453 OK = TRUE; 1454 break; 1455 1456 default: 1457 OK = FALSE; 1458 break; 1459 } 1460 1461 if (OK == (d == OP_VSPACE)) 1462 { 1463 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) 1464 { 1465 active_count--; /* Remove non-match possibility */ 1466 next_active_state--; 1467 } 
1468 count++; 1469 ADD_NEW_DATA(-state_offset, count, 0); 1470 } 1471 } 1472 break; 1473 1474 /*-----------------------------------------------------------------*/ 1475 case OP_HSPACE_EXTRA + OP_TYPEPLUS: 1476 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: 1477 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: 1478 count = current_state->count; /* Already matched */ 1479 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1480 if (clen > 0) 1481 { 1482 BOOL OK; 1483 switch (c) 1484 { 1485 HSPACE_CASES: 1486 OK = TRUE; 1487 break; 1488 1489 default: 1490 OK = FALSE; 1491 break; 1492 } 1493 1494 if (OK == (d == OP_HSPACE)) 1495 { 1496 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) 1497 { 1498 active_count--; /* Remove non-match possibility */ 1499 next_active_state--; 1500 } 1501 count++; 1502 ADD_NEW_DATA(-state_offset, count, 0); 1503 } 1504 } 1505 break; 1506 1507 /*-----------------------------------------------------------------*/ 1508 #ifdef SUPPORT_UNICODE 1509 case OP_PROP_EXTRA + OP_TYPEQUERY: 1510 case OP_PROP_EXTRA + OP_TYPEMINQUERY: 1511 case OP_PROP_EXTRA + OP_TYPEPOSQUERY: 1512 count = 4; 1513 goto QS1; 1514 1515 case OP_PROP_EXTRA + OP_TYPESTAR: 1516 case OP_PROP_EXTRA + OP_TYPEMINSTAR: 1517 case OP_PROP_EXTRA + OP_TYPEPOSSTAR: 1518 count = 0; 1519 1520 QS1: 1521 1522 ADD_ACTIVE(state_offset + 4, 0); 1523 if (clen > 0) 1524 { 1525 BOOL OK; 1526 const uint32_t *cp; 1527 const ucd_record * prop = GET_UCD(c); 1528 switch(code[2]) 1529 { 1530 case PT_ANY: 1531 OK = TRUE; 1532 break; 1533 1534 case PT_LAMP: 1535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1536 prop->chartype == ucp_Lt; 1537 break; 1538 1539 case PT_GC: 1540 OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1541 break; 1542 1543 case PT_PC: 1544 OK = prop->chartype == code[3]; 1545 break; 1546 1547 case PT_SC: 1548 OK = prop->script == code[3]; 1549 break; 1550 1551 /* These are specials for combination cases. 
*/ 1552 1553 case PT_ALNUM: 1554 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1555 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1556 break; 1557 1558 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1559 which means that Perl space and POSIX space are now identical. PCRE 1560 was changed at release 8.34. */ 1561 1562 case PT_SPACE: /* Perl space */ 1563 case PT_PXSPACE: /* POSIX space */ 1564 switch(c) 1565 { 1566 HSPACE_CASES: 1567 VSPACE_CASES: 1568 OK = TRUE; 1569 break; 1570 1571 default: 1572 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1573 break; 1574 } 1575 break; 1576 1577 case PT_WORD: 1578 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1579 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1580 c == CHAR_UNDERSCORE; 1581 break; 1582 1583 case PT_CLIST: 1584 cp = PRIV(ucd_caseless_sets) + code[3]; 1585 for (;;) 1586 { 1587 if (c < *cp) { OK = FALSE; break; } 1588 if (c == *cp++) { OK = TRUE; break; } 1589 } 1590 break; 1591 1592 case PT_UCNC: 1593 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1594 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1595 c >= 0xe000; 1596 break; 1597 1598 /* Should never occur, but keep compilers from grumbling. 
*/ 1599 1600 default: 1601 OK = codevalue != OP_PROP; 1602 break; 1603 } 1604 1605 if (OK == (d == OP_PROP)) 1606 { 1607 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || 1608 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) 1609 { 1610 active_count--; /* Remove non-match possibility */ 1611 next_active_state--; 1612 } 1613 ADD_NEW(state_offset + count, 0); 1614 } 1615 } 1616 break; 1617 1618 /*-----------------------------------------------------------------*/ 1619 case OP_EXTUNI_EXTRA + OP_TYPEQUERY: 1620 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: 1621 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: 1622 count = 2; 1623 goto QS2; 1624 1625 case OP_EXTUNI_EXTRA + OP_TYPESTAR: 1626 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: 1627 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: 1628 count = 0; 1629 1630 QS2: 1631 1632 ADD_ACTIVE(state_offset + 2, 0); 1633 if (clen > 0) 1634 { 1635 uint32_t lgb, rgb; 1636 PCRE2_SPTR nptr = ptr + clen; 1637 int ncount = 0; 1638 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || 1639 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) 1640 { 1641 active_count--; /* Remove non-match possibility */ 1642 next_active_state--; 1643 } 1644 lgb = UCD_GRAPHBREAK(c); 1645 while (nptr < end_subject) 1646 { 1647 dlen = 1; 1648 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1649 rgb = UCD_GRAPHBREAK(d); 1650 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; 1651 ncount++; 1652 lgb = rgb; 1653 nptr += dlen; 1654 } 1655 ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1656 } 1657 break; 1658 #endif 1659 1660 /*-----------------------------------------------------------------*/ 1661 case OP_ANYNL_EXTRA + OP_TYPEQUERY: 1662 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: 1663 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: 1664 count = 2; 1665 goto QS3; 1666 1667 case OP_ANYNL_EXTRA + OP_TYPESTAR: 1668 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: 1669 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: 1670 count = 0; 1671 1672 QS3: 1673 ADD_ACTIVE(state_offset + 2, 0); 1674 if (clen > 0) 1675 { 1676 
int ncount = 0; 1677 switch (c) 1678 { 1679 case CHAR_VT: 1680 case CHAR_FF: 1681 case CHAR_NEL: 1682 #ifndef EBCDIC 1683 case 0x2028: 1684 case 0x2029: 1685 #endif /* Not EBCDIC */ 1686 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 1687 goto ANYNL02; 1688 1689 case CHAR_CR: 1690 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; 1691 /* Fall through */ 1692 1693 ANYNL02: 1694 case CHAR_LF: 1695 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || 1696 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) 1697 { 1698 active_count--; /* Remove non-match possibility */ 1699 next_active_state--; 1700 } 1701 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); 1702 break; 1703 1704 default: 1705 break; 1706 } 1707 } 1708 break; 1709 1710 /*-----------------------------------------------------------------*/ 1711 case OP_VSPACE_EXTRA + OP_TYPEQUERY: 1712 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: 1713 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: 1714 count = 2; 1715 goto QS4; 1716 1717 case OP_VSPACE_EXTRA + OP_TYPESTAR: 1718 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: 1719 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: 1720 count = 0; 1721 1722 QS4: 1723 ADD_ACTIVE(state_offset + 2, 0); 1724 if (clen > 0) 1725 { 1726 BOOL OK; 1727 switch (c) 1728 { 1729 VSPACE_CASES: 1730 OK = TRUE; 1731 break; 1732 1733 default: 1734 OK = FALSE; 1735 break; 1736 } 1737 if (OK == (d == OP_VSPACE)) 1738 { 1739 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || 1740 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) 1741 { 1742 active_count--; /* Remove non-match possibility */ 1743 next_active_state--; 1744 } 1745 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1746 } 1747 } 1748 break; 1749 1750 /*-----------------------------------------------------------------*/ 1751 case OP_HSPACE_EXTRA + OP_TYPEQUERY: 1752 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: 1753 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: 1754 count = 2; 1755 goto QS5; 1756 1757 case OP_HSPACE_EXTRA + OP_TYPESTAR: 1758 case 
OP_HSPACE_EXTRA + OP_TYPEMINSTAR: 1759 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: 1760 count = 0; 1761 1762 QS5: 1763 ADD_ACTIVE(state_offset + 2, 0); 1764 if (clen > 0) 1765 { 1766 BOOL OK; 1767 switch (c) 1768 { 1769 HSPACE_CASES: 1770 OK = TRUE; 1771 break; 1772 1773 default: 1774 OK = FALSE; 1775 break; 1776 } 1777 1778 if (OK == (d == OP_HSPACE)) 1779 { 1780 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || 1781 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) 1782 { 1783 active_count--; /* Remove non-match possibility */ 1784 next_active_state--; 1785 } 1786 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1787 } 1788 } 1789 break; 1790 1791 /*-----------------------------------------------------------------*/ 1792 #ifdef SUPPORT_UNICODE 1793 case OP_PROP_EXTRA + OP_TYPEEXACT: 1794 case OP_PROP_EXTRA + OP_TYPEUPTO: 1795 case OP_PROP_EXTRA + OP_TYPEMINUPTO: 1796 case OP_PROP_EXTRA + OP_TYPEPOSUPTO: 1797 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) 1798 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } 1799 count = current_state->count; /* Number already matched */ 1800 if (clen > 0) 1801 { 1802 BOOL OK; 1803 const uint32_t *cp; 1804 const ucd_record * prop = GET_UCD(c); 1805 switch(code[1 + IMM2_SIZE + 1]) 1806 { 1807 case PT_ANY: 1808 OK = TRUE; 1809 break; 1810 1811 case PT_LAMP: 1812 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1813 prop->chartype == ucp_Lt; 1814 break; 1815 1816 case PT_GC: 1817 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; 1818 break; 1819 1820 case PT_PC: 1821 OK = prop->chartype == code[1 + IMM2_SIZE + 2]; 1822 break; 1823 1824 case PT_SC: 1825 OK = prop->script == code[1 + IMM2_SIZE + 2]; 1826 break; 1827 1828 /* These are specials for combination cases. 
*/ 1829 1830 case PT_ALNUM: 1831 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1832 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1833 break; 1834 1835 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1836 which means that Perl space and POSIX space are now identical. PCRE 1837 was changed at release 8.34. */ 1838 1839 case PT_SPACE: /* Perl space */ 1840 case PT_PXSPACE: /* POSIX space */ 1841 switch(c) 1842 { 1843 HSPACE_CASES: 1844 VSPACE_CASES: 1845 OK = TRUE; 1846 break; 1847 1848 default: 1849 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1850 break; 1851 } 1852 break; 1853 1854 case PT_WORD: 1855 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1856 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1857 c == CHAR_UNDERSCORE; 1858 break; 1859 1860 case PT_CLIST: 1861 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; 1862 for (;;) 1863 { 1864 if (c < *cp) { OK = FALSE; break; } 1865 if (c == *cp++) { OK = TRUE; break; } 1866 } 1867 break; 1868 1869 case PT_UCNC: 1870 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1871 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1872 c >= 0xe000; 1873 break; 1874 1875 /* Should never occur, but keep compilers from grumbling. 
*/ 1876 1877 default: 1878 OK = codevalue != OP_PROP; 1879 break; 1880 } 1881 1882 if (OK == (d == OP_PROP)) 1883 { 1884 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) 1885 { 1886 active_count--; /* Remove non-match possibility */ 1887 next_active_state--; 1888 } 1889 if (++count >= (int)GET2(code, 1)) 1890 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } 1891 else 1892 { ADD_NEW(state_offset, count); } 1893 } 1894 } 1895 break; 1896 1897 /*-----------------------------------------------------------------*/ 1898 case OP_EXTUNI_EXTRA + OP_TYPEEXACT: 1899 case OP_EXTUNI_EXTRA + OP_TYPEUPTO: 1900 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: 1901 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: 1902 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) 1903 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1904 count = current_state->count; /* Number already matched */ 1905 if (clen > 0) 1906 { 1907 uint32_t lgb, rgb; 1908 PCRE2_SPTR nptr = ptr + clen; 1909 int ncount = 0; 1910 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) 1911 { 1912 active_count--; /* Remove non-match possibility */ 1913 next_active_state--; 1914 } 1915 lgb = UCD_GRAPHBREAK(c); 1916 while (nptr < end_subject) 1917 { 1918 dlen = 1; 1919 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 1920 rgb = UCD_GRAPHBREAK(d); 1921 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; 1922 ncount++; 1923 lgb = rgb; 1924 nptr += dlen; 1925 } 1926 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 1927 reset_could_continue = TRUE; 1928 if (++count >= (int)GET2(code, 1)) 1929 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 1930 else 1931 { ADD_NEW_DATA(-state_offset, count, ncount); } 1932 } 1933 break; 1934 #endif 1935 1936 /*-----------------------------------------------------------------*/ 1937 case OP_ANYNL_EXTRA + OP_TYPEEXACT: 1938 case OP_ANYNL_EXTRA + OP_TYPEUPTO: 1939 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: 1940 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: 1941 if (codevalue != 
OP_ANYNL_EXTRA + OP_TYPEEXACT) 1942 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1943 count = current_state->count; /* Number already matched */ 1944 if (clen > 0) 1945 { 1946 int ncount = 0; 1947 switch (c) 1948 { 1949 case CHAR_VT: 1950 case CHAR_FF: 1951 case CHAR_NEL: 1952 #ifndef EBCDIC 1953 case 0x2028: 1954 case 0x2029: 1955 #endif /* Not EBCDIC */ 1956 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 1957 goto ANYNL03; 1958 1959 case CHAR_CR: 1960 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; 1961 /* Fall through */ 1962 1963 ANYNL03: 1964 case CHAR_LF: 1965 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) 1966 { 1967 active_count--; /* Remove non-match possibility */ 1968 next_active_state--; 1969 } 1970 if (++count >= (int)GET2(code, 1)) 1971 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 1972 else 1973 { ADD_NEW_DATA(-state_offset, count, ncount); } 1974 break; 1975 1976 default: 1977 break; 1978 } 1979 } 1980 break; 1981 1982 /*-----------------------------------------------------------------*/ 1983 case OP_VSPACE_EXTRA + OP_TYPEEXACT: 1984 case OP_VSPACE_EXTRA + OP_TYPEUPTO: 1985 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: 1986 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: 1987 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) 1988 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1989 count = current_state->count; /* Number already matched */ 1990 if (clen > 0) 1991 { 1992 BOOL OK; 1993 switch (c) 1994 { 1995 VSPACE_CASES: 1996 OK = TRUE; 1997 break; 1998 1999 default: 2000 OK = FALSE; 2001 } 2002 2003 if (OK == (d == OP_VSPACE)) 2004 { 2005 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) 2006 { 2007 active_count--; /* Remove non-match possibility */ 2008 next_active_state--; 2009 } 2010 if (++count >= (int)GET2(code, 1)) 2011 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2012 else 2013 { ADD_NEW_DATA(-state_offset, count, 0); } 2014 } 2015 } 2016 break; 2017 2018 
/*-----------------------------------------------------------------*/ 2019 case OP_HSPACE_EXTRA + OP_TYPEEXACT: 2020 case OP_HSPACE_EXTRA + OP_TYPEUPTO: 2021 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: 2022 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: 2023 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) 2024 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2025 count = current_state->count; /* Number already matched */ 2026 if (clen > 0) 2027 { 2028 BOOL OK; 2029 switch (c) 2030 { 2031 HSPACE_CASES: 2032 OK = TRUE; 2033 break; 2034 2035 default: 2036 OK = FALSE; 2037 break; 2038 } 2039 2040 if (OK == (d == OP_HSPACE)) 2041 { 2042 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) 2043 { 2044 active_count--; /* Remove non-match possibility */ 2045 next_active_state--; 2046 } 2047 if (++count >= (int)GET2(code, 1)) 2048 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2049 else 2050 { ADD_NEW_DATA(-state_offset, count, 0); } 2051 } 2052 } 2053 break; 2054 2055 /* ========================================================================== */ 2056 /* These opcodes are followed by a character that is usually compared 2057 to the current subject character; it is loaded into d. We still get 2058 here even if there is no subject character, because in some cases zero 2059 repetitions are permitted. 
*/ 2060 2061 /*-----------------------------------------------------------------*/ 2062 case OP_CHAR: 2063 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } 2064 break; 2065 2066 /*-----------------------------------------------------------------*/ 2067 case OP_CHARI: 2068 if (clen == 0) break; 2069 2070 #ifdef SUPPORT_UNICODE 2071 if (utf) 2072 { 2073 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else 2074 { 2075 unsigned int othercase; 2076 if (c < 128) 2077 othercase = fcc[c]; 2078 else 2079 othercase = UCD_OTHERCASE(c); 2080 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } 2081 } 2082 } 2083 else 2084 #endif /* SUPPORT_UNICODE */ 2085 /* Not UTF mode */ 2086 { 2087 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) 2088 { ADD_NEW(state_offset + 2, 0); } 2089 } 2090 break; 2091 2092 2093 #ifdef SUPPORT_UNICODE 2094 /*-----------------------------------------------------------------*/ 2095 /* This is a tricky one because it can match more than one character. 2096 Find out how many characters to skip, and then set up a negative state 2097 to wait for them to pass before continuing. */ 2098 2099 case OP_EXTUNI: 2100 if (clen > 0) 2101 { 2102 uint32_t lgb, rgb; 2103 PCRE2_SPTR nptr = ptr + clen; 2104 int ncount = 0; 2105 lgb = UCD_GRAPHBREAK(c); 2106 while (nptr < end_subject) 2107 { 2108 dlen = 1; 2109 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } 2110 rgb = UCD_GRAPHBREAK(d); 2111 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; 2112 ncount++; 2113 lgb = rgb; 2114 nptr += dlen; 2115 } 2116 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 2117 reset_could_continue = TRUE; 2118 ADD_NEW_DATA(-(state_offset + 1), 0, ncount); 2119 } 2120 break; 2121 #endif 2122 2123 /*-----------------------------------------------------------------*/ 2124 /* This is a tricky like EXTUNI because it too can match more than one 2125 character (when CR is followed by LF). 
In this case, set up a negative 2126 state to wait for one character to pass before continuing. */ 2127 2128 case OP_ANYNL: 2129 if (clen > 0) switch(c) 2130 { 2131 case CHAR_VT: 2132 case CHAR_FF: 2133 case CHAR_NEL: 2134 #ifndef EBCDIC 2135 case 0x2028: 2136 case 0x2029: 2137 #endif /* Not EBCDIC */ 2138 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 2139 2140 case CHAR_LF: 2141 ADD_NEW(state_offset + 1, 0); 2142 break; 2143 2144 case CHAR_CR: 2145 if (ptr + 1 >= end_subject) 2146 { 2147 ADD_NEW(state_offset + 1, 0); 2148 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 2149 reset_could_continue = TRUE; 2150 } 2151 else if (UCHAR21TEST(ptr + 1) == CHAR_LF) 2152 { 2153 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 2154 } 2155 else 2156 { 2157 ADD_NEW(state_offset + 1, 0); 2158 } 2159 break; 2160 } 2161 break; 2162 2163 /*-----------------------------------------------------------------*/ 2164 case OP_NOT_VSPACE: 2165 if (clen > 0) switch(c) 2166 { 2167 VSPACE_CASES: 2168 break; 2169 2170 default: 2171 ADD_NEW(state_offset + 1, 0); 2172 break; 2173 } 2174 break; 2175 2176 /*-----------------------------------------------------------------*/ 2177 case OP_VSPACE: 2178 if (clen > 0) switch(c) 2179 { 2180 VSPACE_CASES: 2181 ADD_NEW(state_offset + 1, 0); 2182 break; 2183 2184 default: 2185 break; 2186 } 2187 break; 2188 2189 /*-----------------------------------------------------------------*/ 2190 case OP_NOT_HSPACE: 2191 if (clen > 0) switch(c) 2192 { 2193 HSPACE_CASES: 2194 break; 2195 2196 default: 2197 ADD_NEW(state_offset + 1, 0); 2198 break; 2199 } 2200 break; 2201 2202 /*-----------------------------------------------------------------*/ 2203 case OP_HSPACE: 2204 if (clen > 0) switch(c) 2205 { 2206 HSPACE_CASES: 2207 ADD_NEW(state_offset + 1, 0); 2208 break; 2209 2210 default: 2211 break; 2212 } 2213 break; 2214 2215 /*-----------------------------------------------------------------*/ 2216 /* Match a negated single character casefully. 
*/ 2217 2218 case OP_NOT: 2219 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } 2220 break; 2221 2222 /*-----------------------------------------------------------------*/ 2223 /* Match a negated single character caselessly. */ 2224 2225 case OP_NOTI: 2226 if (clen > 0) 2227 { 2228 unsigned int otherd; 2229 #ifdef SUPPORT_UNICODE 2230 if (utf && d >= 128) 2231 otherd = UCD_OTHERCASE(d); 2232 else 2233 #endif /* SUPPORT_UNICODE */ 2234 otherd = TABLE_GET(d, fcc, d); 2235 if (c != d && c != otherd) 2236 { ADD_NEW(state_offset + dlen + 1, 0); } 2237 } 2238 break; 2239 2240 /*-----------------------------------------------------------------*/ 2241 case OP_PLUSI: 2242 case OP_MINPLUSI: 2243 case OP_POSPLUSI: 2244 case OP_NOTPLUSI: 2245 case OP_NOTMINPLUSI: 2246 case OP_NOTPOSPLUSI: 2247 caseless = TRUE; 2248 codevalue -= OP_STARI - OP_STAR; 2249 2250 /* Fall through */ 2251 case OP_PLUS: 2252 case OP_MINPLUS: 2253 case OP_POSPLUS: 2254 case OP_NOTPLUS: 2255 case OP_NOTMINPLUS: 2256 case OP_NOTPOSPLUS: 2257 count = current_state->count; /* Already matched */ 2258 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } 2259 if (clen > 0) 2260 { 2261 uint32_t otherd = NOTACHAR; 2262 if (caseless) 2263 { 2264 #ifdef SUPPORT_UNICODE 2265 if (utf && d >= 128) 2266 otherd = UCD_OTHERCASE(d); 2267 else 2268 #endif /* SUPPORT_UNICODE */ 2269 otherd = TABLE_GET(d, fcc, d); 2270 } 2271 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2272 { 2273 if (count > 0 && 2274 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) 2275 { 2276 active_count--; /* Remove non-match possibility */ 2277 next_active_state--; 2278 } 2279 count++; 2280 ADD_NEW(state_offset, count); 2281 } 2282 } 2283 break; 2284 2285 /*-----------------------------------------------------------------*/ 2286 case OP_QUERYI: 2287 case OP_MINQUERYI: 2288 case OP_POSQUERYI: 2289 case OP_NOTQUERYI: 2290 case OP_NOTMINQUERYI: 2291 case OP_NOTPOSQUERYI: 2292 caseless = TRUE; 2293 codevalue -= 
OP_STARI - OP_STAR; 2294 /* Fall through */ 2295 case OP_QUERY: 2296 case OP_MINQUERY: 2297 case OP_POSQUERY: 2298 case OP_NOTQUERY: 2299 case OP_NOTMINQUERY: 2300 case OP_NOTPOSQUERY: 2301 ADD_ACTIVE(state_offset + dlen + 1, 0); 2302 if (clen > 0) 2303 { 2304 uint32_t otherd = NOTACHAR; 2305 if (caseless) 2306 { 2307 #ifdef SUPPORT_UNICODE 2308 if (utf && d >= 128) 2309 otherd = UCD_OTHERCASE(d); 2310 else 2311 #endif /* SUPPORT_UNICODE */ 2312 otherd = TABLE_GET(d, fcc, d); 2313 } 2314 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2315 { 2316 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) 2317 { 2318 active_count--; /* Remove non-match possibility */ 2319 next_active_state--; 2320 } 2321 ADD_NEW(state_offset + dlen + 1, 0); 2322 } 2323 } 2324 break; 2325 2326 /*-----------------------------------------------------------------*/ 2327 case OP_STARI: 2328 case OP_MINSTARI: 2329 case OP_POSSTARI: 2330 case OP_NOTSTARI: 2331 case OP_NOTMINSTARI: 2332 case OP_NOTPOSSTARI: 2333 caseless = TRUE; 2334 codevalue -= OP_STARI - OP_STAR; 2335 /* Fall through */ 2336 case OP_STAR: 2337 case OP_MINSTAR: 2338 case OP_POSSTAR: 2339 case OP_NOTSTAR: 2340 case OP_NOTMINSTAR: 2341 case OP_NOTPOSSTAR: 2342 ADD_ACTIVE(state_offset + dlen + 1, 0); 2343 if (clen > 0) 2344 { 2345 uint32_t otherd = NOTACHAR; 2346 if (caseless) 2347 { 2348 #ifdef SUPPORT_UNICODE 2349 if (utf && d >= 128) 2350 otherd = UCD_OTHERCASE(d); 2351 else 2352 #endif /* SUPPORT_UNICODE */ 2353 otherd = TABLE_GET(d, fcc, d); 2354 } 2355 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2356 { 2357 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) 2358 { 2359 active_count--; /* Remove non-match possibility */ 2360 next_active_state--; 2361 } 2362 ADD_NEW(state_offset, 0); 2363 } 2364 } 2365 break; 2366 2367 /*-----------------------------------------------------------------*/ 2368 case OP_EXACTI: 2369 case OP_NOTEXACTI: 2370 caseless = TRUE; 2371 codevalue -= OP_STARI - 
OP_STAR; 2372 /* Fall through */ 2373 case OP_EXACT: 2374 case OP_NOTEXACT: 2375 count = current_state->count; /* Number already matched */ 2376 if (clen > 0) 2377 { 2378 uint32_t otherd = NOTACHAR; 2379 if (caseless) 2380 { 2381 #ifdef SUPPORT_UNICODE 2382 if (utf && d >= 128) 2383 otherd = UCD_OTHERCASE(d); 2384 else 2385 #endif /* SUPPORT_UNICODE */ 2386 otherd = TABLE_GET(d, fcc, d); 2387 } 2388 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2389 { 2390 if (++count >= (int)GET2(code, 1)) 2391 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2392 else 2393 { ADD_NEW(state_offset, count); } 2394 } 2395 } 2396 break; 2397 2398 /*-----------------------------------------------------------------*/ 2399 case OP_UPTOI: 2400 case OP_MINUPTOI: 2401 case OP_POSUPTOI: 2402 case OP_NOTUPTOI: 2403 case OP_NOTMINUPTOI: 2404 case OP_NOTPOSUPTOI: 2405 caseless = TRUE; 2406 codevalue -= OP_STARI - OP_STAR; 2407 /* Fall through */ 2408 case OP_UPTO: 2409 case OP_MINUPTO: 2410 case OP_POSUPTO: 2411 case OP_NOTUPTO: 2412 case OP_NOTMINUPTO: 2413 case OP_NOTPOSUPTO: 2414 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); 2415 count = current_state->count; /* Number already matched */ 2416 if (clen > 0) 2417 { 2418 uint32_t otherd = NOTACHAR; 2419 if (caseless) 2420 { 2421 #ifdef SUPPORT_UNICODE 2422 if (utf && d >= 128) 2423 otherd = UCD_OTHERCASE(d); 2424 else 2425 #endif /* SUPPORT_UNICODE */ 2426 otherd = TABLE_GET(d, fcc, d); 2427 } 2428 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2429 { 2430 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) 2431 { 2432 active_count--; /* Remove non-match possibility */ 2433 next_active_state--; 2434 } 2435 if (++count >= (int)GET2(code, 1)) 2436 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2437 else 2438 { ADD_NEW(state_offset, count); } 2439 } 2440 } 2441 break; 2442 2443 2444 /* ========================================================================== */ 2445 /* These are the class-handling 
opcodes */ 2446 2447 case OP_CLASS: 2448 case OP_NCLASS: 2449 case OP_XCLASS: 2450 { 2451 BOOL isinclass = FALSE; 2452 int next_state_offset; 2453 PCRE2_SPTR ecode; 2454 2455 /* For a simple class, there is always just a 32-byte table, and we 2456 can set isinclass from it. */ 2457 2458 if (codevalue != OP_XCLASS) 2459 { 2460 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); 2461 if (clen > 0) 2462 { 2463 isinclass = (c > 255)? (codevalue == OP_NCLASS) : 2464 ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0); 2465 } 2466 } 2467 2468 /* An extended class may have a table or a list of single characters, 2469 ranges, or both, and it may be positive or negative. There's a 2470 function that sorts all this out. */ 2471 2472 else 2473 { 2474 ecode = code + GET(code, 1); 2475 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); 2476 } 2477 2478 /* At this point, isinclass is set for all kinds of class, and ecode 2479 points to the byte after the end of the class. If there is a 2480 quantifier, this is where it will be. 
*/ 2481 2482 next_state_offset = (int)(ecode - start_code); 2483 2484 switch (*ecode) 2485 { 2486 case OP_CRSTAR: 2487 case OP_CRMINSTAR: 2488 case OP_CRPOSSTAR: 2489 ADD_ACTIVE(next_state_offset + 1, 0); 2490 if (isinclass) 2491 { 2492 if (*ecode == OP_CRPOSSTAR) 2493 { 2494 active_count--; /* Remove non-match possibility */ 2495 next_active_state--; 2496 } 2497 ADD_NEW(state_offset, 0); 2498 } 2499 break; 2500 2501 case OP_CRPLUS: 2502 case OP_CRMINPLUS: 2503 case OP_CRPOSPLUS: 2504 count = current_state->count; /* Already matched */ 2505 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } 2506 if (isinclass) 2507 { 2508 if (count > 0 && *ecode == OP_CRPOSPLUS) 2509 { 2510 active_count--; /* Remove non-match possibility */ 2511 next_active_state--; 2512 } 2513 count++; 2514 ADD_NEW(state_offset, count); 2515 } 2516 break; 2517 2518 case OP_CRQUERY: 2519 case OP_CRMINQUERY: 2520 case OP_CRPOSQUERY: 2521 ADD_ACTIVE(next_state_offset + 1, 0); 2522 if (isinclass) 2523 { 2524 if (*ecode == OP_CRPOSQUERY) 2525 { 2526 active_count--; /* Remove non-match possibility */ 2527 next_active_state--; 2528 } 2529 ADD_NEW(next_state_offset + 1, 0); 2530 } 2531 break; 2532 2533 case OP_CRRANGE: 2534 case OP_CRMINRANGE: 2535 case OP_CRPOSRANGE: 2536 count = current_state->count; /* Already matched */ 2537 if (count >= (int)GET2(ecode, 1)) 2538 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2539 if (isinclass) 2540 { 2541 int max = (int)GET2(ecode, 1 + IMM2_SIZE); 2542 if (*ecode == OP_CRPOSRANGE) 2543 { 2544 active_count--; /* Remove non-match possibility */ 2545 next_active_state--; 2546 } 2547 if (++count >= max && max != 0) /* Max 0 => no limit */ 2548 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2549 else 2550 { ADD_NEW(state_offset, count); } 2551 } 2552 break; 2553 2554 default: 2555 if (isinclass) { ADD_NEW(next_state_offset, 0); } 2556 break; 2557 } 2558 } 2559 break; 2560 2561 /* 
========================================================================== */ 2562 /* These are the opcodes for fancy brackets of various kinds. We have 2563 to use recursion in order to handle them. The "always failing" assertion 2564 (?!) is optimised to OP_FAIL when compiling, so we have to support that, 2565 though the other "backtracking verbs" are not supported. */ 2566 2567 case OP_FAIL: 2568 forced_fail++; /* Count FAILs for multiple states */ 2569 break; 2570 2571 case OP_ASSERT: 2572 case OP_ASSERT_NOT: 2573 case OP_ASSERTBACK: 2574 case OP_ASSERTBACK_NOT: 2575 { 2576 PCRE2_SPTR endasscode = code + GET(code, 1); 2577 PCRE2_SIZE local_offsets[2]; 2578 int rc; 2579 int local_workspace[1000]; 2580 2581 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2582 2583 rc = internal_dfa_match( 2584 mb, /* static match data */ 2585 code, /* this subexpression's code */ 2586 ptr, /* where we currently are */ 2587 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2588 local_offsets, /* offset vector */ 2589 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ 2590 local_workspace, /* workspace vector */ 2591 sizeof(local_workspace)/sizeof(int), /* size of same */ 2592 rlevel); /* function recursion level */ 2593 2594 if (rc == PCRE2_ERROR_DFA_UITEM) return rc; 2595 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) 2596 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2597 } 2598 break; 2599 2600 /*-----------------------------------------------------------------*/ 2601 case OP_COND: 2602 case OP_SCOND: 2603 { 2604 PCRE2_SIZE local_offsets[1000]; 2605 int local_workspace[1000]; 2606 int codelink = (int)GET(code, 1); 2607 PCRE2_UCHAR condcode; 2608 2609 /* Because of the way auto-callout works during compile, a callout item 2610 is inserted between OP_COND and an assertion condition. This does not 2611 happen for the other conditions. 
*/ 2612 2613 if (code[LINK_SIZE + 1] == OP_CALLOUT 2614 || code[LINK_SIZE + 1] == OP_CALLOUT_STR) 2615 { 2616 PCRE2_SIZE callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)? 2617 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] : 2618 (PCRE2_SIZE)GET(code, 2 + 3*LINK_SIZE); 2619 2620 rrc = 0; 2621 if (mb->callout != NULL) 2622 { 2623 pcre2_callout_block cb; 2624 cb.version = 1; 2625 cb.capture_top = 1; 2626 cb.capture_last = 0; 2627 cb.offset_vector = offsets; 2628 cb.mark = NULL; /* No (*MARK) support */ 2629 cb.subject = start_subject; 2630 cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject); 2631 cb.start_match = (PCRE2_SIZE)(current_subject - start_subject); 2632 cb.current_position = (PCRE2_SIZE)(ptr - start_subject); 2633 cb.pattern_position = GET(code, LINK_SIZE + 2); 2634 cb.next_item_length = GET(code, LINK_SIZE + 2 + LINK_SIZE); 2635 2636 if (code[LINK_SIZE + 1] == OP_CALLOUT) 2637 { 2638 cb.callout_number = code[2 + 3*LINK_SIZE]; 2639 cb.callout_string_offset = 0; 2640 cb.callout_string = NULL; 2641 cb.callout_string_length = 0; 2642 } 2643 else 2644 { 2645 cb.callout_number = 0; 2646 cb.callout_string_offset = GET(code, 2 + 4*LINK_SIZE); 2647 cb.callout_string = code + (2 + 5*LINK_SIZE) + 1; 2648 cb.callout_string_length = 2649 callout_length - (1 + 4*LINK_SIZE) - 2; 2650 } 2651 2652 if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) 2653 return rrc; /* Abandon */ 2654 } 2655 if (rrc > 0) break; /* Fail this thread */ 2656 code += callout_length; /* Skip callout data */ 2657 } 2658 2659 condcode = code[LINK_SIZE+1]; 2660 2661 /* Back reference conditions and duplicate named recursion conditions 2662 are not supported */ 2663 2664 if (condcode == OP_CREF || condcode == OP_DNCREF || 2665 condcode == OP_DNRREF) 2666 return PCRE2_ERROR_DFA_UCOND; 2667 2668 /* The DEFINE condition is always false, and the assertion (?!) is 2669 converted to OP_FAIL. 
*/ 2670 2671 if (condcode == OP_FALSE || condcode == OP_FAIL) 2672 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2673 2674 /* There is also an always-true condition */ 2675 2676 else if (condcode == OP_TRUE) 2677 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } 2678 2679 /* The only supported version of OP_RREF is for the value RREF_ANY, 2680 which means "test if in any recursion". We can't test for specifically 2681 recursed groups. */ 2682 2683 else if (condcode == OP_RREF) 2684 { 2685 unsigned int value = GET2(code, LINK_SIZE + 2); 2686 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND; 2687 if (mb->recursive != NULL) 2688 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } 2689 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2690 } 2691 2692 /* Otherwise, the condition is an assertion */ 2693 2694 else 2695 { 2696 int rc; 2697 PCRE2_SPTR asscode = code + LINK_SIZE + 1; 2698 PCRE2_SPTR endasscode = asscode + GET(asscode, 1); 2699 2700 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2701 2702 rc = internal_dfa_match( 2703 mb, /* fixed match data */ 2704 asscode, /* this subexpression's code */ 2705 ptr, /* where we currently are */ 2706 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2707 local_offsets, /* offset vector */ 2708 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ 2709 local_workspace, /* workspace vector */ 2710 sizeof(local_workspace)/sizeof(int), /* size of same */ 2711 rlevel); /* function recursion level */ 2712 2713 if (rc == PCRE2_ERROR_DFA_UITEM) return rc; 2714 if ((rc >= 0) == 2715 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) 2716 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2717 else 2718 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2719 } 2720 } 2721 break; 2722 2723 /*-----------------------------------------------------------------*/ 2724 case OP_RECURSE: 2725 { 2726 dfa_recursion_info *ri; 2727 PCRE2_SIZE 
local_offsets[1000]; 2728 int local_workspace[1000]; 2729 PCRE2_SPTR callpat = start_code + GET(code, 1); 2730 uint32_t recno = (callpat == mb->start_code)? 0 : 2731 GET2(callpat, 1 + LINK_SIZE); 2732 int rc; 2733 2734 /* Check for repeating a recursion without advancing the subject 2735 pointer. This should catch convoluted mutual recursions. (Some simple 2736 cases are caught at compile time.) */ 2737 2738 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec) 2739 if (recno == ri->group_num && ptr == ri->subject_position) 2740 return PCRE2_ERROR_RECURSELOOP; 2741 2742 /* Remember this recursion and where we started it so as to 2743 catch infinite loops. */ 2744 2745 new_recursive.group_num = recno; 2746 new_recursive.subject_position = ptr; 2747 new_recursive.prevrec = mb->recursive; 2748 mb->recursive = &new_recursive; 2749 2750 rc = internal_dfa_match( 2751 mb, /* fixed match data */ 2752 callpat, /* this subexpression's code */ 2753 ptr, /* where we currently are */ 2754 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2755 local_offsets, /* offset vector */ 2756 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ 2757 local_workspace, /* workspace vector */ 2758 sizeof(local_workspace)/sizeof(int), /* size of same */ 2759 rlevel); /* function recursion level */ 2760 2761 mb->recursive = new_recursive.prevrec; /* Done this recursion */ 2762 2763 /* Ran out of internal offsets */ 2764 2765 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE; 2766 2767 /* For each successful matched substring, set up the next state with a 2768 count of characters to skip before trying it. Note that the count is in 2769 characters, not bytes. 
*/ 2770 2771 if (rc > 0) 2772 { 2773 for (rc = rc*2 - 2; rc >= 0; rc -= 2) 2774 { 2775 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc]; 2776 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 2777 if (utf) 2778 { 2779 PCRE2_SPTR p = start_subject + local_offsets[rc]; 2780 PCRE2_SPTR pp = start_subject + local_offsets[rc+1]; 2781 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; 2782 } 2783 #endif 2784 if (charcount > 0) 2785 { 2786 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, 2787 (int)(charcount - 1)); 2788 } 2789 else 2790 { 2791 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); 2792 } 2793 } 2794 } 2795 else if (rc != PCRE2_ERROR_NOMATCH) return rc; 2796 } 2797 break; 2798 2799 /*-----------------------------------------------------------------*/ 2800 case OP_BRAPOS: 2801 case OP_SBRAPOS: 2802 case OP_CBRAPOS: 2803 case OP_SCBRAPOS: 2804 case OP_BRAPOSZERO: 2805 { 2806 PCRE2_SIZE charcount, matched_count; 2807 PCRE2_SPTR local_ptr = ptr; 2808 BOOL allow_zero; 2809 2810 if (codevalue == OP_BRAPOSZERO) 2811 { 2812 allow_zero = TRUE; 2813 codevalue = *(++code); /* Codevalue will be one of above BRAs */ 2814 } 2815 else allow_zero = FALSE; 2816 2817 /* Loop to match the subpattern as many times as possible as if it were 2818 a complete pattern. 
*/ 2819 2820 for (matched_count = 0;; matched_count++) 2821 { 2822 PCRE2_SIZE local_offsets[2]; 2823 int local_workspace[1000]; 2824 2825 int rc = internal_dfa_match( 2826 mb, /* fixed match data */ 2827 code, /* this subexpression's code */ 2828 local_ptr, /* where we currently are */ 2829 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2830 local_offsets, /* offset vector */ 2831 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ 2832 local_workspace, /* workspace vector */ 2833 sizeof(local_workspace)/sizeof(int), /* size of same */ 2834 rlevel); /* function recursion level */ 2835 2836 /* Failed to match */ 2837 2838 if (rc < 0) 2839 { 2840 if (rc != PCRE2_ERROR_NOMATCH) return rc; 2841 break; 2842 } 2843 2844 /* Matched: break the loop if zero characters matched. */ 2845 2846 charcount = local_offsets[1] - local_offsets[0]; 2847 if (charcount == 0) break; 2848 local_ptr += charcount; /* Advance temporary position ptr */ 2849 } 2850 2851 /* At this point we have matched the subpattern matched_count 2852 times, and local_ptr is pointing to the character after the end of the 2853 last match. */ 2854 2855 if (matched_count > 0 || allow_zero) 2856 { 2857 PCRE2_SPTR end_subpattern = code; 2858 int next_state_offset; 2859 2860 do { end_subpattern += GET(end_subpattern, 1); } 2861 while (*end_subpattern == OP_ALT); 2862 next_state_offset = 2863 (int)(end_subpattern - start_code + LINK_SIZE + 1); 2864 2865 /* Optimization: if there are no more active states, and there 2866 are no new states yet set up, then skip over the subject string 2867 right here, to save looping. Otherwise, set up the new state to swing 2868 into action when the end of the matched substring is reached. 
*/ 2869 2870 if (i + 1 >= active_count && new_count == 0) 2871 { 2872 ptr = local_ptr; 2873 clen = 0; 2874 ADD_NEW(next_state_offset, 0); 2875 } 2876 else 2877 { 2878 PCRE2_SPTR p = ptr; 2879 PCRE2_SPTR pp = local_ptr; 2880 charcount = (PCRE2_SIZE)(pp - p); 2881 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 2882 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; 2883 #endif 2884 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); 2885 } 2886 } 2887 } 2888 break; 2889 2890 /*-----------------------------------------------------------------*/ 2891 case OP_ONCE: 2892 case OP_ONCE_NC: 2893 { 2894 PCRE2_SIZE local_offsets[2]; 2895 int local_workspace[1000]; 2896 2897 int rc = internal_dfa_match( 2898 mb, /* fixed match data */ 2899 code, /* this subexpression's code */ 2900 ptr, /* where we currently are */ 2901 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2902 local_offsets, /* offset vector */ 2903 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ 2904 local_workspace, /* workspace vector */ 2905 sizeof(local_workspace)/sizeof(int), /* size of same */ 2906 rlevel); /* function recursion level */ 2907 2908 if (rc >= 0) 2909 { 2910 PCRE2_SPTR end_subpattern = code; 2911 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0]; 2912 int next_state_offset, repeat_state_offset; 2913 2914 do { end_subpattern += GET(end_subpattern, 1); } 2915 while (*end_subpattern == OP_ALT); 2916 next_state_offset = 2917 (int)(end_subpattern - start_code + LINK_SIZE + 1); 2918 2919 /* If the end of this subpattern is KETRMAX or KETRMIN, we must 2920 arrange for the repeat state also to be added to the relevant list. 2921 Calculate the offset, or set -1 for no repeat. */ 2922 2923 repeat_state_offset = (*end_subpattern == OP_KETRMAX || 2924 *end_subpattern == OP_KETRMIN)? 
2925 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; 2926 2927 /* If we have matched an empty string, add the next state at the 2928 current character pointer. This is important so that the duplicate 2929 checking kicks in, which is what breaks infinite loops that match an 2930 empty string. */ 2931 2932 if (charcount == 0) 2933 { 2934 ADD_ACTIVE(next_state_offset, 0); 2935 } 2936 2937 /* Optimization: if there are no more active states, and there 2938 are no new states yet set up, then skip over the subject string 2939 right here, to save looping. Otherwise, set up the new state to swing 2940 into action when the end of the matched substring is reached. */ 2941 2942 else if (i + 1 >= active_count && new_count == 0) 2943 { 2944 ptr += charcount; 2945 clen = 0; 2946 ADD_NEW(next_state_offset, 0); 2947 2948 /* If we are adding a repeat state at the new character position, 2949 we must fudge things so that it is the only current state. 2950 Otherwise, it might be a duplicate of one we processed before, and 2951 that would cause it to be skipped. 
*/ 2952 2953 if (repeat_state_offset >= 0) 2954 { 2955 next_active_state = active_states; 2956 active_count = 0; 2957 i = -1; 2958 ADD_ACTIVE(repeat_state_offset, 0); 2959 } 2960 } 2961 else 2962 { 2963 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 2964 if (utf) 2965 { 2966 PCRE2_SPTR p = start_subject + local_offsets[0]; 2967 PCRE2_SPTR pp = start_subject + local_offsets[1]; 2968 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; 2969 } 2970 #endif 2971 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); 2972 if (repeat_state_offset >= 0) 2973 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); } 2974 } 2975 } 2976 else if (rc != PCRE2_ERROR_NOMATCH) return rc; 2977 } 2978 break; 2979 2980 2981 /* ========================================================================== */ 2982 /* Handle callouts */ 2983 2984 case OP_CALLOUT: 2985 case OP_CALLOUT_STR: 2986 { 2987 unsigned int callout_length = (*code == OP_CALLOUT) 2988 ? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 1 + 2*LINK_SIZE); 2989 rrc = 0; 2990 2991 if (mb->callout != NULL) 2992 { 2993 pcre2_callout_block cb; 2994 cb.version = 1; 2995 cb.capture_top = 1; 2996 cb.capture_last = 0; 2997 cb.offset_vector = offsets; 2998 cb.mark = NULL; /* No (*MARK) support */ 2999 cb.subject = start_subject; 3000 cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject); 3001 cb.start_match = (PCRE2_SIZE)(current_subject - start_subject); 3002 cb.current_position = (PCRE2_SIZE)(ptr - start_subject); 3003 cb.pattern_position = GET(code, 1); 3004 cb.next_item_length = GET(code, 1 + LINK_SIZE); 3005 3006 if (*code == OP_CALLOUT) 3007 { 3008 cb.callout_number = code[1 + 2*LINK_SIZE]; 3009 cb.callout_string_offset = 0; 3010 cb.callout_string = NULL; 3011 cb.callout_string_length = 0; 3012 } 3013 else 3014 { 3015 cb.callout_number = 0; 3016 cb.callout_string_offset = GET(code, 1 + 3*LINK_SIZE); 3017 cb.callout_string = code + (1 + 4*LINK_SIZE) + 1; 3018 cb.callout_string_length = 3019 callout_length 
- (1 + 4*LINK_SIZE) - 2; 3020 } 3021 3022 if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) 3023 return rrc; /* Abandon */ 3024 } 3025 if (rrc == 0) 3026 { ADD_ACTIVE(state_offset + (int)callout_length, 0); } 3027 } 3028 break; 3029 3030 3031 /* ========================================================================== */ 3032 default: /* Unsupported opcode */ 3033 return PCRE2_ERROR_DFA_UITEM; 3034 } 3035 3036 NEXT_ACTIVE_STATE: continue; 3037 3038 } /* End of loop scanning active states */ 3039 3040 /* We have finished the processing at the current subject character. If no 3041 new states have been set for the next character, we have found all the 3042 matches that we are going to find. If we are at the top level and partial 3043 matching has been requested, check for appropriate conditions. 3044 3045 The "forced_ fail" variable counts the number of (*F) encountered for the 3046 character. If it is equal to the original active_count (saved in 3047 workspace[1]) it means that (*F) was found on every active state. In this 3048 case we don't want to give a partial match. 3049 3050 The "could_continue" variable is true if a state could have continued but 3051 for the fact that the end of the subject was reached. */ 3052 3053 if (new_count <= 0) 3054 { 3055 if (rlevel == 1 && /* Top level, and */ 3056 could_continue && /* Some could go on, and */ 3057 forced_fail != workspace[1] && /* Not all forced fail & */ 3058 ( /* either... */ 3059 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */ 3060 || /* or... */ 3061 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */ 3062 match_count < 0) /* no matches */ 3063 ) && /* And... */ 3064 ( 3065 partial_newline || /* Either partial NL */ 3066 ( /* or ... 
*/ 3067 ptr >= end_subject && /* End of subject and */ 3068 ptr > mb->start_used_ptr) /* Inspected non-empty string */ 3069 ) 3070 ) 3071 match_count = PCRE2_ERROR_PARTIAL; 3072 break; /* In effect, "return", but see the comment below */ 3073 } 3074 3075 /* One or more states are active for the next character. */ 3076 3077 ptr += clen; /* Advance to next subject character */ 3078 } /* Loop to move along the subject string */ 3079 3080 /* Control gets here from "break" a few lines above. We do it this way because 3081 if we use "return" above, we have compiler trouble. Some compilers warn if 3082 there's nothing here because they think the function doesn't return a value. On 3083 the other hand, if we put a dummy statement here, some more clever compilers 3084 complain that it can't be reached. Sigh. */ 3085 3086 return match_count; 3087 } 3088 3089 3090 3091 /************************************************* 3092 * Match a pattern using the DFA algorithm * 3093 *************************************************/ 3094 3095 /* This function matches a compiled pattern to a subject string, using the 3096 alternate matching algorithm that finds all matches at once. 
3097 3098 Arguments: 3099 code points to the compiled pattern 3100 subject subject string 3101 length length of subject string 3102 start_offset where to start matching in the subject 3103 options option bits 3104 match_data points to a match data structure 3105 mcontext points to a match context 3106 workspace pointer to workspace 3107 wscount size of workspace 3108 3109 Returns: > 0 => number of match offset pairs placed in offsets 3110 = 0 => offsets overflowed; longest matches are present 3111 -1 => failed to match 3112 < -1 => some kind of unexpected problem 3113 */ 3114 3115 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 3116 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, 3117 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, 3118 pcre2_match_context *mcontext, int *workspace, size_t wscount) 3119 { 3120 const pcre2_real_code *re = (const pcre2_real_code *)code; 3121 3122 PCRE2_SPTR start_match; 3123 PCRE2_SPTR end_subject; 3124 PCRE2_SPTR bumpalong_limit; 3125 PCRE2_SPTR req_cu_ptr; 3126 3127 BOOL utf, anchored, startline, firstline; 3128 3129 BOOL has_first_cu = FALSE; 3130 BOOL has_req_cu = FALSE; 3131 PCRE2_UCHAR first_cu = 0; 3132 PCRE2_UCHAR first_cu2 = 0; 3133 PCRE2_UCHAR req_cu = 0; 3134 PCRE2_UCHAR req_cu2 = 0; 3135 3136 const uint8_t *start_bits = NULL; 3137 3138 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro 3139 is used below, and it expects NLBLOCK to be defined as a pointer. */ 3140 3141 dfa_match_block actual_match_block; 3142 dfa_match_block *mb = &actual_match_block; 3143 3144 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated 3145 subject string. 
*/ 3146 3147 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); 3148 3149 /* Plausibility checks */ 3150 3151 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; 3152 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL) 3153 return PCRE2_ERROR_NULL; 3154 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE; 3155 if (start_offset > length) return PCRE2_ERROR_BADOFFSET; 3156 3157 /* Check that the first field in the block is the magic number. If it is not, 3158 return with PCRE2_ERROR_BADMAGIC. */ 3159 3160 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; 3161 3162 /* Check the code unit width. */ 3163 3164 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) 3165 return PCRE2_ERROR_BADMODE; 3166 3167 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the 3168 options variable for this function. Users of PCRE2 who are not calling the 3169 function directly would like to have a way of setting these flags, in the same 3170 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with 3171 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and 3172 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be 3173 transferred to the options for this function. The bits are guaranteed to be 3174 adjacent, but do not have the same values. This bit of Boolean trickery assumes 3175 that the match-time bits are not more significant than the flag bits. If by 3176 accident this is not the case, a compile-time division by zero error will 3177 occur. */ 3178 3179 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) 3180 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) 3181 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); 3182 #undef FF 3183 #undef OO 3184 3185 /* If restarting after a partial match, do some sanity checks on the contents 3186 of the workspace. 
*/ 3187 3188 if ((options & PCRE2_DFA_RESTART) != 0) 3189 { 3190 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 || 3191 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK)) 3192 return PCRE2_ERROR_DFA_BADRESTART; 3193 } 3194 3195 /* Set some local values */ 3196 3197 utf = (re->overall_options & PCRE2_UTF) != 0; 3198 start_match = subject + start_offset; 3199 end_subject = subject + length; 3200 req_cu_ptr = start_match - 1; 3201 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 || 3202 (re->overall_options & PCRE2_ANCHORED) != 0; 3203 3204 /* The "must be at the start of a line" flags are used in a loop when finding 3205 where to start. */ 3206 3207 startline = (re->flags & PCRE2_STARTLINE) != 0; 3208 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; 3209 bumpalong_limit = end_subject; 3210 3211 /* Get data from the match context, if present, and fill in the fields in the 3212 match block. It is an error to set an offset limit without setting the flag at 3213 compile time. */ 3214 3215 if (mcontext == NULL) 3216 { 3217 mb->callout = NULL; 3218 mb->memctl = re->memctl; 3219 } 3220 else 3221 { 3222 if (mcontext->offset_limit != PCRE2_UNSET) 3223 { 3224 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) 3225 return PCRE2_ERROR_BADOFFSETLIMIT; 3226 bumpalong_limit = subject + mcontext->offset_limit; 3227 } 3228 mb->callout = mcontext->callout; 3229 mb->callout_data = mcontext->callout_data; 3230 mb->memctl = mcontext->memctl; 3231 } 3232 3233 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + 3234 re->name_count * re->name_entry_size; 3235 mb->tables = re->tables; 3236 mb->start_subject = subject; 3237 mb->end_subject = end_subject; 3238 mb->start_offset = start_offset; 3239 mb->moptions = options; 3240 mb->poptions = re->overall_options; 3241 3242 /* Process the \R and newline settings. 
*/ 3243 3244 mb->bsr_convention = re->bsr_convention; 3245 mb->nltype = NLTYPE_FIXED; 3246 switch(re->newline_convention) 3247 { 3248 case PCRE2_NEWLINE_CR: 3249 mb->nllen = 1; 3250 mb->nl[0] = CHAR_CR; 3251 break; 3252 3253 case PCRE2_NEWLINE_LF: 3254 mb->nllen = 1; 3255 mb->nl[0] = CHAR_NL; 3256 break; 3257 3258 case PCRE2_NEWLINE_CRLF: 3259 mb->nllen = 2; 3260 mb->nl[0] = CHAR_CR; 3261 mb->nl[1] = CHAR_NL; 3262 break; 3263 3264 case PCRE2_NEWLINE_ANY: 3265 mb->nltype = NLTYPE_ANY; 3266 break; 3267 3268 case PCRE2_NEWLINE_ANYCRLF: 3269 mb->nltype = NLTYPE_ANYCRLF; 3270 break; 3271 3272 default: return PCRE2_ERROR_INTERNAL; 3273 } 3274 3275 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, 3276 we must also check that a starting offset does not point into the middle of a 3277 multiunit character. We check only the portion of the subject that is going to 3278 be inspected during matching - from the offset minus the maximum lookbehind 3279 to the given length. This saves time when a small part of a large subject is 3280 being matched by the use of a starting offset. Note that the maximum lookbehind 3281 is a number of characters, not code units. */ 3282 3283 #ifdef SUPPORT_UNICODE 3284 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) 3285 { 3286 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ 3287 3288 if (start_offset > 0) 3289 { 3290 #if PCRE2_CODE_UNIT_WIDTH != 32 3291 unsigned int i; 3292 if (start_match < end_subject && NOT_FIRSTCU(*start_match)) 3293 return PCRE2_ERROR_BADUTFOFFSET; 3294 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) 3295 { 3296 check_subject--; 3297 while (check_subject > subject && 3298 #if PCRE2_CODE_UNIT_WIDTH == 8 3299 (*check_subject & 0xc0) == 0x80) 3300 #else /* 16-bit */ 3301 (*check_subject & 0xfc00) == 0xdc00) 3302 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ 3303 check_subject--; 3304 } 3305 #else /* In the 32-bit library, one code unit equals one character.
*/ 3306 check_subject -= re->max_lookbehind; 3307 if (check_subject < subject) check_subject = subject; 3308 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ 3309 } 3310 3311 /* Validate the relevant portion of the subject. After an error, adjust the 3312 offset to be an absolute offset in the whole string. */ 3313 3314 match_data->rc = PRIV(valid_utf)(check_subject, 3315 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar)); 3316 if (match_data->rc != 0) 3317 { 3318 match_data->startchar += (PCRE2_SIZE)(check_subject - subject); 3319 return match_data->rc; 3320 } 3321 } 3322 #endif /* SUPPORT_UNICODE */ 3323 3324 /* Set up the first code unit to match, if available. The first_codeunit value 3325 is never set for an anchored regular expression, but the anchoring may be 3326 forced at run time, so we have to test for anchoring. The first code unit may 3327 be unset for an unanchored pattern, of course. If there's no first code unit 3328 there may be a bitmap of possible first characters. */ 3329 3330 if (!anchored) 3331 { 3332 if ((re->flags & PCRE2_FIRSTSET) != 0) 3333 { 3334 has_first_cu = TRUE; 3335 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); 3336 if ((re->flags & PCRE2_FIRSTCASELESS) != 0) 3337 { 3338 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); 3339 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 3340 if (utf && first_cu > 127) 3341 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); 3342 #endif 3343 } 3344 } 3345 else 3346 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) 3347 start_bits = re->start_bitmap; 3348 } 3349 3350 /* For anchored or unanchored matches, there may be a "last known required 3351 character" set. 
*/ 3352 3353 if ((re->flags & PCRE2_LASTSET) != 0) 3354 { 3355 has_req_cu = TRUE; 3356 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); 3357 if ((re->flags & PCRE2_LASTCASELESS) != 0) 3358 { 3359 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); 3360 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 3361 if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); 3362 #endif 3363 } 3364 } 3365 3366 /* Fill in fields that are always returned in the match data. */ 3367 3368 match_data->code = re; 3369 match_data->subject = subject; 3370 match_data->mark = NULL; 3371 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER; 3372 3373 /* Call the main matching function, looping for a non-anchored regex after a 3374 failed match. If not restarting, perform certain optimizations at the start of 3375 a match. */ 3376 3377 for (;;) 3378 { 3379 int rc; 3380 3381 /* ----------------- Start of match optimizations ---------------- */ 3382 3383 /* There are some optimizations that avoid running the match if a known 3384 starting point is not found, or if a known later code unit is not present. 3385 However, there is an option (settable at compile time) that disables 3386 these, for testing and for ensuring that all callouts do actually occur. 3387 The optimizations must also be avoided when restarting a DFA match. */ 3388 3389 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && 3390 (options & PCRE2_DFA_RESTART) == 0) 3391 { 3392 PCRE2_SPTR save_end_subject = end_subject; 3393 3394 /* If firstline is TRUE, the start of the match is constrained to the first 3395 line of a multiline string. That is, the match must be before or at the 3396 first newline. Implement this by temporarily adjusting end_subject so that 3397 we stop the optimization scans at a newline. If the match fails at the 3398 newline, later code breaks this loop. 
*/ 3399 3400 if (firstline) 3401 { 3402 PCRE2_SPTR t = start_match; 3403 #ifdef SUPPORT_UNICODE 3404 if (utf) 3405 { 3406 while (t < mb->end_subject && !IS_NEWLINE(t)) 3407 { 3408 t++; 3409 ACROSSCHAR(t < end_subject, *t, t++); 3410 } 3411 } 3412 else 3413 #endif 3414 while (t < mb->end_subject && !IS_NEWLINE(t)) t++; 3415 end_subject = t; 3416 } 3417 3418 /* Advance to a unique first code unit if there is one. */ 3419 3420 if (has_first_cu) 3421 { 3422 PCRE2_UCHAR smc; 3423 if (first_cu != first_cu2) 3424 while (start_match < end_subject && 3425 (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2) 3426 start_match++; 3427 else 3428 while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu) 3429 start_match++; 3430 } 3431 3432 /* Or to just after a linebreak for a multiline match */ 3433 3434 else if (startline) 3435 { 3436 if (start_match > mb->start_subject + start_offset) 3437 { 3438 #ifdef SUPPORT_UNICODE 3439 if (utf) 3440 { 3441 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 3442 { 3443 start_match++; 3444 ACROSSCHAR(start_match < end_subject, *start_match, 3445 start_match++); 3446 } 3447 } 3448 else 3449 #endif 3450 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 3451 start_match++; 3452 3453 /* If we have just passed a CR and the newline option is ANY or 3454 ANYCRLF, and we are now at a LF, advance the match position by one more 3455 code unit. */ 3456 3457 if (start_match[-1] == CHAR_CR && 3458 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && 3459 start_match < end_subject && 3460 UCHAR21TEST(start_match) == CHAR_NL) 3461 start_match++; 3462 } 3463 } 3464 3465 /* Or to a non-unique first code unit if any have been identified. The 3466 bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all 3467 code units greater than 254 set the 255 bit. 
*/ 3468 3469 else if (start_bits != NULL) 3470 { 3471 while (start_match < end_subject) 3472 { 3473 register uint32_t c = UCHAR21TEST(start_match); 3474 #if PCRE2_CODE_UNIT_WIDTH != 8 3475 if (c > 255) c = 255; 3476 #endif 3477 if ((start_bits[c/8] & (1 << (c&7))) != 0) break; 3478 start_match++; 3479 } 3480 } 3481 3482 /* Restore fudged end_subject */ 3483 3484 end_subject = save_end_subject; 3485 3486 /* The following two optimizations are disabled for partial matching. */ 3487 3488 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0) 3489 { 3490 /* The minimum matching length is a lower bound; no actual string of that 3491 length may actually match the pattern. Although the value is, strictly, 3492 in characters, we treat it as code units to avoid spending too much time 3493 in this optimization. */ 3494 3495 if (end_subject - start_match < re->minlength) return PCRE2_ERROR_NOMATCH; 3496 3497 /* If req_cu is set, we know that that code unit must appear in the 3498 subject for the match to succeed. If the first code unit is set, req_cu 3499 must be later in the subject; otherwise the test starts at the match 3500 point. This optimization can save a huge amount of backtracking in 3501 patterns with nested unlimited repeats that aren't going to match. 3502 Writing separate code for cased/caseless versions makes it go faster, as 3503 does using an autoincrement and backing off on a match. 3504 3505 HOWEVER: when the subject string is very, very long, searching to its end 3506 can take a long time, and give bad performance on quite ordinary 3507 patterns. This showed up when somebody was matching something like 3508 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is 3509 sufficiently long. */ 3510 3511 if (has_req_cu && end_subject - start_match < REQ_CU_MAX) 3512 { 3513 register PCRE2_SPTR p = start_match + (has_first_cu? 
1:0); 3514 3515 /* We don't need to repeat the search if we haven't yet reached the 3516 place we found it at last time. */ 3517 3518 if (p > req_cu_ptr) 3519 { 3520 if (req_cu != req_cu2) 3521 { 3522 while (p < end_subject) 3523 { 3524 register uint32_t pp = UCHAR21INCTEST(p); 3525 if (pp == req_cu || pp == req_cu2) { p--; break; } 3526 } 3527 } 3528 else 3529 { 3530 while (p < end_subject) 3531 { 3532 if (UCHAR21INCTEST(p) == req_cu) { p--; break; } 3533 } 3534 } 3535 3536 /* If we can't find the required code unit, break the matching loop, 3537 forcing a match failure. */ 3538 3539 if (p >= end_subject) break; 3540 3541 /* If we have found the required code unit, save the point where we 3542 found it, so that we don't search again next time round the loop if 3543 the start hasn't passed this code unit yet. */ 3544 3545 req_cu_ptr = p; 3546 } 3547 } 3548 } 3549 } 3550 3551 /* ------------ End of start of match optimizations ------------ */ 3552 3553 /* Give no match if we have passed the bumpalong limit. */ 3554 3555 if (start_match > bumpalong_limit) break; 3556 3557 /* OK, now we can do the business */ 3558 3559 mb->start_used_ptr = start_match; 3560 mb->last_used_ptr = start_match; 3561 mb->recursive = NULL; 3562 3563 rc = internal_dfa_match( 3564 mb, /* fixed match data */ 3565 mb->start_code, /* this subexpression's code */ 3566 start_match, /* where we currently are */ 3567 start_offset, /* start offset in subject */ 3568 match_data->ovector, /* offset vector */ 3569 (uint32_t)match_data->oveccount * 2, /* actual size of same */ 3570 workspace, /* workspace vector */ 3571 (int)wscount, /* size of same */ 3572 0); /* function recurse level */ 3573 3574 /* Anything other than "no match" means we are done, always; otherwise, carry 3575 on only if not anchored. 
*/ 3576 3577 if (rc != PCRE2_ERROR_NOMATCH || anchored) 3578 { 3579 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0) 3580 { 3581 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject); 3582 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject); 3583 } 3584 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject); 3585 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject); 3586 match_data->startchar = (PCRE2_SIZE)(start_match - subject); 3587 match_data->rc = rc; 3588 return rc; 3589 } 3590 3591 /* Advance to the next subject character unless we are at the end of a line 3592 and firstline is set. */ 3593 3594 if (firstline && IS_NEWLINE(start_match)) break; 3595 start_match++; 3596 #ifdef SUPPORT_UNICODE 3597 if (utf) 3598 { 3599 ACROSSCHAR(start_match < end_subject, *start_match, 3600 start_match++); 3601 } 3602 #endif 3603 if (start_match > end_subject) break; 3604 3605 /* If we have just passed a CR and we are now at a LF, and the pattern does 3606 not contain any explicit matches for \r or \n, and the newline option is CRLF 3607 or ANY or ANYCRLF, advance the match position by one more character. */ 3608 3609 if (UCHAR21TEST(start_match - 1) == CHAR_CR && 3610 start_match < end_subject && 3611 UCHAR21TEST(start_match) == CHAR_NL && 3612 (re->flags & PCRE2_HASCRORLF) == 0 && 3613 (mb->nltype == NLTYPE_ANY || 3614 mb->nltype == NLTYPE_ANYCRLF || 3615 mb->nllen == 2)) 3616 start_match++; 3617 3618 } /* "Bumpalong" loop */ 3619 3620 3621 return PCRE2_ERROR_NOMATCH; 3622 } 3623 3624 /* End of pcre2_dfa_match.c */ 3625