1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016-2018 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 42 /* This module contains the external function pcre2_dfa_match(), which is an 43 alternative matching function that uses a sort of DFA algorithm (not a true 44 FSM). This is NOT Perl-compatible, but it has advantages in certain 45 applications. */ 46 47 48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved 49 the performance of his patterns greatly. I could not use it as it stood, as it 50 was not thread safe, and made assumptions about pattern sizes. Also, it caused 51 test 7 to loop, and test 9 to crash with a segfault. 52 53 The issue is the check for duplicate states, which is done by a simple linear 54 search up the state list. (Grep for "duplicate" below to find the code.) For 55 many patterns, there will never be many states active at one time, so a simple 56 linear search is fine. In patterns that have many active states, it might be a 57 bottleneck. The suggested code used an indexing scheme to remember which states 58 had previously been used for each character, and avoided the linear search when 59 it knew there was no chance of a duplicate. This was implemented when adding 60 states to the state lists. 61 62 I wrote some thread-safe, not-limited code to try something similar at the time 63 of checking for duplicates (instead of when adding states), using index vectors 64 on the stack. 
It did give a 13% improvement with one specially constructed 65 pattern for certain subject strings, but on other strings and on many of the 66 simpler patterns in the test suite it did worse. The major problem, I think, 67 was the extra time to initialize the index. This had to be done for each call 68 of internal_dfa_match(). (The supplied patch used a static vector, initialized 69 only once - I suspect this was the cause of the problems with the tests.) 70 71 Overall, I concluded that the gains in some cases did not outweigh the losses 72 in others, so I abandoned this code. */ 73 74 75 #ifdef HAVE_CONFIG_H 76 #include "config.h" 77 #endif 78 79 #define NLBLOCK mb /* Block containing newline information */ 80 #define PSSTART start_subject /* Field containing processed string start */ 81 #define PSEND end_subject /* Field containing processed string end */ 82 83 #include "pcre2_internal.h" 84 85 #define PUBLIC_DFA_MATCH_OPTIONS \ 86 (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ 87 PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ 88 PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART) 89 90 91 /************************************************* 92 * Code parameters and static tables * 93 *************************************************/ 94 95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes 96 into others, under special conditions. A gap of 20 between the blocks should be 97 enough. The resulting opcodes don't have to be less than 256 because they are 98 never stored, so we push them well clear of the normal opcodes. */ 99 100 #define OP_PROP_EXTRA 300 101 #define OP_EXTUNI_EXTRA 320 102 #define OP_ANYNL_EXTRA 340 103 #define OP_HSPACE_EXTRA 360 104 #define OP_VSPACE_EXTRA 380 105 106 107 /* This table identifies those opcodes that are followed immediately by a 108 character that is to be tested in some way. This makes it possible to 109 centralize the loading of these characters. 
In the case of Type * etc, the 110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a 111 small value. Non-zero values in the table are the offsets from the opcode where 112 the character is to be found. ***NOTE*** If the start of this table is 113 modified, the three tables that follow must also be modified. */ 114 115 static const uint8_t coptable[] = { 116 0, /* End */ 117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ 119 0, 0, 0, /* Any, AllAny, Anybyte */ 120 0, 0, /* \P, \p */ 121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 122 0, /* \X */ 123 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 124 1, /* Char */ 125 1, /* Chari */ 126 1, /* not */ 127 1, /* noti */ 128 /* Positive single-char repeats */ 129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ 131 1+IMM2_SIZE, /* exact */ 132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ 133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ 135 1+IMM2_SIZE, /* exact I */ 136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ 137 /* Negative single-char repeats - only for chars < 256 */ 138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ 140 1+IMM2_SIZE, /* NOT exact */ 141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ 142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ 144 1+IMM2_SIZE, /* NOT exact I */ 145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ 146 /* Positive type repeats */ 147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ 149 1+IMM2_SIZE, /* Type exact */ 150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ 151 /* Character class & ref repeats */ 152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? 
*/ 153 0, 0, /* CRRANGE, CRMINRANGE */ 154 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */ 155 0, /* CLASS */ 156 0, /* NCLASS */ 157 0, /* XCLASS - variable length */ 158 0, /* REF */ 159 0, /* REFI */ 160 0, /* DNREF */ 161 0, /* DNREFI */ 162 0, /* RECURSE */ 163 0, /* CALLOUT */ 164 0, /* CALLOUT_STR */ 165 0, /* Alt */ 166 0, /* Ket */ 167 0, /* KetRmax */ 168 0, /* KetRmin */ 169 0, /* KetRpos */ 170 0, /* Reverse */ 171 0, /* Assert */ 172 0, /* Assert not */ 173 0, /* Assert behind */ 174 0, /* Assert behind not */ 175 0, /* ONCE */ 176 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 177 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 178 0, 0, /* CREF, DNCREF */ 179 0, 0, /* RREF, DNRREF */ 180 0, 0, /* FALSE, TRUE */ 181 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 182 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 183 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 184 0, 0, /* COMMIT, COMMIT_ARG */ 185 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ 186 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ 187 }; 188 189 /* This table identifies those opcodes that inspect a character. It is used to 190 remember the fact that a character could have been inspected when the end of 191 the subject is reached. ***NOTE*** If the start of this table is modified, the 192 two tables that follow must also be modified. */ 193 194 static const uint8_t poptable[] = { 195 0, /* End */ 196 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ 197 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ 198 1, 1, 1, /* Any, AllAny, Anybyte */ 199 1, 1, /* \P, \p */ 200 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ 201 1, /* \X */ 202 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */ 203 1, /* Char */ 204 1, /* Chari */ 205 1, /* not */ 206 1, /* noti */ 207 /* Positive single-char repeats */ 208 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ 209 1, 1, 1, /* upto, minupto, exact */ 210 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ 211 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 212 1, 1, 1, /* upto I, minupto I, exact I */ 213 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */ 214 /* Negative single-char repeats - only for chars < 256 */ 215 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 216 1, 1, 1, /* NOT upto, minupto, exact */ 217 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ 218 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 219 1, 1, 1, /* NOT upto I, minupto I, exact I */ 220 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */ 221 /* Positive type repeats */ 222 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 223 1, 1, 1, /* Type upto, minupto, exact */ 224 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ 225 /* Character class & ref repeats */ 226 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 227 1, 1, /* CRRANGE, CRMINRANGE */ 228 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */ 229 1, /* CLASS */ 230 1, /* NCLASS */ 231 1, /* XCLASS - variable length */ 232 0, /* REF */ 233 0, /* REFI */ 234 0, /* DNREF */ 235 0, /* DNREFI */ 236 0, /* RECURSE */ 237 0, /* CALLOUT */ 238 0, /* CALLOUT_STR */ 239 0, /* Alt */ 240 0, /* Ket */ 241 0, /* KetRmax */ 242 0, /* KetRmin */ 243 0, /* KetRpos */ 244 0, /* Reverse */ 245 0, /* Assert */ 246 0, /* Assert not */ 247 0, /* Assert behind */ 248 0, /* Assert behind not */ 249 0, /* ONCE */ 250 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 251 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 252 0, 0, /* CREF, DNCREF */ 253 0, 0, /* RREF, DNRREF */ 254 0, 0, /* FALSE, TRUE */ 255 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 256 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 257 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 258 0, 0, /* COMMIT, COMMIT_ARG */ 259 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */ 260 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ 261 }; 262 263 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, 264 and \w */ 265 266 static const 
uint8_t toptable1[] = { 267 0, 0, 0, 0, 0, 0, 268 ctype_digit, ctype_digit, 269 ctype_space, ctype_space, 270 ctype_word, ctype_word, 271 0, 0 /* OP_ANY, OP_ALLANY */ 272 }; 273 274 static const uint8_t toptable2[] = { 275 0, 0, 0, 0, 0, 0, 276 ctype_digit, 0, 277 ctype_space, 0, 278 ctype_word, 0, 279 1, 1 /* OP_ANY, OP_ALLANY */ 280 }; 281 282 283 /* Structure for holding data about a particular state, which is in effect the 284 current data for an active path through the match tree. It must consist 285 entirely of ints because the working vector we are passed, and which we put 286 these structures in, is a vector of ints. */ 287 288 typedef struct stateblock { 289 int offset; /* Offset to opcode (-ve has meaning) */ 290 int count; /* Count for repeats */ 291 int data; /* Some use extra data */ 292 } stateblock; 293 294 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) 295 296 297 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed 298 local working space and output vectors that were created on the stack. This has 299 caused issues for some patterns, especially in small-stack environments such as 300 Windows. A new scheme is now in use which sets up a vector on the stack, but if 301 this is too small, heap memory is used, up to the heap_limit. The main 302 parameters are all numbers of ints because the workspace is a vector of ints. 303 304 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is 305 defined in pcre2_internal.h so as to be available to pcre2test when it is 306 finding the minimum heap requirement for a match. 
*/ 307 308 #define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int)) 309 310 #define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */ 311 #define RWS_RSIZE 1000 /* Work size for recursion */ 312 #define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */ 313 #define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */ 314 315 /* This structure is at the start of each workspace block. */ 316 317 typedef struct RWS_anchor { 318 struct RWS_anchor *next; 319 unsigned int size; /* Number of ints */ 320 unsigned int free; /* Number of ints */ 321 } RWS_anchor; 322 323 #define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int)) 324 325 326 327 /************************************************* 328 * Process a callout * 329 *************************************************/ 330 331 /* This function is called to perform a callout. 332 333 Arguments: 334 code current code pointer 335 offsets points to current capture offsets 336 current_subject start of current subject match 337 ptr current position in subject 338 mb the match block 339 extracode extra code offset when called from condition 340 lengthptr where to return the callout length 341 342 Returns: the return from the callout 343 */ 344 345 static int 346 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject, 347 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode, 348 PCRE2_SIZE *lengthptr) 349 { 350 pcre2_callout_block *cb = mb->cb; 351 352 *lengthptr = (code[extracode] == OP_CALLOUT)? 353 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] : 354 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode); 355 356 if (mb->callout == NULL) return 0; /* No callout provided */ 357 358 /* Fixed fields in the callout block are set once and for all at the start of 359 matching. 
*/ 360 361 cb->offset_vector = offsets; 362 cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject); 363 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject); 364 cb->pattern_position = GET(code, 1 + extracode); 365 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode); 366 367 if (code[extracode] == OP_CALLOUT) 368 { 369 cb->callout_number = code[1 + 2*LINK_SIZE + extracode]; 370 cb->callout_string_offset = 0; 371 cb->callout_string = NULL; 372 cb->callout_string_length = 0; 373 } 374 else 375 { 376 cb->callout_number = 0; 377 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode); 378 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1; 379 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2; 380 } 381 382 return (mb->callout)(cb, mb->callout_data); 383 } 384 385 386 387 /************************************************* 388 * Expand local workspace memory * 389 *************************************************/ 390 391 /* This function is called when internal_dfa_match() is about to be called 392 recursively and there is insufficient working space left in the current 393 workspace block. If there's an existing next block, use it; otherwise get a new 394 block unless the heap limit is reached. 395 396 Arguments: 397 rwsptr pointer to block pointer (updated) 398 ovecsize space needed for an ovector 399 mb the match block 400 401 Returns: 0 rwsptr has been updated 402 !0 an error code 403 */ 404 405 static int 406 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb) 407 { 408 RWS_anchor *rws = *rwsptr; 409 RWS_anchor *new; 410 411 if (rws->next != NULL) 412 { 413 new = rws->next; 414 } 415 416 /* All sizes are in units of sizeof(int), except for mb->heaplimit, which is in 417 kibibytes. 
*/ 418 419 else 420 { 421 unsigned int newsize = rws->size * 2; 422 unsigned int heapleft = (unsigned int) 423 (((1024/sizeof(int))*mb->heap_limit - mb->heap_used)); 424 if (newsize > heapleft) newsize = heapleft; 425 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE) 426 return PCRE2_ERROR_HEAPLIMIT; 427 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data); 428 if (new == NULL) return PCRE2_ERROR_NOMEMORY; 429 mb->heap_used += newsize; 430 new->next = NULL; 431 new->size = newsize; 432 rws->next = new; 433 } 434 435 new->free = new->size - RWS_ANCHOR_SIZE; 436 *rwsptr = new; 437 return 0; 438 } 439 440 441 442 /************************************************* 443 * Match a Regular Expression - DFA engine * 444 *************************************************/ 445 446 /* This internal function applies a compiled pattern to a subject string, 447 starting at a given point, using a DFA engine. This function is called from the 448 external one, possibly multiple times if the pattern is not anchored. The 449 function calls itself recursively for some kinds of subpattern. 450 451 Arguments: 452 mb the match_data block with fixed information 453 this_start_code the opening bracket of this subexpression's code 454 current_subject where we currently are in the subject string 455 start_offset start offset in the subject string 456 offsets vector to contain the matching string offsets 457 offsetcount size of same 458 workspace vector of workspace 459 wscount size of same 460 rlevel function call recursion level 461 462 Returns: > 0 => number of match offset pairs placed in offsets 463 = 0 => offsets overflowed; longest matches are present 464 -1 => failed to match 465 < -1 => some kind of unexpected problem 466 467 The following macros are used for adding states to the two state vectors (one 468 for the current character, one for the following character). 
*/ 469 470 #define ADD_ACTIVE(x,y) \ 471 if (active_count++ < wscount) \ 472 { \ 473 next_active_state->offset = (x); \ 474 next_active_state->count = (y); \ 475 next_active_state++; \ 476 } \ 477 else return PCRE2_ERROR_DFA_WSSIZE 478 479 #define ADD_ACTIVE_DATA(x,y,z) \ 480 if (active_count++ < wscount) \ 481 { \ 482 next_active_state->offset = (x); \ 483 next_active_state->count = (y); \ 484 next_active_state->data = (z); \ 485 next_active_state++; \ 486 } \ 487 else return PCRE2_ERROR_DFA_WSSIZE 488 489 #define ADD_NEW(x,y) \ 490 if (new_count++ < wscount) \ 491 { \ 492 next_new_state->offset = (x); \ 493 next_new_state->count = (y); \ 494 next_new_state++; \ 495 } \ 496 else return PCRE2_ERROR_DFA_WSSIZE 497 498 #define ADD_NEW_DATA(x,y,z) \ 499 if (new_count++ < wscount) \ 500 { \ 501 next_new_state->offset = (x); \ 502 next_new_state->count = (y); \ 503 next_new_state->data = (z); \ 504 next_new_state++; \ 505 } \ 506 else return PCRE2_ERROR_DFA_WSSIZE 507 508 /* And now, here is the code */ 509 510 static int 511 internal_dfa_match( 512 dfa_match_block *mb, 513 PCRE2_SPTR this_start_code, 514 PCRE2_SPTR current_subject, 515 PCRE2_SIZE start_offset, 516 PCRE2_SIZE *offsets, 517 uint32_t offsetcount, 518 int *workspace, 519 int wscount, 520 uint32_t rlevel, 521 int *RWS) 522 { 523 stateblock *active_states, *new_states, *temp_states; 524 stateblock *next_active_state, *next_new_state; 525 const uint8_t *ctypes, *lcc, *fcc; 526 PCRE2_SPTR ptr; 527 PCRE2_SPTR end_code; 528 dfa_recursion_info new_recursive; 529 int active_count, new_count, match_count; 530 531 /* Some fields in the mb block are frequently referenced, so we load them into 532 independent variables in the hope that this will perform better. 
*/ 533 534 PCRE2_SPTR start_subject = mb->start_subject; 535 PCRE2_SPTR end_subject = mb->end_subject; 536 PCRE2_SPTR start_code = mb->start_code; 537 538 #ifdef SUPPORT_UNICODE 539 BOOL utf = (mb->poptions & PCRE2_UTF) != 0; 540 #else 541 BOOL utf = FALSE; 542 #endif 543 544 BOOL reset_could_continue = FALSE; 545 546 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; 547 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; 548 offsetcount &= (uint32_t)(-2); /* Round down */ 549 550 wscount -= 2; 551 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / 552 (2 * INTS_PER_STATEBLOCK); 553 554 ctypes = mb->tables + ctypes_offset; 555 lcc = mb->tables + lcc_offset; 556 fcc = mb->tables + fcc_offset; 557 558 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */ 559 560 active_states = (stateblock *)(workspace + 2); 561 next_new_state = new_states = active_states + wscount; 562 new_count = 0; 563 564 /* The first thing in any (sub) pattern is a bracket of some sort. Push all 565 the alternative states onto the list, and find out where the end is. This 566 makes is possible to use this function recursively, when we want to stop at a 567 matching internal ket rather than at the end. 568 569 If we are dealing with a backward assertion we have to find out the maximum 570 amount to move back, and set up each alternative appropriately. */ 571 572 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT) 573 { 574 size_t max_back = 0; 575 size_t gone_back; 576 577 end_code = this_start_code; 578 do 579 { 580 size_t back = (size_t)GET(end_code, 2+LINK_SIZE); 581 if (back > max_back) max_back = back; 582 end_code += GET(end_code, 1); 583 } 584 while (*end_code == OP_ALT); 585 586 /* If we can't go back the amount required for the longest lookbehind 587 pattern, go back as far as we can; some alternatives may still be viable. 
*/ 588 589 #ifdef SUPPORT_UNICODE 590 /* In character mode we have to step back character by character */ 591 592 if (utf) 593 { 594 for (gone_back = 0; gone_back < max_back; gone_back++) 595 { 596 if (current_subject <= start_subject) break; 597 current_subject--; 598 ACROSSCHAR(current_subject > start_subject, current_subject, 599 current_subject--); 600 } 601 } 602 else 603 #endif 604 605 /* In byte-mode we can do this quickly. */ 606 607 { 608 size_t current_offset = (size_t)(current_subject - start_subject); 609 gone_back = (current_offset < max_back)? current_offset : max_back; 610 current_subject -= gone_back; 611 } 612 613 /* Save the earliest consulted character */ 614 615 if (current_subject < mb->start_used_ptr) 616 mb->start_used_ptr = current_subject; 617 618 /* Now we can process the individual branches. There will be an OP_REVERSE at 619 the start of each branch, except when the length of the branch is zero. */ 620 621 end_code = this_start_code; 622 do 623 { 624 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0; 625 size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE); 626 if (back <= gone_back) 627 { 628 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen); 629 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back)); 630 } 631 end_code += GET(end_code, 1); 632 } 633 while (*end_code == OP_ALT); 634 } 635 636 /* This is the code for a "normal" subpattern (not a backward assertion). The 637 start of a whole pattern is always one of these. If we are at the top level, 638 we may be asked to restart matching from the same point that we reached for a 639 previous partial match. We still have to scan through the top-level branches to 640 find the end state. 
*/ 641 642 else 643 { 644 end_code = this_start_code; 645 646 /* Restarting */ 647 648 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0) 649 { 650 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); 651 new_count = workspace[1]; 652 if (!workspace[0]) 653 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock)); 654 } 655 656 /* Not restarting */ 657 658 else 659 { 660 int length = 1 + LINK_SIZE + 661 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 662 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 663 ? IMM2_SIZE:0); 664 do 665 { 666 ADD_NEW((int)(end_code - start_code + length), 0); 667 end_code += GET(end_code, 1); 668 length = 1 + LINK_SIZE; 669 } 670 while (*end_code == OP_ALT); 671 } 672 } 673 674 workspace[0] = 0; /* Bit indicating which vector is current */ 675 676 /* Loop for scanning the subject */ 677 678 ptr = current_subject; 679 for (;;) 680 { 681 int i, j; 682 int clen, dlen; 683 uint32_t c, d; 684 int forced_fail = 0; 685 BOOL partial_newline = FALSE; 686 BOOL could_continue = reset_could_continue; 687 reset_could_continue = FALSE; 688 689 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr; 690 691 /* Make the new state list into the active state list and empty the 692 new state list. */ 693 694 temp_states = active_states; 695 active_states = new_states; 696 new_states = temp_states; 697 active_count = new_count; 698 new_count = 0; 699 700 workspace[0] ^= 1; /* Remember for the restarting feature */ 701 workspace[1] = active_count; 702 703 /* Set the pointers for adding new states */ 704 705 next_active_state = active_states + active_count; 706 next_new_state = new_states; 707 708 /* Load the current character from the subject outside the loop, as many 709 different states may want to look at it, and we assume that at least one 710 will. 
*/ 711 712 if (ptr < end_subject) 713 { 714 clen = 1; /* Number of data items in the character */ 715 #ifdef SUPPORT_UNICODE 716 GETCHARLENTEST(c, ptr, clen); 717 #else 718 c = *ptr; 719 #endif /* SUPPORT_UNICODE */ 720 } 721 else 722 { 723 clen = 0; /* This indicates the end of the subject */ 724 c = NOTACHAR; /* This value should never actually be used */ 725 } 726 727 /* Scan up the active states and act on each one. The result of an action 728 may be to add more states to the currently active list (e.g. on hitting a 729 parenthesis) or it may be to put states on the new list, for considering 730 when we move the character pointer on. */ 731 732 for (i = 0; i < active_count; i++) 733 { 734 stateblock *current_state = active_states + i; 735 BOOL caseless = FALSE; 736 PCRE2_SPTR code; 737 uint32_t codevalue; 738 int state_offset = current_state->offset; 739 int rrc; 740 int count; 741 742 /* A negative offset is a special case meaning "hold off going to this 743 (negated) state until the number of characters in the data field have 744 been skipped". If the could_continue flag was passed over from a previous 745 state, arrange for it to passed on. */ 746 747 if (state_offset < 0) 748 { 749 if (current_state->data > 0) 750 { 751 ADD_NEW_DATA(state_offset, current_state->count, 752 current_state->data - 1); 753 if (could_continue) reset_could_continue = TRUE; 754 continue; 755 } 756 else 757 { 758 current_state->offset = state_offset = -state_offset; 759 } 760 } 761 762 /* Check for a duplicate state with the same count, and skip if found. 763 See the note at the head of this module about the possibility of improving 764 performance here. 
*/ 765 766 for (j = 0; j < i; j++) 767 { 768 if (active_states[j].offset == state_offset && 769 active_states[j].count == current_state->count) 770 goto NEXT_ACTIVE_STATE; 771 } 772 773 /* The state offset is the offset to the opcode */ 774 775 code = start_code + state_offset; 776 codevalue = *code; 777 778 /* If this opcode inspects a character, but we are at the end of the 779 subject, remember the fact for use when testing for a partial match. */ 780 781 if (clen == 0 && poptable[codevalue] != 0) 782 could_continue = TRUE; 783 784 /* If this opcode is followed by an inline character, load it. It is 785 tempting to test for the presence of a subject character here, but that 786 is wrong, because sometimes zero repetitions of the subject are 787 permitted. 788 789 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an 790 argument that is not a data character - but is always one byte long because 791 the values are small. We have to take special action to deal with \P, \p, 792 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert 793 these ones to new opcodes. */ 794 795 if (coptable[codevalue] > 0) 796 { 797 dlen = 1; 798 #ifdef SUPPORT_UNICODE 799 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else 800 #endif /* SUPPORT_UNICODE */ 801 d = code[coptable[codevalue]]; 802 if (codevalue >= OP_TYPESTAR) 803 { 804 switch(d) 805 { 806 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM; 807 case OP_NOTPROP: 808 case OP_PROP: codevalue += OP_PROP_EXTRA; break; 809 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; 810 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; 811 case OP_NOT_HSPACE: 812 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; 813 case OP_NOT_VSPACE: 814 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; 815 default: break; 816 } 817 } 818 } 819 else 820 { 821 dlen = 0; /* Not strictly necessary, but compilers moan */ 822 d = NOTACHAR; /* if these variables are not set. 
*/ 823 } 824 825 826 /* Now process the individual opcodes */ 827 828 switch (codevalue) 829 { 830 /* ========================================================================== */ 831 /* These cases are never obeyed. This is a fudge that causes a compile- 832 time error if the vectors coptable or poptable, which are indexed by 833 opcode, are not the correct length. It seems to be the only way to do 834 such a check at compile time, as the sizeof() operator does not work 835 in the C preprocessor. */ 836 837 case OP_TABLE_LENGTH: 838 case OP_TABLE_LENGTH + 839 ((sizeof(coptable) == OP_TABLE_LENGTH) && 840 (sizeof(poptable) == OP_TABLE_LENGTH)): 841 return 0; 842 843 /* ========================================================================== */ 844 /* Reached a closing bracket. If not at the end of the pattern, carry 845 on with the next opcode. For repeating opcodes, also add the repeat 846 state. Note that KETRPOS will always be encountered at the end of the 847 subpattern, because the possessive subpattern repeats are always handled 848 using recursive calls. Thus, it never adds any new states. 849 850 At the end of the (sub)pattern, unless we have an empty string and 851 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the 852 start of the subject, save the match data, shifting up all previous 853 matches so we always have the longest first. */ 854 855 case OP_KET: 856 case OP_KETRMIN: 857 case OP_KETRMAX: 858 case OP_KETRPOS: 859 if (code != end_code) 860 { 861 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); 862 if (codevalue != OP_KET) 863 { 864 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0); 865 } 866 } 867 else 868 { 869 if (ptr > current_subject || 870 ((mb->moptions & PCRE2_NOTEMPTY) == 0 && 871 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 || 872 current_subject > start_subject + mb->start_offset))) 873 { 874 if (match_count < 0) match_count = (offsetcount >= 2)? 
1 : 0; 875 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount) 876 match_count = 0; 877 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2; 878 if (count > 0) (void)memmove(offsets + 2, offsets, 879 (size_t)count * sizeof(PCRE2_SIZE)); 880 if (offsetcount >= 2) 881 { 882 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject); 883 offsets[1] = (PCRE2_SIZE)(ptr - start_subject); 884 } 885 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count; 886 } 887 } 888 break; 889 890 /* ========================================================================== */ 891 /* These opcodes add to the current list of states without looking 892 at the current character. */ 893 894 /*-----------------------------------------------------------------*/ 895 case OP_ALT: 896 do { code += GET(code, 1); } while (*code == OP_ALT); 897 ADD_ACTIVE((int)(code - start_code), 0); 898 break; 899 900 /*-----------------------------------------------------------------*/ 901 case OP_BRA: 902 case OP_SBRA: 903 do 904 { 905 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 906 code += GET(code, 1); 907 } 908 while (*code == OP_ALT); 909 break; 910 911 /*-----------------------------------------------------------------*/ 912 case OP_CBRA: 913 case OP_SCBRA: 914 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); 915 code += GET(code, 1); 916 while (*code == OP_ALT) 917 { 918 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 919 code += GET(code, 1); 920 } 921 break; 922 923 /*-----------------------------------------------------------------*/ 924 case OP_BRAZERO: 925 case OP_BRAMINZERO: 926 ADD_ACTIVE(state_offset + 1, 0); 927 code += 1 + GET(code, 2); 928 while (*code == OP_ALT) code += GET(code, 1); 929 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 930 break; 931 932 /*-----------------------------------------------------------------*/ 933 case OP_SKIPZERO: 934 code += 1 + GET(code, 2); 935 while (*code == OP_ALT) 
code += GET(code, 1); 936 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 937 break; 938 939 /*-----------------------------------------------------------------*/ 940 case OP_CIRC: 941 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) 942 { ADD_ACTIVE(state_offset + 1, 0); } 943 break; 944 945 /*-----------------------------------------------------------------*/ 946 case OP_CIRCM: 947 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) || 948 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 ) 949 && WAS_NEWLINE(ptr))) 950 { ADD_ACTIVE(state_offset + 1, 0); } 951 break; 952 953 /*-----------------------------------------------------------------*/ 954 case OP_EOD: 955 if (ptr >= end_subject) 956 { 957 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 958 could_continue = TRUE; 959 else { ADD_ACTIVE(state_offset + 1, 0); } 960 } 961 break; 962 963 /*-----------------------------------------------------------------*/ 964 case OP_SOD: 965 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } 966 break; 967 968 /*-----------------------------------------------------------------*/ 969 case OP_SOM: 970 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } 971 break; 972 973 974 /* ========================================================================== */ 975 /* These opcodes inspect the next subject character, and sometimes 976 the previous one as well, but do not have an argument. The variable 977 clen contains the length of the current character and is zero if we are 978 at the end of the subject. 
*/ 979 980 /*-----------------------------------------------------------------*/ 981 case OP_ANY: 982 if (clen > 0 && !IS_NEWLINE(ptr)) 983 { 984 if (ptr + 1 >= mb->end_subject && 985 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 986 NLBLOCK->nltype == NLTYPE_FIXED && 987 NLBLOCK->nllen == 2 && 988 c == NLBLOCK->nl[0]) 989 { 990 could_continue = partial_newline = TRUE; 991 } 992 else 993 { 994 ADD_NEW(state_offset + 1, 0); 995 } 996 } 997 break; 998 999 /*-----------------------------------------------------------------*/ 1000 case OP_ALLANY: 1001 if (clen > 0) 1002 { ADD_NEW(state_offset + 1, 0); } 1003 break; 1004 1005 /*-----------------------------------------------------------------*/ 1006 case OP_EODN: 1007 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 1008 could_continue = TRUE; 1009 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen)) 1010 { ADD_ACTIVE(state_offset + 1, 0); } 1011 break; 1012 1013 /*-----------------------------------------------------------------*/ 1014 case OP_DOLL: 1015 if ((mb->moptions & PCRE2_NOTEOL) == 0) 1016 { 1017 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 1018 could_continue = TRUE; 1019 else if (clen == 0 || 1020 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && 1021 (ptr == end_subject - mb->nllen) 1022 )) 1023 { ADD_ACTIVE(state_offset + 1, 0); } 1024 else if (ptr + 1 >= mb->end_subject && 1025 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && 1026 NLBLOCK->nltype == NLTYPE_FIXED && 1027 NLBLOCK->nllen == 2 && 1028 c == NLBLOCK->nl[0]) 1029 { 1030 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 1031 { 1032 reset_could_continue = TRUE; 1033 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 1034 } 1035 else could_continue = partial_newline = TRUE; 1036 } 1037 } 1038 break; 1039 1040 /*-----------------------------------------------------------------*/ 1041 case OP_DOLLM: 1042 if ((mb->moptions & PCRE2_NOTEOL) == 0) 1043 { 1044 if (clen == 0 && (mb->moptions & 
PCRE2_PARTIAL_HARD) != 0) 1045 could_continue = TRUE; 1046 else if (clen == 0 || 1047 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) 1048 { ADD_ACTIVE(state_offset + 1, 0); } 1049 else if (ptr + 1 >= mb->end_subject && 1050 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && 1051 NLBLOCK->nltype == NLTYPE_FIXED && 1052 NLBLOCK->nllen == 2 && 1053 c == NLBLOCK->nl[0]) 1054 { 1055 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 1056 { 1057 reset_could_continue = TRUE; 1058 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 1059 } 1060 else could_continue = partial_newline = TRUE; 1061 } 1062 } 1063 else if (IS_NEWLINE(ptr)) 1064 { ADD_ACTIVE(state_offset + 1, 0); } 1065 break; 1066 1067 /*-----------------------------------------------------------------*/ 1068 1069 case OP_DIGIT: 1070 case OP_WHITESPACE: 1071 case OP_WORDCHAR: 1072 if (clen > 0 && c < 256 && 1073 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) 1074 { ADD_NEW(state_offset + 1, 0); } 1075 break; 1076 1077 /*-----------------------------------------------------------------*/ 1078 case OP_NOT_DIGIT: 1079 case OP_NOT_WHITESPACE: 1080 case OP_NOT_WORDCHAR: 1081 if (clen > 0 && (c >= 256 || 1082 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) 1083 { ADD_NEW(state_offset + 1, 0); } 1084 break; 1085 1086 /*-----------------------------------------------------------------*/ 1087 case OP_WORD_BOUNDARY: 1088 case OP_NOT_WORD_BOUNDARY: 1089 { 1090 int left_word, right_word; 1091 1092 if (ptr > start_subject) 1093 { 1094 PCRE2_SPTR temp = ptr - 1; 1095 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp; 1096 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 1097 if (utf) { BACKCHAR(temp); } 1098 #endif 1099 GETCHARTEST(d, temp); 1100 #ifdef SUPPORT_UNICODE 1101 if ((mb->poptions & PCRE2_UCP) != 0) 1102 { 1103 if (d == '_') left_word = TRUE; else 1104 { 1105 uint32_t cat = UCD_CATEGORY(d); 1106 left_word = (cat == ucp_L || cat == ucp_N); 1107 } 
1108 } 1109 else 1110 #endif 1111 left_word = d < 256 && (ctypes[d] & ctype_word) != 0; 1112 } 1113 else left_word = FALSE; 1114 1115 if (clen > 0) 1116 { 1117 if (ptr >= mb->last_used_ptr) 1118 { 1119 PCRE2_SPTR temp = ptr + 1; 1120 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 1121 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); } 1122 #endif 1123 mb->last_used_ptr = temp; 1124 } 1125 #ifdef SUPPORT_UNICODE 1126 if ((mb->poptions & PCRE2_UCP) != 0) 1127 { 1128 if (c == '_') right_word = TRUE; else 1129 { 1130 uint32_t cat = UCD_CATEGORY(c); 1131 right_word = (cat == ucp_L || cat == ucp_N); 1132 } 1133 } 1134 else 1135 #endif 1136 right_word = c < 256 && (ctypes[c] & ctype_word) != 0; 1137 } 1138 else right_word = FALSE; 1139 1140 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) 1141 { ADD_ACTIVE(state_offset + 1, 0); } 1142 } 1143 break; 1144 1145 1146 /*-----------------------------------------------------------------*/ 1147 /* Check the next character by Unicode property. We will get here only 1148 if the support is in the binary; otherwise a compile-time error occurs. 1149 */ 1150 1151 #ifdef SUPPORT_UNICODE 1152 case OP_PROP: 1153 case OP_NOTPROP: 1154 if (clen > 0) 1155 { 1156 BOOL OK; 1157 const uint32_t *cp; 1158 const ucd_record * prop = GET_UCD(c); 1159 switch(code[1]) 1160 { 1161 case PT_ANY: 1162 OK = TRUE; 1163 break; 1164 1165 case PT_LAMP: 1166 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1167 prop->chartype == ucp_Lt; 1168 break; 1169 1170 case PT_GC: 1171 OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; 1172 break; 1173 1174 case PT_PC: 1175 OK = prop->chartype == code[2]; 1176 break; 1177 1178 case PT_SC: 1179 OK = prop->script == code[2]; 1180 break; 1181 1182 /* These are specials for combination cases. 
*/ 1183 1184 case PT_ALNUM: 1185 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1186 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1187 break; 1188 1189 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1190 which means that Perl space and POSIX space are now identical. PCRE 1191 was changed at release 8.34. */ 1192 1193 case PT_SPACE: /* Perl space */ 1194 case PT_PXSPACE: /* POSIX space */ 1195 switch(c) 1196 { 1197 HSPACE_CASES: 1198 VSPACE_CASES: 1199 OK = TRUE; 1200 break; 1201 1202 default: 1203 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1204 break; 1205 } 1206 break; 1207 1208 case PT_WORD: 1209 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1210 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1211 c == CHAR_UNDERSCORE; 1212 break; 1213 1214 case PT_CLIST: 1215 cp = PRIV(ucd_caseless_sets) + code[2]; 1216 for (;;) 1217 { 1218 if (c < *cp) { OK = FALSE; break; } 1219 if (c == *cp++) { OK = TRUE; break; } 1220 } 1221 break; 1222 1223 case PT_UCNC: 1224 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1225 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1226 c >= 0xe000; 1227 break; 1228 1229 /* Should never occur, but keep compilers from grumbling. */ 1230 1231 default: 1232 OK = codevalue != OP_PROP; 1233 break; 1234 } 1235 1236 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } 1237 } 1238 break; 1239 #endif 1240 1241 1242 1243 /* ========================================================================== */ 1244 /* These opcodes likewise inspect the subject character, but have an 1245 argument that is not a data character. It is one of these opcodes: 1246 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, 1247 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. 
*/ 1248 1249 case OP_TYPEPLUS: 1250 case OP_TYPEMINPLUS: 1251 case OP_TYPEPOSPLUS: 1252 count = current_state->count; /* Already matched */ 1253 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1254 if (clen > 0) 1255 { 1256 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1257 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1258 NLBLOCK->nltype == NLTYPE_FIXED && 1259 NLBLOCK->nllen == 2 && 1260 c == NLBLOCK->nl[0]) 1261 { 1262 could_continue = partial_newline = TRUE; 1263 } 1264 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1265 (c < 256 && 1266 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1267 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1268 { 1269 if (count > 0 && codevalue == OP_TYPEPOSPLUS) 1270 { 1271 active_count--; /* Remove non-match possibility */ 1272 next_active_state--; 1273 } 1274 count++; 1275 ADD_NEW(state_offset, count); 1276 } 1277 } 1278 break; 1279 1280 /*-----------------------------------------------------------------*/ 1281 case OP_TYPEQUERY: 1282 case OP_TYPEMINQUERY: 1283 case OP_TYPEPOSQUERY: 1284 ADD_ACTIVE(state_offset + 2, 0); 1285 if (clen > 0) 1286 { 1287 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1288 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1289 NLBLOCK->nltype == NLTYPE_FIXED && 1290 NLBLOCK->nllen == 2 && 1291 c == NLBLOCK->nl[0]) 1292 { 1293 could_continue = partial_newline = TRUE; 1294 } 1295 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1296 (c < 256 && 1297 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1298 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1299 { 1300 if (codevalue == OP_TYPEPOSQUERY) 1301 { 1302 active_count--; /* Remove non-match possibility */ 1303 next_active_state--; 1304 } 1305 ADD_NEW(state_offset + 2, 0); 1306 } 1307 } 1308 break; 1309 1310 /*-----------------------------------------------------------------*/ 1311 case OP_TYPESTAR: 1312 case OP_TYPEMINSTAR: 1313 case OP_TYPEPOSSTAR: 1314 ADD_ACTIVE(state_offset + 2, 0); 1315 if 
(clen > 0) 1316 { 1317 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1318 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1319 NLBLOCK->nltype == NLTYPE_FIXED && 1320 NLBLOCK->nllen == 2 && 1321 c == NLBLOCK->nl[0]) 1322 { 1323 could_continue = partial_newline = TRUE; 1324 } 1325 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1326 (c < 256 && 1327 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1328 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1329 { 1330 if (codevalue == OP_TYPEPOSSTAR) 1331 { 1332 active_count--; /* Remove non-match possibility */ 1333 next_active_state--; 1334 } 1335 ADD_NEW(state_offset, 0); 1336 } 1337 } 1338 break; 1339 1340 /*-----------------------------------------------------------------*/ 1341 case OP_TYPEEXACT: 1342 count = current_state->count; /* Number already matched */ 1343 if (clen > 0) 1344 { 1345 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1346 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1347 NLBLOCK->nltype == NLTYPE_FIXED && 1348 NLBLOCK->nllen == 2 && 1349 c == NLBLOCK->nl[0]) 1350 { 1351 could_continue = partial_newline = TRUE; 1352 } 1353 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1354 (c < 256 && 1355 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1356 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1357 { 1358 if (++count >= (int)GET2(code, 1)) 1359 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } 1360 else 1361 { ADD_NEW(state_offset, count); } 1362 } 1363 } 1364 break; 1365 1366 /*-----------------------------------------------------------------*/ 1367 case OP_TYPEUPTO: 1368 case OP_TYPEMINUPTO: 1369 case OP_TYPEPOSUPTO: 1370 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); 1371 count = current_state->count; /* Number already matched */ 1372 if (clen > 0) 1373 { 1374 if (d == OP_ANY && ptr + 1 >= mb->end_subject && 1375 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 && 1376 NLBLOCK->nltype == NLTYPE_FIXED && 1377 NLBLOCK->nllen == 2 && 1378 c == 
NLBLOCK->nl[0]) 1379 { 1380 could_continue = partial_newline = TRUE; 1381 } 1382 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1383 (c < 256 && 1384 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1385 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1386 { 1387 if (codevalue == OP_TYPEPOSUPTO) 1388 { 1389 active_count--; /* Remove non-match possibility */ 1390 next_active_state--; 1391 } 1392 if (++count >= (int)GET2(code, 1)) 1393 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } 1394 else 1395 { ADD_NEW(state_offset, count); } 1396 } 1397 } 1398 break; 1399 1400 /* ========================================================================== */ 1401 /* These are virtual opcodes that are used when something like 1402 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its 1403 argument. It keeps the code above fast for the other cases. The argument 1404 is in the d variable. */ 1405 1406 #ifdef SUPPORT_UNICODE 1407 case OP_PROP_EXTRA + OP_TYPEPLUS: 1408 case OP_PROP_EXTRA + OP_TYPEMINPLUS: 1409 case OP_PROP_EXTRA + OP_TYPEPOSPLUS: 1410 count = current_state->count; /* Already matched */ 1411 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } 1412 if (clen > 0) 1413 { 1414 BOOL OK; 1415 const uint32_t *cp; 1416 const ucd_record * prop = GET_UCD(c); 1417 switch(code[2]) 1418 { 1419 case PT_ANY: 1420 OK = TRUE; 1421 break; 1422 1423 case PT_LAMP: 1424 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1425 prop->chartype == ucp_Lt; 1426 break; 1427 1428 case PT_GC: 1429 OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1430 break; 1431 1432 case PT_PC: 1433 OK = prop->chartype == code[3]; 1434 break; 1435 1436 case PT_SC: 1437 OK = prop->script == code[3]; 1438 break; 1439 1440 /* These are specials for combination cases. 
*/ 1441 1442 case PT_ALNUM: 1443 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1444 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1445 break; 1446 1447 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1448 which means that Perl space and POSIX space are now identical. PCRE 1449 was changed at release 8.34. */ 1450 1451 case PT_SPACE: /* Perl space */ 1452 case PT_PXSPACE: /* POSIX space */ 1453 switch(c) 1454 { 1455 HSPACE_CASES: 1456 VSPACE_CASES: 1457 OK = TRUE; 1458 break; 1459 1460 default: 1461 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1462 break; 1463 } 1464 break; 1465 1466 case PT_WORD: 1467 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1468 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1469 c == CHAR_UNDERSCORE; 1470 break; 1471 1472 case PT_CLIST: 1473 cp = PRIV(ucd_caseless_sets) + code[3]; 1474 for (;;) 1475 { 1476 if (c < *cp) { OK = FALSE; break; } 1477 if (c == *cp++) { OK = TRUE; break; } 1478 } 1479 break; 1480 1481 case PT_UCNC: 1482 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1483 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1484 c >= 0xe000; 1485 break; 1486 1487 /* Should never occur, but keep compilers from grumbling. 
*/ 1488 1489 default: 1490 OK = codevalue != OP_PROP; 1491 break; 1492 } 1493 1494 if (OK == (d == OP_PROP)) 1495 { 1496 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) 1497 { 1498 active_count--; /* Remove non-match possibility */ 1499 next_active_state--; 1500 } 1501 count++; 1502 ADD_NEW(state_offset, count); 1503 } 1504 } 1505 break; 1506 1507 /*-----------------------------------------------------------------*/ 1508 case OP_EXTUNI_EXTRA + OP_TYPEPLUS: 1509 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: 1510 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: 1511 count = current_state->count; /* Already matched */ 1512 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1513 if (clen > 0) 1514 { 1515 int ncount = 0; 1516 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) 1517 { 1518 active_count--; /* Remove non-match possibility */ 1519 next_active_state--; 1520 } 1521 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, 1522 &ncount); 1523 count++; 1524 ADD_NEW_DATA(-state_offset, count, ncount); 1525 } 1526 break; 1527 #endif 1528 1529 /*-----------------------------------------------------------------*/ 1530 case OP_ANYNL_EXTRA + OP_TYPEPLUS: 1531 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: 1532 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: 1533 count = current_state->count; /* Already matched */ 1534 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1535 if (clen > 0) 1536 { 1537 int ncount = 0; 1538 switch (c) 1539 { 1540 case CHAR_VT: 1541 case CHAR_FF: 1542 case CHAR_NEL: 1543 #ifndef EBCDIC 1544 case 0x2028: 1545 case 0x2029: 1546 #endif /* Not EBCDIC */ 1547 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 1548 goto ANYNL01; 1549 1550 case CHAR_CR: 1551 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; 1552 /* Fall through */ 1553 1554 ANYNL01: 1555 case CHAR_LF: 1556 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) 1557 { 1558 active_count--; /* Remove non-match possibility */ 1559 
next_active_state--; 1560 } 1561 count++; 1562 ADD_NEW_DATA(-state_offset, count, ncount); 1563 break; 1564 1565 default: 1566 break; 1567 } 1568 } 1569 break; 1570 1571 /*-----------------------------------------------------------------*/ 1572 case OP_VSPACE_EXTRA + OP_TYPEPLUS: 1573 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: 1574 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: 1575 count = current_state->count; /* Already matched */ 1576 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1577 if (clen > 0) 1578 { 1579 BOOL OK; 1580 switch (c) 1581 { 1582 VSPACE_CASES: 1583 OK = TRUE; 1584 break; 1585 1586 default: 1587 OK = FALSE; 1588 break; 1589 } 1590 1591 if (OK == (d == OP_VSPACE)) 1592 { 1593 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) 1594 { 1595 active_count--; /* Remove non-match possibility */ 1596 next_active_state--; 1597 } 1598 count++; 1599 ADD_NEW_DATA(-state_offset, count, 0); 1600 } 1601 } 1602 break; 1603 1604 /*-----------------------------------------------------------------*/ 1605 case OP_HSPACE_EXTRA + OP_TYPEPLUS: 1606 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: 1607 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: 1608 count = current_state->count; /* Already matched */ 1609 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1610 if (clen > 0) 1611 { 1612 BOOL OK; 1613 switch (c) 1614 { 1615 HSPACE_CASES: 1616 OK = TRUE; 1617 break; 1618 1619 default: 1620 OK = FALSE; 1621 break; 1622 } 1623 1624 if (OK == (d == OP_HSPACE)) 1625 { 1626 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) 1627 { 1628 active_count--; /* Remove non-match possibility */ 1629 next_active_state--; 1630 } 1631 count++; 1632 ADD_NEW_DATA(-state_offset, count, 0); 1633 } 1634 } 1635 break; 1636 1637 /*-----------------------------------------------------------------*/ 1638 #ifdef SUPPORT_UNICODE 1639 case OP_PROP_EXTRA + OP_TYPEQUERY: 1640 case OP_PROP_EXTRA + OP_TYPEMINQUERY: 1641 case OP_PROP_EXTRA + OP_TYPEPOSQUERY: 1642 count = 4; 1643 goto QS1; 1644 1645 
case OP_PROP_EXTRA + OP_TYPESTAR: 1646 case OP_PROP_EXTRA + OP_TYPEMINSTAR: 1647 case OP_PROP_EXTRA + OP_TYPEPOSSTAR: 1648 count = 0; 1649 1650 QS1: 1651 1652 ADD_ACTIVE(state_offset + 4, 0); 1653 if (clen > 0) 1654 { 1655 BOOL OK; 1656 const uint32_t *cp; 1657 const ucd_record * prop = GET_UCD(c); 1658 switch(code[2]) 1659 { 1660 case PT_ANY: 1661 OK = TRUE; 1662 break; 1663 1664 case PT_LAMP: 1665 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1666 prop->chartype == ucp_Lt; 1667 break; 1668 1669 case PT_GC: 1670 OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1671 break; 1672 1673 case PT_PC: 1674 OK = prop->chartype == code[3]; 1675 break; 1676 1677 case PT_SC: 1678 OK = prop->script == code[3]; 1679 break; 1680 1681 /* These are specials for combination cases. */ 1682 1683 case PT_ALNUM: 1684 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1685 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1686 break; 1687 1688 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1689 which means that Perl space and POSIX space are now identical. PCRE 1690 was changed at release 8.34. 
*/ 1691 1692 case PT_SPACE: /* Perl space */ 1693 case PT_PXSPACE: /* POSIX space */ 1694 switch(c) 1695 { 1696 HSPACE_CASES: 1697 VSPACE_CASES: 1698 OK = TRUE; 1699 break; 1700 1701 default: 1702 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1703 break; 1704 } 1705 break; 1706 1707 case PT_WORD: 1708 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1709 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1710 c == CHAR_UNDERSCORE; 1711 break; 1712 1713 case PT_CLIST: 1714 cp = PRIV(ucd_caseless_sets) + code[3]; 1715 for (;;) 1716 { 1717 if (c < *cp) { OK = FALSE; break; } 1718 if (c == *cp++) { OK = TRUE; break; } 1719 } 1720 break; 1721 1722 case PT_UCNC: 1723 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1724 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1725 c >= 0xe000; 1726 break; 1727 1728 /* Should never occur, but keep compilers from grumbling. */ 1729 1730 default: 1731 OK = codevalue != OP_PROP; 1732 break; 1733 } 1734 1735 if (OK == (d == OP_PROP)) 1736 { 1737 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || 1738 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) 1739 { 1740 active_count--; /* Remove non-match possibility */ 1741 next_active_state--; 1742 } 1743 ADD_NEW(state_offset + count, 0); 1744 } 1745 } 1746 break; 1747 1748 /*-----------------------------------------------------------------*/ 1749 case OP_EXTUNI_EXTRA + OP_TYPEQUERY: 1750 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: 1751 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: 1752 count = 2; 1753 goto QS2; 1754 1755 case OP_EXTUNI_EXTRA + OP_TYPESTAR: 1756 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: 1757 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: 1758 count = 0; 1759 1760 QS2: 1761 1762 ADD_ACTIVE(state_offset + 2, 0); 1763 if (clen > 0) 1764 { 1765 int ncount = 0; 1766 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || 1767 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) 1768 { 1769 active_count--; /* Remove non-match possibility */ 1770 next_active_state--; 1771 } 1772 
(void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, 1773 &ncount); 1774 ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1775 } 1776 break; 1777 #endif 1778 1779 /*-----------------------------------------------------------------*/ 1780 case OP_ANYNL_EXTRA + OP_TYPEQUERY: 1781 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: 1782 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: 1783 count = 2; 1784 goto QS3; 1785 1786 case OP_ANYNL_EXTRA + OP_TYPESTAR: 1787 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: 1788 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: 1789 count = 0; 1790 1791 QS3: 1792 ADD_ACTIVE(state_offset + 2, 0); 1793 if (clen > 0) 1794 { 1795 int ncount = 0; 1796 switch (c) 1797 { 1798 case CHAR_VT: 1799 case CHAR_FF: 1800 case CHAR_NEL: 1801 #ifndef EBCDIC 1802 case 0x2028: 1803 case 0x2029: 1804 #endif /* Not EBCDIC */ 1805 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 1806 goto ANYNL02; 1807 1808 case CHAR_CR: 1809 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; 1810 /* Fall through */ 1811 1812 ANYNL02: 1813 case CHAR_LF: 1814 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || 1815 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) 1816 { 1817 active_count--; /* Remove non-match possibility */ 1818 next_active_state--; 1819 } 1820 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount); 1821 break; 1822 1823 default: 1824 break; 1825 } 1826 } 1827 break; 1828 1829 /*-----------------------------------------------------------------*/ 1830 case OP_VSPACE_EXTRA + OP_TYPEQUERY: 1831 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: 1832 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: 1833 count = 2; 1834 goto QS4; 1835 1836 case OP_VSPACE_EXTRA + OP_TYPESTAR: 1837 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: 1838 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: 1839 count = 0; 1840 1841 QS4: 1842 ADD_ACTIVE(state_offset + 2, 0); 1843 if (clen > 0) 1844 { 1845 BOOL OK; 1846 switch (c) 1847 { 1848 VSPACE_CASES: 1849 OK = TRUE; 1850 break; 1851 1852 default: 1853 OK = FALSE; 1854 
break; 1855 } 1856 if (OK == (d == OP_VSPACE)) 1857 { 1858 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || 1859 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) 1860 { 1861 active_count--; /* Remove non-match possibility */ 1862 next_active_state--; 1863 } 1864 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1865 } 1866 } 1867 break; 1868 1869 /*-----------------------------------------------------------------*/ 1870 case OP_HSPACE_EXTRA + OP_TYPEQUERY: 1871 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: 1872 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: 1873 count = 2; 1874 goto QS5; 1875 1876 case OP_HSPACE_EXTRA + OP_TYPESTAR: 1877 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: 1878 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: 1879 count = 0; 1880 1881 QS5: 1882 ADD_ACTIVE(state_offset + 2, 0); 1883 if (clen > 0) 1884 { 1885 BOOL OK; 1886 switch (c) 1887 { 1888 HSPACE_CASES: 1889 OK = TRUE; 1890 break; 1891 1892 default: 1893 OK = FALSE; 1894 break; 1895 } 1896 1897 if (OK == (d == OP_HSPACE)) 1898 { 1899 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || 1900 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) 1901 { 1902 active_count--; /* Remove non-match possibility */ 1903 next_active_state--; 1904 } 1905 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0); 1906 } 1907 } 1908 break; 1909 1910 /*-----------------------------------------------------------------*/ 1911 #ifdef SUPPORT_UNICODE 1912 case OP_PROP_EXTRA + OP_TYPEEXACT: 1913 case OP_PROP_EXTRA + OP_TYPEUPTO: 1914 case OP_PROP_EXTRA + OP_TYPEMINUPTO: 1915 case OP_PROP_EXTRA + OP_TYPEPOSUPTO: 1916 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) 1917 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } 1918 count = current_state->count; /* Number already matched */ 1919 if (clen > 0) 1920 { 1921 BOOL OK; 1922 const uint32_t *cp; 1923 const ucd_record * prop = GET_UCD(c); 1924 switch(code[1 + IMM2_SIZE + 1]) 1925 { 1926 case PT_ANY: 1927 OK = TRUE; 1928 break; 1929 1930 case PT_LAMP: 1931 OK = prop->chartype == ucp_Lu || 
prop->chartype == ucp_Ll || 1932 prop->chartype == ucp_Lt; 1933 break; 1934 1935 case PT_GC: 1936 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; 1937 break; 1938 1939 case PT_PC: 1940 OK = prop->chartype == code[1 + IMM2_SIZE + 2]; 1941 break; 1942 1943 case PT_SC: 1944 OK = prop->script == code[1 + IMM2_SIZE + 2]; 1945 break; 1946 1947 /* These are specials for combination cases. */ 1948 1949 case PT_ALNUM: 1950 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1951 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1952 break; 1953 1954 /* Perl space used to exclude VT, but from Perl 5.18 it is included, 1955 which means that Perl space and POSIX space are now identical. PCRE 1956 was changed at release 8.34. */ 1957 1958 case PT_SPACE: /* Perl space */ 1959 case PT_PXSPACE: /* POSIX space */ 1960 switch(c) 1961 { 1962 HSPACE_CASES: 1963 VSPACE_CASES: 1964 OK = TRUE; 1965 break; 1966 1967 default: 1968 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; 1969 break; 1970 } 1971 break; 1972 1973 case PT_WORD: 1974 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1975 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1976 c == CHAR_UNDERSCORE; 1977 break; 1978 1979 case PT_CLIST: 1980 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2]; 1981 for (;;) 1982 { 1983 if (c < *cp) { OK = FALSE; break; } 1984 if (c == *cp++) { OK = TRUE; break; } 1985 } 1986 break; 1987 1988 case PT_UCNC: 1989 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || 1990 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || 1991 c >= 0xe000; 1992 break; 1993 1994 /* Should never occur, but keep compilers from grumbling. 
*/ 1995 1996 default: 1997 OK = codevalue != OP_PROP; 1998 break; 1999 } 2000 2001 if (OK == (d == OP_PROP)) 2002 { 2003 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) 2004 { 2005 active_count--; /* Remove non-match possibility */ 2006 next_active_state--; 2007 } 2008 if (++count >= (int)GET2(code, 1)) 2009 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } 2010 else 2011 { ADD_NEW(state_offset, count); } 2012 } 2013 } 2014 break; 2015 2016 /*-----------------------------------------------------------------*/ 2017 case OP_EXTUNI_EXTRA + OP_TYPEEXACT: 2018 case OP_EXTUNI_EXTRA + OP_TYPEUPTO: 2019 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: 2020 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: 2021 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) 2022 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2023 count = current_state->count; /* Number already matched */ 2024 if (clen > 0) 2025 { 2026 PCRE2_SPTR nptr; 2027 int ncount = 0; 2028 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) 2029 { 2030 active_count--; /* Remove non-match possibility */ 2031 next_active_state--; 2032 } 2033 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf, 2034 &ncount); 2035 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 2036 reset_could_continue = TRUE; 2037 if (++count >= (int)GET2(code, 1)) 2038 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 2039 else 2040 { ADD_NEW_DATA(-state_offset, count, ncount); } 2041 } 2042 break; 2043 #endif 2044 2045 /*-----------------------------------------------------------------*/ 2046 case OP_ANYNL_EXTRA + OP_TYPEEXACT: 2047 case OP_ANYNL_EXTRA + OP_TYPEUPTO: 2048 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: 2049 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: 2050 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) 2051 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2052 count = current_state->count; /* Number already matched */ 2053 if (clen > 0) 2054 { 2055 int ncount = 0; 2056 switch (c) 2057 { 2058 case CHAR_VT: 2059 case 
CHAR_FF: 2060 case CHAR_NEL: 2061 #ifndef EBCDIC 2062 case 0x2028: 2063 case 0x2029: 2064 #endif /* Not EBCDIC */ 2065 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 2066 goto ANYNL03; 2067 2068 case CHAR_CR: 2069 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1; 2070 /* Fall through */ 2071 2072 ANYNL03: 2073 case CHAR_LF: 2074 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) 2075 { 2076 active_count--; /* Remove non-match possibility */ 2077 next_active_state--; 2078 } 2079 if (++count >= (int)GET2(code, 1)) 2080 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 2081 else 2082 { ADD_NEW_DATA(-state_offset, count, ncount); } 2083 break; 2084 2085 default: 2086 break; 2087 } 2088 } 2089 break; 2090 2091 /*-----------------------------------------------------------------*/ 2092 case OP_VSPACE_EXTRA + OP_TYPEEXACT: 2093 case OP_VSPACE_EXTRA + OP_TYPEUPTO: 2094 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: 2095 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: 2096 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) 2097 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2098 count = current_state->count; /* Number already matched */ 2099 if (clen > 0) 2100 { 2101 BOOL OK; 2102 switch (c) 2103 { 2104 VSPACE_CASES: 2105 OK = TRUE; 2106 break; 2107 2108 default: 2109 OK = FALSE; 2110 } 2111 2112 if (OK == (d == OP_VSPACE)) 2113 { 2114 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) 2115 { 2116 active_count--; /* Remove non-match possibility */ 2117 next_active_state--; 2118 } 2119 if (++count >= (int)GET2(code, 1)) 2120 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2121 else 2122 { ADD_NEW_DATA(-state_offset, count, 0); } 2123 } 2124 } 2125 break; 2126 2127 /*-----------------------------------------------------------------*/ 2128 case OP_HSPACE_EXTRA + OP_TYPEEXACT: 2129 case OP_HSPACE_EXTRA + OP_TYPEUPTO: 2130 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: 2131 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: 2132 if (codevalue != OP_HSPACE_EXTRA + 
OP_TYPEEXACT) 2133 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2134 count = current_state->count; /* Number already matched */ 2135 if (clen > 0) 2136 { 2137 BOOL OK; 2138 switch (c) 2139 { 2140 HSPACE_CASES: 2141 OK = TRUE; 2142 break; 2143 2144 default: 2145 OK = FALSE; 2146 break; 2147 } 2148 2149 if (OK == (d == OP_HSPACE)) 2150 { 2151 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) 2152 { 2153 active_count--; /* Remove non-match possibility */ 2154 next_active_state--; 2155 } 2156 if (++count >= (int)GET2(code, 1)) 2157 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2158 else 2159 { ADD_NEW_DATA(-state_offset, count, 0); } 2160 } 2161 } 2162 break; 2163 2164 /* ========================================================================== */ 2165 /* These opcodes are followed by a character that is usually compared 2166 to the current subject character; it is loaded into d. We still get 2167 here even if there is no subject character, because in some cases zero 2168 repetitions are permitted. 
*/ 2169 2170 /*-----------------------------------------------------------------*/ 2171 case OP_CHAR: 2172 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } 2173 break; 2174 2175 /*-----------------------------------------------------------------*/ 2176 case OP_CHARI: 2177 if (clen == 0) break; 2178 2179 #ifdef SUPPORT_UNICODE 2180 if (utf) 2181 { 2182 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else 2183 { 2184 unsigned int othercase; 2185 if (c < 128) 2186 othercase = fcc[c]; 2187 else 2188 othercase = UCD_OTHERCASE(c); 2189 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } 2190 } 2191 } 2192 else 2193 #endif /* SUPPORT_UNICODE */ 2194 /* Not UTF mode */ 2195 { 2196 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) 2197 { ADD_NEW(state_offset + 2, 0); } 2198 } 2199 break; 2200 2201 2202 #ifdef SUPPORT_UNICODE 2203 /*-----------------------------------------------------------------*/ 2204 /* This is a tricky one because it can match more than one character. 2205 Find out how many characters to skip, and then set up a negative state 2206 to wait for them to pass before continuing. */ 2207 2208 case OP_EXTUNI: 2209 if (clen > 0) 2210 { 2211 int ncount = 0; 2212 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, 2213 end_subject, utf, &ncount); 2214 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0) 2215 reset_could_continue = TRUE; 2216 ADD_NEW_DATA(-(state_offset + 1), 0, ncount); 2217 } 2218 break; 2219 #endif 2220 2221 /*-----------------------------------------------------------------*/ 2222 /* This is a tricky like EXTUNI because it too can match more than one 2223 character (when CR is followed by LF). In this case, set up a negative 2224 state to wait for one character to pass before continuing. 
*/ 2225 2226 case OP_ANYNL: 2227 if (clen > 0) switch(c) 2228 { 2229 case CHAR_VT: 2230 case CHAR_FF: 2231 case CHAR_NEL: 2232 #ifndef EBCDIC 2233 case 0x2028: 2234 case 0x2029: 2235 #endif /* Not EBCDIC */ 2236 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break; 2237 /* Fall through */ 2238 2239 case CHAR_LF: 2240 ADD_NEW(state_offset + 1, 0); 2241 break; 2242 2243 case CHAR_CR: 2244 if (ptr + 1 >= end_subject) 2245 { 2246 ADD_NEW(state_offset + 1, 0); 2247 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0) 2248 reset_could_continue = TRUE; 2249 } 2250 else if (UCHAR21TEST(ptr + 1) == CHAR_LF) 2251 { 2252 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 2253 } 2254 else 2255 { 2256 ADD_NEW(state_offset + 1, 0); 2257 } 2258 break; 2259 } 2260 break; 2261 2262 /*-----------------------------------------------------------------*/ 2263 case OP_NOT_VSPACE: 2264 if (clen > 0) switch(c) 2265 { 2266 VSPACE_CASES: 2267 break; 2268 2269 default: 2270 ADD_NEW(state_offset + 1, 0); 2271 break; 2272 } 2273 break; 2274 2275 /*-----------------------------------------------------------------*/ 2276 case OP_VSPACE: 2277 if (clen > 0) switch(c) 2278 { 2279 VSPACE_CASES: 2280 ADD_NEW(state_offset + 1, 0); 2281 break; 2282 2283 default: 2284 break; 2285 } 2286 break; 2287 2288 /*-----------------------------------------------------------------*/ 2289 case OP_NOT_HSPACE: 2290 if (clen > 0) switch(c) 2291 { 2292 HSPACE_CASES: 2293 break; 2294 2295 default: 2296 ADD_NEW(state_offset + 1, 0); 2297 break; 2298 } 2299 break; 2300 2301 /*-----------------------------------------------------------------*/ 2302 case OP_HSPACE: 2303 if (clen > 0) switch(c) 2304 { 2305 HSPACE_CASES: 2306 ADD_NEW(state_offset + 1, 0); 2307 break; 2308 2309 default: 2310 break; 2311 } 2312 break; 2313 2314 /*-----------------------------------------------------------------*/ 2315 /* Match a negated single character casefully. 
*/ 2316 2317 case OP_NOT: 2318 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } 2319 break; 2320 2321 /*-----------------------------------------------------------------*/ 2322 /* Match a negated single character caselessly. */ 2323 2324 case OP_NOTI: 2325 if (clen > 0) 2326 { 2327 uint32_t otherd; 2328 #ifdef SUPPORT_UNICODE 2329 if (utf && d >= 128) 2330 otherd = UCD_OTHERCASE(d); 2331 else 2332 #endif /* SUPPORT_UNICODE */ 2333 otherd = TABLE_GET(d, fcc, d); 2334 if (c != d && c != otherd) 2335 { ADD_NEW(state_offset + dlen + 1, 0); } 2336 } 2337 break; 2338 2339 /*-----------------------------------------------------------------*/ 2340 case OP_PLUSI: 2341 case OP_MINPLUSI: 2342 case OP_POSPLUSI: 2343 case OP_NOTPLUSI: 2344 case OP_NOTMINPLUSI: 2345 case OP_NOTPOSPLUSI: 2346 caseless = TRUE; 2347 codevalue -= OP_STARI - OP_STAR; 2348 2349 /* Fall through */ 2350 case OP_PLUS: 2351 case OP_MINPLUS: 2352 case OP_POSPLUS: 2353 case OP_NOTPLUS: 2354 case OP_NOTMINPLUS: 2355 case OP_NOTPOSPLUS: 2356 count = current_state->count; /* Already matched */ 2357 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } 2358 if (clen > 0) 2359 { 2360 uint32_t otherd = NOTACHAR; 2361 if (caseless) 2362 { 2363 #ifdef SUPPORT_UNICODE 2364 if (utf && d >= 128) 2365 otherd = UCD_OTHERCASE(d); 2366 else 2367 #endif /* SUPPORT_UNICODE */ 2368 otherd = TABLE_GET(d, fcc, d); 2369 } 2370 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2371 { 2372 if (count > 0 && 2373 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) 2374 { 2375 active_count--; /* Remove non-match possibility */ 2376 next_active_state--; 2377 } 2378 count++; 2379 ADD_NEW(state_offset, count); 2380 } 2381 } 2382 break; 2383 2384 /*-----------------------------------------------------------------*/ 2385 case OP_QUERYI: 2386 case OP_MINQUERYI: 2387 case OP_POSQUERYI: 2388 case OP_NOTQUERYI: 2389 case OP_NOTMINQUERYI: 2390 case OP_NOTPOSQUERYI: 2391 caseless = TRUE; 2392 codevalue -= 
OP_STARI - OP_STAR; 2393 /* Fall through */ 2394 case OP_QUERY: 2395 case OP_MINQUERY: 2396 case OP_POSQUERY: 2397 case OP_NOTQUERY: 2398 case OP_NOTMINQUERY: 2399 case OP_NOTPOSQUERY: 2400 ADD_ACTIVE(state_offset + dlen + 1, 0); 2401 if (clen > 0) 2402 { 2403 uint32_t otherd = NOTACHAR; 2404 if (caseless) 2405 { 2406 #ifdef SUPPORT_UNICODE 2407 if (utf && d >= 128) 2408 otherd = UCD_OTHERCASE(d); 2409 else 2410 #endif /* SUPPORT_UNICODE */ 2411 otherd = TABLE_GET(d, fcc, d); 2412 } 2413 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2414 { 2415 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) 2416 { 2417 active_count--; /* Remove non-match possibility */ 2418 next_active_state--; 2419 } 2420 ADD_NEW(state_offset + dlen + 1, 0); 2421 } 2422 } 2423 break; 2424 2425 /*-----------------------------------------------------------------*/ 2426 case OP_STARI: 2427 case OP_MINSTARI: 2428 case OP_POSSTARI: 2429 case OP_NOTSTARI: 2430 case OP_NOTMINSTARI: 2431 case OP_NOTPOSSTARI: 2432 caseless = TRUE; 2433 codevalue -= OP_STARI - OP_STAR; 2434 /* Fall through */ 2435 case OP_STAR: 2436 case OP_MINSTAR: 2437 case OP_POSSTAR: 2438 case OP_NOTSTAR: 2439 case OP_NOTMINSTAR: 2440 case OP_NOTPOSSTAR: 2441 ADD_ACTIVE(state_offset + dlen + 1, 0); 2442 if (clen > 0) 2443 { 2444 uint32_t otherd = NOTACHAR; 2445 if (caseless) 2446 { 2447 #ifdef SUPPORT_UNICODE 2448 if (utf && d >= 128) 2449 otherd = UCD_OTHERCASE(d); 2450 else 2451 #endif /* SUPPORT_UNICODE */ 2452 otherd = TABLE_GET(d, fcc, d); 2453 } 2454 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2455 { 2456 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) 2457 { 2458 active_count--; /* Remove non-match possibility */ 2459 next_active_state--; 2460 } 2461 ADD_NEW(state_offset, 0); 2462 } 2463 } 2464 break; 2465 2466 /*-----------------------------------------------------------------*/ 2467 case OP_EXACTI: 2468 case OP_NOTEXACTI: 2469 caseless = TRUE; 2470 codevalue -= OP_STARI - 
OP_STAR; 2471 /* Fall through */ 2472 case OP_EXACT: 2473 case OP_NOTEXACT: 2474 count = current_state->count; /* Number already matched */ 2475 if (clen > 0) 2476 { 2477 uint32_t otherd = NOTACHAR; 2478 if (caseless) 2479 { 2480 #ifdef SUPPORT_UNICODE 2481 if (utf && d >= 128) 2482 otherd = UCD_OTHERCASE(d); 2483 else 2484 #endif /* SUPPORT_UNICODE */ 2485 otherd = TABLE_GET(d, fcc, d); 2486 } 2487 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2488 { 2489 if (++count >= (int)GET2(code, 1)) 2490 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2491 else 2492 { ADD_NEW(state_offset, count); } 2493 } 2494 } 2495 break; 2496 2497 /*-----------------------------------------------------------------*/ 2498 case OP_UPTOI: 2499 case OP_MINUPTOI: 2500 case OP_POSUPTOI: 2501 case OP_NOTUPTOI: 2502 case OP_NOTMINUPTOI: 2503 case OP_NOTPOSUPTOI: 2504 caseless = TRUE; 2505 codevalue -= OP_STARI - OP_STAR; 2506 /* Fall through */ 2507 case OP_UPTO: 2508 case OP_MINUPTO: 2509 case OP_POSUPTO: 2510 case OP_NOTUPTO: 2511 case OP_NOTMINUPTO: 2512 case OP_NOTPOSUPTO: 2513 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); 2514 count = current_state->count; /* Number already matched */ 2515 if (clen > 0) 2516 { 2517 uint32_t otherd = NOTACHAR; 2518 if (caseless) 2519 { 2520 #ifdef SUPPORT_UNICODE 2521 if (utf && d >= 128) 2522 otherd = UCD_OTHERCASE(d); 2523 else 2524 #endif /* SUPPORT_UNICODE */ 2525 otherd = TABLE_GET(d, fcc, d); 2526 } 2527 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2528 { 2529 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) 2530 { 2531 active_count--; /* Remove non-match possibility */ 2532 next_active_state--; 2533 } 2534 if (++count >= (int)GET2(code, 1)) 2535 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2536 else 2537 { ADD_NEW(state_offset, count); } 2538 } 2539 } 2540 break; 2541 2542 2543 /* ========================================================================== */ 2544 /* These are the class-handling 
opcodes */ 2545 2546 case OP_CLASS: 2547 case OP_NCLASS: 2548 case OP_XCLASS: 2549 { 2550 BOOL isinclass = FALSE; 2551 int next_state_offset; 2552 PCRE2_SPTR ecode; 2553 2554 /* For a simple class, there is always just a 32-byte table, and we 2555 can set isinclass from it. */ 2556 2557 if (codevalue != OP_XCLASS) 2558 { 2559 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); 2560 if (clen > 0) 2561 { 2562 isinclass = (c > 255)? (codevalue == OP_NCLASS) : 2563 ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0); 2564 } 2565 } 2566 2567 /* An extended class may have a table or a list of single characters, 2568 ranges, or both, and it may be positive or negative. There's a 2569 function that sorts all this out. */ 2570 2571 else 2572 { 2573 ecode = code + GET(code, 1); 2574 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); 2575 } 2576 2577 /* At this point, isinclass is set for all kinds of class, and ecode 2578 points to the byte after the end of the class. If there is a 2579 quantifier, this is where it will be. 
*/ 2580 2581 next_state_offset = (int)(ecode - start_code); 2582 2583 switch (*ecode) 2584 { 2585 case OP_CRSTAR: 2586 case OP_CRMINSTAR: 2587 case OP_CRPOSSTAR: 2588 ADD_ACTIVE(next_state_offset + 1, 0); 2589 if (isinclass) 2590 { 2591 if (*ecode == OP_CRPOSSTAR) 2592 { 2593 active_count--; /* Remove non-match possibility */ 2594 next_active_state--; 2595 } 2596 ADD_NEW(state_offset, 0); 2597 } 2598 break; 2599 2600 case OP_CRPLUS: 2601 case OP_CRMINPLUS: 2602 case OP_CRPOSPLUS: 2603 count = current_state->count; /* Already matched */ 2604 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } 2605 if (isinclass) 2606 { 2607 if (count > 0 && *ecode == OP_CRPOSPLUS) 2608 { 2609 active_count--; /* Remove non-match possibility */ 2610 next_active_state--; 2611 } 2612 count++; 2613 ADD_NEW(state_offset, count); 2614 } 2615 break; 2616 2617 case OP_CRQUERY: 2618 case OP_CRMINQUERY: 2619 case OP_CRPOSQUERY: 2620 ADD_ACTIVE(next_state_offset + 1, 0); 2621 if (isinclass) 2622 { 2623 if (*ecode == OP_CRPOSQUERY) 2624 { 2625 active_count--; /* Remove non-match possibility */ 2626 next_active_state--; 2627 } 2628 ADD_NEW(next_state_offset + 1, 0); 2629 } 2630 break; 2631 2632 case OP_CRRANGE: 2633 case OP_CRMINRANGE: 2634 case OP_CRPOSRANGE: 2635 count = current_state->count; /* Already matched */ 2636 if (count >= (int)GET2(ecode, 1)) 2637 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2638 if (isinclass) 2639 { 2640 int max = (int)GET2(ecode, 1 + IMM2_SIZE); 2641 2642 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1)) 2643 { 2644 active_count--; /* Remove non-match possibility */ 2645 next_active_state--; 2646 } 2647 2648 if (++count >= max && max != 0) /* Max 0 => no limit */ 2649 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2650 else 2651 { ADD_NEW(state_offset, count); } 2652 } 2653 break; 2654 2655 default: 2656 if (isinclass) { ADD_NEW(next_state_offset, 0); } 2657 break; 2658 } 2659 } 2660 break; 2661 2662 /* 
========================================================================== */ 2663 /* These are the opcodes for fancy brackets of various kinds. We have 2664 to use recursion in order to handle them. The "always failing" assertion 2665 (?!) is optimised to OP_FAIL when compiling, so we have to support that, 2666 though the other "backtracking verbs" are not supported. */ 2667 2668 case OP_FAIL: 2669 forced_fail++; /* Count FAILs for multiple states */ 2670 break; 2671 2672 case OP_ASSERT: 2673 case OP_ASSERT_NOT: 2674 case OP_ASSERTBACK: 2675 case OP_ASSERTBACK_NOT: 2676 { 2677 int rc; 2678 int *local_workspace; 2679 PCRE2_SIZE *local_offsets; 2680 PCRE2_SPTR endasscode = code + GET(code, 1); 2681 RWS_anchor *rws = (RWS_anchor *)RWS; 2682 2683 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) 2684 { 2685 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); 2686 if (rc != 0) return rc; 2687 RWS = (int *)rws; 2688 } 2689 2690 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); 2691 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; 2692 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; 2693 2694 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2695 2696 rc = internal_dfa_match( 2697 mb, /* static match data */ 2698 code, /* this subexpression's code */ 2699 ptr, /* where we currently are */ 2700 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2701 local_offsets, /* offset vector */ 2702 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ 2703 local_workspace, /* workspace vector */ 2704 RWS_RSIZE, /* size of same */ 2705 rlevel, /* function recursion level */ 2706 RWS); /* recursion workspace */ 2707 2708 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; 2709 2710 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; 2711 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) 2712 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2713 } 2714 break; 2715 2716 /*-----------------------------------------------------------------*/ 2717 
case OP_COND: 2718 case OP_SCOND: 2719 { 2720 int codelink = (int)GET(code, 1); 2721 PCRE2_UCHAR condcode; 2722 2723 /* Because of the way auto-callout works during compile, a callout item 2724 is inserted between OP_COND and an assertion condition. This does not 2725 happen for the other conditions. */ 2726 2727 if (code[LINK_SIZE + 1] == OP_CALLOUT 2728 || code[LINK_SIZE + 1] == OP_CALLOUT_STR) 2729 { 2730 PCRE2_SIZE callout_length; 2731 rrc = do_callout(code, offsets, current_subject, ptr, mb, 2732 1 + LINK_SIZE, &callout_length); 2733 if (rrc < 0) return rrc; /* Abandon */ 2734 if (rrc > 0) break; /* Fail this thread */ 2735 code += callout_length; /* Skip callout data */ 2736 } 2737 2738 condcode = code[LINK_SIZE+1]; 2739 2740 /* Back reference conditions and duplicate named recursion conditions 2741 are not supported */ 2742 2743 if (condcode == OP_CREF || condcode == OP_DNCREF || 2744 condcode == OP_DNRREF) 2745 return PCRE2_ERROR_DFA_UCOND; 2746 2747 /* The DEFINE condition is always false, and the assertion (?!) is 2748 converted to OP_FAIL. */ 2749 2750 if (condcode == OP_FALSE || condcode == OP_FAIL) 2751 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2752 2753 /* There is also an always-true condition */ 2754 2755 else if (condcode == OP_TRUE) 2756 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } 2757 2758 /* The only supported version of OP_RREF is for the value RREF_ANY, 2759 which means "test if in any recursion". We can't test for specifically 2760 recursed groups. 
*/ 2761 2762 else if (condcode == OP_RREF) 2763 { 2764 unsigned int value = GET2(code, LINK_SIZE + 2); 2765 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND; 2766 if (mb->recursive != NULL) 2767 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } 2768 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2769 } 2770 2771 /* Otherwise, the condition is an assertion */ 2772 2773 else 2774 { 2775 int rc; 2776 int *local_workspace; 2777 PCRE2_SIZE *local_offsets; 2778 PCRE2_SPTR asscode = code + LINK_SIZE + 1; 2779 PCRE2_SPTR endasscode = asscode + GET(asscode, 1); 2780 RWS_anchor *rws = (RWS_anchor *)RWS; 2781 2782 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) 2783 { 2784 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); 2785 if (rc != 0) return rc; 2786 RWS = (int *)rws; 2787 } 2788 2789 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); 2790 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; 2791 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; 2792 2793 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2794 2795 rc = internal_dfa_match( 2796 mb, /* fixed match data */ 2797 asscode, /* this subexpression's code */ 2798 ptr, /* where we currently are */ 2799 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2800 local_offsets, /* offset vector */ 2801 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ 2802 local_workspace, /* workspace vector */ 2803 RWS_RSIZE, /* size of same */ 2804 rlevel, /* function recursion level */ 2805 RWS); /* recursion workspace */ 2806 2807 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; 2808 2809 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; 2810 if ((rc >= 0) == 2811 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) 2812 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2813 else 2814 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2815 } 2816 } 2817 break; 2818 2819 /*-----------------------------------------------------------------*/ 2820 case OP_RECURSE: 2821 { 
2822 int rc; 2823 int *local_workspace; 2824 PCRE2_SIZE *local_offsets; 2825 RWS_anchor *rws = (RWS_anchor *)RWS; 2826 dfa_recursion_info *ri; 2827 PCRE2_SPTR callpat = start_code + GET(code, 1); 2828 uint32_t recno = (callpat == mb->start_code)? 0 : 2829 GET2(callpat, 1 + LINK_SIZE); 2830 2831 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE) 2832 { 2833 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb); 2834 if (rc != 0) return rc; 2835 RWS = (int *)rws; 2836 } 2837 2838 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); 2839 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE; 2840 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE; 2841 2842 /* Check for repeating a recursion without advancing the subject 2843 pointer. This should catch convoluted mutual recursions. (Some simple 2844 cases are caught at compile time.) */ 2845 2846 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec) 2847 if (recno == ri->group_num && ptr == ri->subject_position) 2848 return PCRE2_ERROR_RECURSELOOP; 2849 2850 /* Remember this recursion and where we started it so as to 2851 catch infinite loops. 
*/ 2852 2853 new_recursive.group_num = recno; 2854 new_recursive.subject_position = ptr; 2855 new_recursive.prevrec = mb->recursive; 2856 mb->recursive = &new_recursive; 2857 2858 rc = internal_dfa_match( 2859 mb, /* fixed match data */ 2860 callpat, /* this subexpression's code */ 2861 ptr, /* where we currently are */ 2862 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2863 local_offsets, /* offset vector */ 2864 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */ 2865 local_workspace, /* workspace vector */ 2866 RWS_RSIZE, /* size of same */ 2867 rlevel, /* function recursion level */ 2868 RWS); /* recursion workspace */ 2869 2870 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE; 2871 mb->recursive = new_recursive.prevrec; /* Done this recursion */ 2872 2873 /* Ran out of internal offsets */ 2874 2875 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE; 2876 2877 /* For each successful matched substring, set up the next state with a 2878 count of characters to skip before trying it. Note that the count is in 2879 characters, not bytes. 
*/ 2880 2881 if (rc > 0) 2882 { 2883 for (rc = rc*2 - 2; rc >= 0; rc -= 2) 2884 { 2885 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc]; 2886 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 2887 if (utf) 2888 { 2889 PCRE2_SPTR p = start_subject + local_offsets[rc]; 2890 PCRE2_SPTR pp = start_subject + local_offsets[rc+1]; 2891 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; 2892 } 2893 #endif 2894 if (charcount > 0) 2895 { 2896 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, 2897 (int)(charcount - 1)); 2898 } 2899 else 2900 { 2901 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); 2902 } 2903 } 2904 } 2905 else if (rc != PCRE2_ERROR_NOMATCH) return rc; 2906 } 2907 break; 2908 2909 /*-----------------------------------------------------------------*/ 2910 case OP_BRAPOS: 2911 case OP_SBRAPOS: 2912 case OP_CBRAPOS: 2913 case OP_SCBRAPOS: 2914 case OP_BRAPOSZERO: 2915 { 2916 int rc; 2917 int *local_workspace; 2918 PCRE2_SIZE *local_offsets; 2919 PCRE2_SIZE charcount, matched_count; 2920 PCRE2_SPTR local_ptr = ptr; 2921 RWS_anchor *rws = (RWS_anchor *)RWS; 2922 BOOL allow_zero; 2923 2924 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) 2925 { 2926 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); 2927 if (rc != 0) return rc; 2928 RWS = (int *)rws; 2929 } 2930 2931 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); 2932 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; 2933 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; 2934 2935 if (codevalue == OP_BRAPOSZERO) 2936 { 2937 allow_zero = TRUE; 2938 codevalue = *(++code); /* Codevalue will be one of above BRAs */ 2939 } 2940 else allow_zero = FALSE; 2941 2942 /* Loop to match the subpattern as many times as possible as if it were 2943 a complete pattern. 
*/ 2944 2945 for (matched_count = 0;; matched_count++) 2946 { 2947 rc = internal_dfa_match( 2948 mb, /* fixed match data */ 2949 code, /* this subexpression's code */ 2950 local_ptr, /* where we currently are */ 2951 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 2952 local_offsets, /* offset vector */ 2953 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ 2954 local_workspace, /* workspace vector */ 2955 RWS_RSIZE, /* size of same */ 2956 rlevel, /* function recursion level */ 2957 RWS); /* recursion workspace */ 2958 2959 /* Failed to match */ 2960 2961 if (rc < 0) 2962 { 2963 if (rc != PCRE2_ERROR_NOMATCH) return rc; 2964 break; 2965 } 2966 2967 /* Matched: break the loop if zero characters matched. */ 2968 2969 charcount = local_offsets[1] - local_offsets[0]; 2970 if (charcount == 0) break; 2971 local_ptr += charcount; /* Advance temporary position ptr */ 2972 } 2973 2974 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; 2975 2976 /* At this point we have matched the subpattern matched_count 2977 times, and local_ptr is pointing to the character after the end of the 2978 last match. */ 2979 2980 if (matched_count > 0 || allow_zero) 2981 { 2982 PCRE2_SPTR end_subpattern = code; 2983 int next_state_offset; 2984 2985 do { end_subpattern += GET(end_subpattern, 1); } 2986 while (*end_subpattern == OP_ALT); 2987 next_state_offset = 2988 (int)(end_subpattern - start_code + LINK_SIZE + 1); 2989 2990 /* Optimization: if there are no more active states, and there 2991 are no new states yet set up, then skip over the subject string 2992 right here, to save looping. Otherwise, set up the new state to swing 2993 into action when the end of the matched substring is reached. 
*/ 2994 2995 if (i + 1 >= active_count && new_count == 0) 2996 { 2997 ptr = local_ptr; 2998 clen = 0; 2999 ADD_NEW(next_state_offset, 0); 3000 } 3001 else 3002 { 3003 PCRE2_SPTR p = ptr; 3004 PCRE2_SPTR pp = local_ptr; 3005 charcount = (PCRE2_SIZE)(pp - p); 3006 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 3007 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; 3008 #endif 3009 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); 3010 } 3011 } 3012 } 3013 break; 3014 3015 /*-----------------------------------------------------------------*/ 3016 case OP_ONCE: 3017 { 3018 int rc; 3019 int *local_workspace; 3020 PCRE2_SIZE *local_offsets; 3021 RWS_anchor *rws = (RWS_anchor *)RWS; 3022 3023 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE) 3024 { 3025 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb); 3026 if (rc != 0) return rc; 3027 RWS = (int *)rws; 3028 } 3029 3030 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free); 3031 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE; 3032 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE; 3033 3034 rc = internal_dfa_match( 3035 mb, /* fixed match data */ 3036 code, /* this subexpression's code */ 3037 ptr, /* where we currently are */ 3038 (PCRE2_SIZE)(ptr - start_subject), /* start offset */ 3039 local_offsets, /* offset vector */ 3040 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */ 3041 local_workspace, /* workspace vector */ 3042 RWS_RSIZE, /* size of same */ 3043 rlevel, /* function recursion level */ 3044 RWS); /* recursion workspace */ 3045 3046 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE; 3047 3048 if (rc >= 0) 3049 { 3050 PCRE2_SPTR end_subpattern = code; 3051 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0]; 3052 int next_state_offset, repeat_state_offset; 3053 3054 do { end_subpattern += GET(end_subpattern, 1); } 3055 while (*end_subpattern == OP_ALT); 3056 next_state_offset = 3057 (int)(end_subpattern - start_code + LINK_SIZE + 1); 3058 3059 /* If the end of this subpattern is KETRMAX or 
KETRMIN, we must 3060 arrange for the repeat state also to be added to the relevant list. 3061 Calculate the offset, or set -1 for no repeat. */ 3062 3063 repeat_state_offset = (*end_subpattern == OP_KETRMAX || 3064 *end_subpattern == OP_KETRMIN)? 3065 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; 3066 3067 /* If we have matched an empty string, add the next state at the 3068 current character pointer. This is important so that the duplicate 3069 checking kicks in, which is what breaks infinite loops that match an 3070 empty string. */ 3071 3072 if (charcount == 0) 3073 { 3074 ADD_ACTIVE(next_state_offset, 0); 3075 } 3076 3077 /* Optimization: if there are no more active states, and there 3078 are no new states yet set up, then skip over the subject string 3079 right here, to save looping. Otherwise, set up the new state to swing 3080 into action when the end of the matched substring is reached. */ 3081 3082 else if (i + 1 >= active_count && new_count == 0) 3083 { 3084 ptr += charcount; 3085 clen = 0; 3086 ADD_NEW(next_state_offset, 0); 3087 3088 /* If we are adding a repeat state at the new character position, 3089 we must fudge things so that it is the only current state. 3090 Otherwise, it might be a duplicate of one we processed before, and 3091 that would cause it to be skipped. 
*/ 3092 3093 if (repeat_state_offset >= 0) 3094 { 3095 next_active_state = active_states; 3096 active_count = 0; 3097 i = -1; 3098 ADD_ACTIVE(repeat_state_offset, 0); 3099 } 3100 } 3101 else 3102 { 3103 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 3104 if (utf) 3105 { 3106 PCRE2_SPTR p = start_subject + local_offsets[0]; 3107 PCRE2_SPTR pp = start_subject + local_offsets[1]; 3108 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; 3109 } 3110 #endif 3111 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); 3112 if (repeat_state_offset >= 0) 3113 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); } 3114 } 3115 } 3116 else if (rc != PCRE2_ERROR_NOMATCH) return rc; 3117 } 3118 break; 3119 3120 3121 /* ========================================================================== */ 3122 /* Handle callouts */ 3123 3124 case OP_CALLOUT: 3125 case OP_CALLOUT_STR: 3126 { 3127 PCRE2_SIZE callout_length; 3128 rrc = do_callout(code, offsets, current_subject, ptr, mb, 0, 3129 &callout_length); 3130 if (rrc < 0) return rrc; /* Abandon */ 3131 if (rrc == 0) 3132 { ADD_ACTIVE(state_offset + (int)callout_length, 0); } 3133 } 3134 break; 3135 3136 3137 /* ========================================================================== */ 3138 default: /* Unsupported opcode */ 3139 return PCRE2_ERROR_DFA_UITEM; 3140 } 3141 3142 NEXT_ACTIVE_STATE: continue; 3143 3144 } /* End of loop scanning active states */ 3145 3146 /* We have finished the processing at the current subject character. If no 3147 new states have been set for the next character, we have found all the 3148 matches that we are going to find. If we are at the top level and partial 3149 matching has been requested, check for appropriate conditions. 3150 3151 The "forced_ fail" variable counts the number of (*F) encountered for the 3152 character. If it is equal to the original active_count (saved in 3153 workspace[1]) it means that (*F) was found on every active state. 
In this 3154 case we don't want to give a partial match. 3155 3156 The "could_continue" variable is true if a state could have continued but 3157 for the fact that the end of the subject was reached. */ 3158 3159 if (new_count <= 0) 3160 { 3161 if (rlevel == 1 && /* Top level, and */ 3162 could_continue && /* Some could go on, and */ 3163 forced_fail != workspace[1] && /* Not all forced fail & */ 3164 ( /* either... */ 3165 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */ 3166 || /* or... */ 3167 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */ 3168 match_count < 0) /* no matches */ 3169 ) && /* And... */ 3170 ( 3171 partial_newline || /* Either partial NL */ 3172 ( /* or ... */ 3173 ptr >= end_subject && /* End of subject and */ 3174 ptr > mb->start_used_ptr) /* Inspected non-empty string */ 3175 ) 3176 ) 3177 match_count = PCRE2_ERROR_PARTIAL; 3178 break; /* Exit from loop along the subject string */ 3179 } 3180 3181 /* One or more states are active for the next character. */ 3182 3183 ptr += clen; /* Advance to next subject character */ 3184 } /* Loop to move along the subject string */ 3185 3186 /* Control gets here from "break" a few lines above. If we have a match and 3187 PCRE2_ENDANCHORED is set, the match fails. */ 3188 3189 if (match_count >= 0 && 3190 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 && 3191 ptr < end_subject) 3192 match_count = PCRE2_ERROR_NOMATCH; 3193 3194 return match_count; 3195 } 3196 3197 3198 3199 /************************************************* 3200 * Match a pattern using the DFA algorithm * 3201 *************************************************/ 3202 3203 /* This function matches a compiled pattern to a subject string, using the 3204 alternate matching algorithm that finds all matches at once. 
3205 3206 Arguments: 3207 code points to the compiled pattern 3208 subject subject string 3209 length length of subject string 3210 start_offset where to start matching in the subject 3211 options option bits 3212 match_data points to a match data structure 3213 mcontext points to a match context 3214 workspace pointer to workspace 3215 wscount size of workspace 3216 3217 Returns: > 0 => number of match offset pairs placed in offsets 3218 = 0 => offsets overflowed; longest matches are present 3219 -1 => failed to match 3220 < -1 => some kind of unexpected problem 3221 */ 3222 3223 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 3224 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, 3225 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, 3226 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount) 3227 { 3228 int rc; 3229 const pcre2_real_code *re = (const pcre2_real_code *)code; 3230 3231 PCRE2_SPTR start_match; 3232 PCRE2_SPTR end_subject; 3233 PCRE2_SPTR bumpalong_limit; 3234 PCRE2_SPTR req_cu_ptr; 3235 3236 BOOL utf, anchored, startline, firstline; 3237 BOOL has_first_cu = FALSE; 3238 BOOL has_req_cu = FALSE; 3239 3240 PCRE2_UCHAR first_cu = 0; 3241 PCRE2_UCHAR first_cu2 = 0; 3242 PCRE2_UCHAR req_cu = 0; 3243 PCRE2_UCHAR req_cu2 = 0; 3244 3245 const uint8_t *start_bits = NULL; 3246 3247 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro 3248 is used below, and it expects NLBLOCK to be defined as a pointer. */ 3249 3250 pcre2_callout_block cb; 3251 dfa_match_block actual_match_block; 3252 dfa_match_block *mb = &actual_match_block; 3253 3254 /* Set up a starting block of memory for use during recursive calls to 3255 internal_dfa_match(). By putting this on the stack, it minimizes resource use 3256 in the case when it is not needed. If this is too small, more memory is 3257 obtained from the heap. 
At the start of each block is an anchor structure.*/ 3258 3259 int base_recursion_workspace[RWS_BASE_SIZE]; 3260 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace; 3261 rws->next = NULL; 3262 rws->size = RWS_BASE_SIZE; 3263 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE; 3264 3265 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated 3266 subject string. */ 3267 3268 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); 3269 3270 /* Plausibility checks */ 3271 3272 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; 3273 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL) 3274 return PCRE2_ERROR_NULL; 3275 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE; 3276 if (start_offset > length) return PCRE2_ERROR_BADOFFSET; 3277 3278 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same 3279 time. */ 3280 3281 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 && 3282 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) 3283 return PCRE2_ERROR_BADOPTION; 3284 3285 /* Check that the first field in the block is the magic number. If it is not, 3286 return with PCRE2_ERROR_BADMAGIC. */ 3287 3288 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; 3289 3290 /* Check the code unit width. */ 3291 3292 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) 3293 return PCRE2_ERROR_BADMODE; 3294 3295 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the 3296 options variable for this function. Users of PCRE2 who are not calling the 3297 function directly would like to have a way of setting these flags, in the same 3298 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with 3299 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and 3300 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be 3301 transferred to the options for this function. 
The bits are guaranteed to be 3302 adjacent, but do not have the same values. This bit of Boolean trickery assumes 3303 that the match-time bits are not more significant than the flag bits. If by 3304 accident this is not the case, a compile-time division by zero error will 3305 occur. */ 3306 3307 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) 3308 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) 3309 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); 3310 #undef FF 3311 #undef OO 3312 3313 /* If restarting after a partial match, do some sanity checks on the contents 3314 of the workspace. */ 3315 3316 if ((options & PCRE2_DFA_RESTART) != 0) 3317 { 3318 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 || 3319 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK)) 3320 return PCRE2_ERROR_DFA_BADRESTART; 3321 } 3322 3323 /* Set some local values */ 3324 3325 utf = (re->overall_options & PCRE2_UTF) != 0; 3326 start_match = subject + start_offset; 3327 end_subject = subject + length; 3328 req_cu_ptr = start_match - 1; 3329 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 || 3330 (re->overall_options & PCRE2_ANCHORED) != 0; 3331 3332 /* The "must be at the start of a line" flags are used in a loop when finding 3333 where to start. */ 3334 3335 startline = (re->flags & PCRE2_STARTLINE) != 0; 3336 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; 3337 bumpalong_limit = end_subject; 3338 3339 /* Initialize and set up the fixed fields in the callout block, with a pointer 3340 in the match block. */ 3341 3342 mb->cb = &cb; 3343 cb.version = 2; 3344 cb.subject = subject; 3345 cb.subject_length = (PCRE2_SIZE)(end_subject - subject); 3346 cb.callout_flags = 0; 3347 cb.capture_top = 1; /* No capture support */ 3348 cb.capture_last = 0; 3349 cb.mark = NULL; /* No (*MARK) support */ 3350 3351 /* Get data from the match context, if present, and fill in the remaining 3352 fields in the match block. 
It is an error to set an offset limit without 3353 setting the flag at compile time. */ 3354 3355 if (mcontext == NULL) 3356 { 3357 mb->callout = NULL; 3358 mb->memctl = re->memctl; 3359 mb->match_limit = PRIV(default_match_context).match_limit; 3360 mb->match_limit_depth = PRIV(default_match_context).depth_limit; 3361 mb->heap_limit = PRIV(default_match_context).heap_limit; 3362 } 3363 else 3364 { 3365 if (mcontext->offset_limit != PCRE2_UNSET) 3366 { 3367 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) 3368 return PCRE2_ERROR_BADOFFSETLIMIT; 3369 bumpalong_limit = subject + mcontext->offset_limit; 3370 } 3371 mb->callout = mcontext->callout; 3372 mb->callout_data = mcontext->callout_data; 3373 mb->memctl = mcontext->memctl; 3374 mb->match_limit = mcontext->match_limit; 3375 mb->match_limit_depth = mcontext->depth_limit; 3376 mb->heap_limit = mcontext->heap_limit; 3377 } 3378 3379 if (mb->match_limit > re->limit_match) 3380 mb->match_limit = re->limit_match; 3381 3382 if (mb->match_limit_depth > re->limit_depth) 3383 mb->match_limit_depth = re->limit_depth; 3384 3385 if (mb->heap_limit > re->limit_heap) 3386 mb->heap_limit = re->limit_heap; 3387 3388 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + 3389 re->name_count * re->name_entry_size; 3390 mb->tables = re->tables; 3391 mb->start_subject = subject; 3392 mb->end_subject = end_subject; 3393 mb->start_offset = start_offset; 3394 mb->moptions = options; 3395 mb->poptions = re->overall_options; 3396 mb->match_call_count = 0; 3397 mb->heap_used = 0; 3398 3399 /* Process the \R and newline settings. 
*/ 3400 3401 mb->bsr_convention = re->bsr_convention; 3402 mb->nltype = NLTYPE_FIXED; 3403 switch(re->newline_convention) 3404 { 3405 case PCRE2_NEWLINE_CR: 3406 mb->nllen = 1; 3407 mb->nl[0] = CHAR_CR; 3408 break; 3409 3410 case PCRE2_NEWLINE_LF: 3411 mb->nllen = 1; 3412 mb->nl[0] = CHAR_NL; 3413 break; 3414 3415 case PCRE2_NEWLINE_NUL: 3416 mb->nllen = 1; 3417 mb->nl[0] = CHAR_NUL; 3418 break; 3419 3420 case PCRE2_NEWLINE_CRLF: 3421 mb->nllen = 2; 3422 mb->nl[0] = CHAR_CR; 3423 mb->nl[1] = CHAR_NL; 3424 break; 3425 3426 case PCRE2_NEWLINE_ANY: 3427 mb->nltype = NLTYPE_ANY; 3428 break; 3429 3430 case PCRE2_NEWLINE_ANYCRLF: 3431 mb->nltype = NLTYPE_ANYCRLF; 3432 break; 3433 3434 default: return PCRE2_ERROR_INTERNAL; 3435 } 3436 3437 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, 3438 we must also check that a starting offset does not point into the middle of a 3439 multiunit character. We check only the portion of the subject that is going to 3440 be inspected during matching - from the offset minus the maximum back reference 3441 to the given length. This saves time when a small part of a large subject is 3442 being matched by the use of a starting offset. Note that the maximum lookbehind 3443 is a number of characters, not code units. 
*/ 3444 3445 #ifdef SUPPORT_UNICODE 3446 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) 3447 { 3448 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ 3449 3450 if (start_offset > 0) 3451 { 3452 #if PCRE2_CODE_UNIT_WIDTH != 32 3453 unsigned int i; 3454 if (start_match < end_subject && NOT_FIRSTCU(*start_match)) 3455 return PCRE2_ERROR_BADUTFOFFSET; 3456 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) 3457 { 3458 check_subject--; 3459 while (check_subject > subject && 3460 #if PCRE2_CODE_UNIT_WIDTH == 8 3461 (*check_subject & 0xc0) == 0x80) 3462 #else /* 16-bit */ 3463 (*check_subject & 0xfc00) == 0xdc00) 3464 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ 3465 check_subject--; 3466 } 3467 #else /* In the 32-bit library, one code unit equals one character. */ 3468 check_subject -= re->max_lookbehind; 3469 if (check_subject < subject) check_subject = subject; 3470 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ 3471 } 3472 3473 /* Validate the relevant portion of the subject. After an error, adjust the 3474 offset to be an absolute offset in the whole string. */ 3475 3476 match_data->rc = PRIV(valid_utf)(check_subject, 3477 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar)); 3478 if (match_data->rc != 0) 3479 { 3480 match_data->startchar += (PCRE2_SIZE)(check_subject - subject); 3481 return match_data->rc; 3482 } 3483 } 3484 #endif /* SUPPORT_UNICODE */ 3485 3486 /* Set up the first code unit to match, if available. If there's no first code 3487 unit there may be a bitmap of possible first characters. 
*/ 3488 3489 if ((re->flags & PCRE2_FIRSTSET) != 0) 3490 { 3491 has_first_cu = TRUE; 3492 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); 3493 if ((re->flags & PCRE2_FIRSTCASELESS) != 0) 3494 { 3495 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); 3496 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 3497 if (utf && first_cu > 127) 3498 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); 3499 #endif 3500 } 3501 } 3502 else 3503 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) 3504 start_bits = re->start_bitmap; 3505 3506 /* There may be a "last known required code unit" set. */ 3507 3508 if ((re->flags & PCRE2_LASTSET) != 0) 3509 { 3510 has_req_cu = TRUE; 3511 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); 3512 if ((re->flags & PCRE2_LASTCASELESS) != 0) 3513 { 3514 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); 3515 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 3516 if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); 3517 #endif 3518 } 3519 } 3520 3521 /* Fill in fields that are always returned in the match data. */ 3522 3523 match_data->code = re; 3524 match_data->subject = subject; 3525 match_data->mark = NULL; 3526 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER; 3527 3528 /* Call the main matching function, looping for a non-anchored regex after a 3529 failed match. If not restarting, perform certain optimizations at the start of 3530 a match. */ 3531 3532 for (;;) 3533 { 3534 /* ----------------- Start of match optimizations ---------------- */ 3535 3536 /* There are some optimizations that avoid running the match if a known 3537 starting point is not found, or if a known later code unit is not present. 3538 However, there is an option (settable at compile time) that disables 3539 these, for testing and for ensuring that all callouts do actually occur. 3540 The optimizations must also be avoided when restarting a DFA match. 
*/ 3541 3542 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && 3543 (options & PCRE2_DFA_RESTART) == 0) 3544 { 3545 /* If firstline is TRUE, the start of the match is constrained to the first 3546 line of a multiline string. That is, the match must be before or at the 3547 first newline following the start of matching. Temporarily adjust 3548 end_subject so that we stop the optimization scans for a first code unit 3549 immediately after the first character of a newline (the first code unit can 3550 legitimately be a newline). If the match fails at the newline, later code 3551 breaks this loop. */ 3552 3553 if (firstline) 3554 { 3555 PCRE2_SPTR t = start_match; 3556 #ifdef SUPPORT_UNICODE 3557 if (utf) 3558 { 3559 while (t < end_subject && !IS_NEWLINE(t)) 3560 { 3561 t++; 3562 ACROSSCHAR(t < end_subject, t, t++); 3563 } 3564 } 3565 else 3566 #endif 3567 while (t < end_subject && !IS_NEWLINE(t)) t++; 3568 end_subject = t; 3569 } 3570 3571 /* Anchored: check the first code unit if one is recorded. This may seem 3572 pointless but it can help in detecting a no match case without scanning for 3573 the required code unit. */ 3574 3575 if (anchored) 3576 { 3577 if (has_first_cu || start_bits != NULL) 3578 { 3579 BOOL ok = start_match < end_subject; 3580 if (ok) 3581 { 3582 PCRE2_UCHAR c = UCHAR21TEST(start_match); 3583 ok = has_first_cu && (c == first_cu || c == first_cu2); 3584 if (!ok && start_bits != NULL) 3585 { 3586 #if PCRE2_CODE_UNIT_WIDTH != 8 3587 if (c > 255) c = 255; 3588 #endif 3589 ok = (start_bits[c/8] & (1 << (c&7))) != 0; 3590 } 3591 } 3592 if (!ok) break; 3593 } 3594 } 3595 3596 /* Not anchored. Advance to a unique first code unit if there is one. In 3597 8-bit mode, the use of memchr() gives a big speed up, even though we have 3598 to call it twice in caseless mode, in order to find the earliest occurrence 3599 of the character in either of its cases. 
*/ 3600 3601 else 3602 { 3603 if (has_first_cu) 3604 { 3605 if (first_cu != first_cu2) /* Caseless */ 3606 { 3607 #if PCRE2_CODE_UNIT_WIDTH != 8 3608 PCRE2_UCHAR smc; 3609 while (start_match < end_subject && 3610 (smc = UCHAR21TEST(start_match)) != first_cu && 3611 smc != first_cu2) 3612 start_match++; 3613 #else /* 8-bit code units */ 3614 PCRE2_SPTR pp1 = 3615 memchr(start_match, first_cu, end_subject-start_match); 3616 PCRE2_SPTR pp2 = 3617 memchr(start_match, first_cu2, end_subject-start_match); 3618 if (pp1 == NULL) 3619 start_match = (pp2 == NULL)? end_subject : pp2; 3620 else 3621 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; 3622 #endif 3623 } 3624 3625 /* The caseful case */ 3626 3627 else 3628 { 3629 #if PCRE2_CODE_UNIT_WIDTH != 8 3630 while (start_match < end_subject && UCHAR21TEST(start_match) != 3631 first_cu) 3632 start_match++; 3633 #else 3634 start_match = memchr(start_match, first_cu, end_subject - start_match); 3635 if (start_match == NULL) start_match = end_subject; 3636 #endif 3637 } 3638 3639 /* If we can't find the required code unit, having reached the true end 3640 of the subject, break the bumpalong loop, to force a match failure, 3641 except when doing partial matching, when we let the next cycle run at 3642 the end of the subject. To see why, consider the pattern /(?<=abc)def/, 3643 which partially matches "abc", even though the string does not contain 3644 the starting character "d". If we have not reached the true end of the 3645 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) 3646 we also let the cycle run, because the matching string is legitimately 3647 allowed to start with the first code unit of a newline. */ 3648 3649 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && 3650 start_match >= mb->end_subject) 3651 break; 3652 } 3653 3654 /* If there's no first code unit, advance to just after a linebreak for a 3655 multiline match if required. 
*/ 3656 3657 else if (startline) 3658 { 3659 if (start_match > mb->start_subject + start_offset) 3660 { 3661 #ifdef SUPPORT_UNICODE 3662 if (utf) 3663 { 3664 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 3665 { 3666 start_match++; 3667 ACROSSCHAR(start_match < end_subject, start_match, start_match++); 3668 } 3669 } 3670 else 3671 #endif 3672 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 3673 start_match++; 3674 3675 /* If we have just passed a CR and the newline option is ANY or 3676 ANYCRLF, and we are now at a LF, advance the match position by one 3677 more code unit. */ 3678 3679 if (start_match[-1] == CHAR_CR && 3680 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && 3681 start_match < end_subject && 3682 UCHAR21TEST(start_match) == CHAR_NL) 3683 start_match++; 3684 } 3685 } 3686 3687 /* If there's no first code unit or a requirement for a multiline line 3688 start, advance to a non-unique first code unit if any have been 3689 identified. The bitmap contains only 256 bits. When code units are 16 or 3690 32 bits wide, all code units greater than 254 set the 255 bit. */ 3691 3692 else if (start_bits != NULL) 3693 { 3694 while (start_match < end_subject) 3695 { 3696 uint32_t c = UCHAR21TEST(start_match); 3697 #if PCRE2_CODE_UNIT_WIDTH != 8 3698 if (c > 255) c = 255; 3699 #endif 3700 if ((start_bits[c/8] & (1 << (c&7))) != 0) break; 3701 start_match++; 3702 } 3703 3704 /* See comment above in first_cu checking about the next line. */ 3705 3706 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && 3707 start_match >= mb->end_subject) 3708 break; 3709 } 3710 } /* End of first code unit handling */ 3711 3712 /* Restore fudged end_subject */ 3713 3714 end_subject = mb->end_subject; 3715 3716 /* The following two optimizations are disabled for partial matching. 
*/ 3717 3718 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0) 3719 { 3720 /* The minimum matching length is a lower bound; no actual string of that 3721 length may actually match the pattern. Although the value is, strictly, 3722 in characters, we treat it as code units to avoid spending too much time 3723 in this optimization. */ 3724 3725 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT; 3726 3727 /* If req_cu is set, we know that that code unit must appear in the 3728 subject for the match to succeed. If the first code unit is set, req_cu 3729 must be later in the subject; otherwise the test starts at the match 3730 point. This optimization can save a huge amount of backtracking in 3731 patterns with nested unlimited repeats that aren't going to match. 3732 Writing separate code for cased/caseless versions makes it go faster, as 3733 does using an autoincrement and backing off on a match. 3734 3735 HOWEVER: when the subject string is very, very long, searching to its end 3736 can take a long time, and give bad performance on quite ordinary 3737 patterns. This showed up when somebody was matching something like 3738 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is 3739 sufficiently long. */ 3740 3741 if (has_req_cu && end_subject - start_match < REQ_CU_MAX) 3742 { 3743 PCRE2_SPTR p = start_match + (has_first_cu? 1:0); 3744 3745 /* We don't need to repeat the search if we haven't yet reached the 3746 place we found it at last time. */ 3747 3748 if (p > req_cu_ptr) 3749 { 3750 if (req_cu != req_cu2) 3751 { 3752 while (p < end_subject) 3753 { 3754 uint32_t pp = UCHAR21INCTEST(p); 3755 if (pp == req_cu || pp == req_cu2) { p--; break; } 3756 } 3757 } 3758 else 3759 { 3760 while (p < end_subject) 3761 { 3762 if (UCHAR21INCTEST(p) == req_cu) { p--; break; } 3763 } 3764 } 3765 3766 /* If we can't find the required code unit, break the matching loop, 3767 forcing a match failure. 
*/ 3768 3769 if (p >= end_subject) break; 3770 3771 /* If we have found the required code unit, save the point where we 3772 found it, so that we don't search again next time round the loop if 3773 the start hasn't passed this code unit yet. */ 3774 3775 req_cu_ptr = p; 3776 } 3777 } 3778 } 3779 } 3780 3781 /* ------------ End of start of match optimizations ------------ */ 3782 3783 /* Give no match if we have passed the bumpalong limit. */ 3784 3785 if (start_match > bumpalong_limit) break; 3786 3787 /* OK, now we can do the business */ 3788 3789 mb->start_used_ptr = start_match; 3790 mb->last_used_ptr = start_match; 3791 mb->recursive = NULL; 3792 3793 rc = internal_dfa_match( 3794 mb, /* fixed match data */ 3795 mb->start_code, /* this subexpression's code */ 3796 start_match, /* where we currently are */ 3797 start_offset, /* start offset in subject */ 3798 match_data->ovector, /* offset vector */ 3799 (uint32_t)match_data->oveccount * 2, /* actual size of same */ 3800 workspace, /* workspace vector */ 3801 (int)wscount, /* size of same */ 3802 0, /* function recurse level */ 3803 base_recursion_workspace); /* initial workspace for recursion */ 3804 3805 /* Anything other than "no match" means we are done, always; otherwise, carry 3806 on only if not anchored. */ 3807 3808 if (rc != PCRE2_ERROR_NOMATCH || anchored) 3809 { 3810 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0) 3811 { 3812 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject); 3813 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject); 3814 } 3815 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject); 3816 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject); 3817 match_data->startchar = (PCRE2_SIZE)(start_match - subject); 3818 match_data->rc = rc; 3819 goto EXIT; 3820 } 3821 3822 /* Advance to the next subject character unless we are at the end of a line 3823 and firstline is set. 
*/ 3824 3825 if (firstline && IS_NEWLINE(start_match)) break; 3826 start_match++; 3827 #ifdef SUPPORT_UNICODE 3828 if (utf) 3829 { 3830 ACROSSCHAR(start_match < end_subject, start_match, start_match++); 3831 } 3832 #endif 3833 if (start_match > end_subject) break; 3834 3835 /* If we have just passed a CR and we are now at a LF, and the pattern does 3836 not contain any explicit matches for \r or \n, and the newline option is CRLF 3837 or ANY or ANYCRLF, advance the match position by one more character. */ 3838 3839 if (UCHAR21TEST(start_match - 1) == CHAR_CR && 3840 start_match < end_subject && 3841 UCHAR21TEST(start_match) == CHAR_NL && 3842 (re->flags & PCRE2_HASCRORLF) == 0 && 3843 (mb->nltype == NLTYPE_ANY || 3844 mb->nltype == NLTYPE_ANYCRLF || 3845 mb->nllen == 2)) 3846 start_match++; 3847 3848 } /* "Bumpalong" loop */ 3849 3850 NOMATCH_EXIT: 3851 rc = PCRE2_ERROR_NOMATCH; 3852 3853 EXIT: 3854 while (rws->next != NULL) 3855 { 3856 RWS_anchor *next = rws->next; 3857 rws->next = next->next; 3858 mb->memctl.free(next, mb->memctl.memory_data); 3859 } 3860 3861 return rc; 3862 } 3863 3864 /* End of pcre2_dfa_match.c */ 3865