1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016-2018 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 42 #ifdef HAVE_CONFIG_H 43 #include "config.h" 44 #endif 45 46 #define NLBLOCK cb /* Block containing newline information */ 47 #define PSSTART start_pattern /* Field containing processed string start */ 48 #define PSEND end_pattern /* Field containing processed string end */ 49 50 #include "pcre2_internal.h" 51 52 /* In rare error cases debugging might require calling pcre2_printint(). */ 53 54 #if 0 55 #ifdef EBCDIC 56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255) 57 #else 58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127) 59 #endif 60 #include "pcre2_printint.c" 61 #define DEBUG_CALL_PRINTINT 62 #endif 63 64 /* Other debugging code can be enabled by these defines. */ 65 66 /* #define DEBUG_SHOW_CAPTURES */ 67 /* #define DEBUG_SHOW_PARSED */ 68 69 /* There are a few things that vary with different code unit sizes. Handle them 70 by defining macros in order to minimize #if usage. */ 71 72 #if PCRE2_CODE_UNIT_WIDTH == 8 73 #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5 74 #define XDIGIT(c) xdigitab[c] 75 76 #else /* Either 16-bit or 32-bit */ 77 #define XDIGIT(c) (MAX_255(c)? 
xdigitab[c] : 0xff) 78 79 #if PCRE2_CODE_UNIT_WIDTH == 16 80 #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6 81 82 #else /* 32-bit */ 83 #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6 84 #endif 85 #endif 86 87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which 88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of 89 them will be able to (i.e. assume a 64-bit world). */ 90 91 #if PCRE2_SIZE_MAX <= UINT32_MAX 92 #define PUTOFFSET(s,p) *p++ = s 93 #define GETOFFSET(s,p) s = *p++ 94 #define GETPLUSOFFSET(s,p) s = *(++p) 95 #define READPLUSOFFSET(s,p) s = p[1] 96 #define SKIPOFFSET(p) p++ 97 #define SIZEOFFSET 1 98 #else 99 #define PUTOFFSET(s,p) \ 100 { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); } 101 #define GETOFFSET(s,p) \ 102 { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; } 103 #define GETPLUSOFFSET(s,p) \ 104 { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; } 105 #define READPLUSOFFSET(s,p) \ 106 { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; } 107 #define SKIPOFFSET(p) p += 2 108 #define SIZEOFFSET 2 109 #endif 110 111 /* Macros for manipulating elements of the parsed pattern vector. 
*/ 112 113 #define META_CODE(x) (x & 0xffff0000u) 114 #define META_DATA(x) (x & 0x0000ffffu) 115 #define META_DIFF(x,y) ((x-y)>>16) 116 117 /* Function definitions to allow mutual recursion */ 118 119 #ifdef SUPPORT_UNICODE 120 static unsigned int 121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, 122 compile_block *, const uint32_t *, unsigned int); 123 #endif 124 125 static int 126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t, 127 uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *, 128 compile_block *, PCRE2_SIZE *); 129 130 static int 131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *, 132 compile_block *); 133 134 static BOOL 135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *, 136 compile_block *); 137 138 139 140 /************************************************* 141 * Code parameters and static tables * 142 *************************************************/ 143 144 #define MAX_GROUP_NUMBER 65535u 145 #define MAX_REPEAT_COUNT 65535u 146 #define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1) 147 148 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in 149 different ways in the different pattern scans. The parsing and group- 150 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit 151 aligned for this. Having defined the size in code units, we set up 152 C16_WORK_SIZE as the number of elements in the 16-bit vector. 153 154 During the first compiling phase, when determining how much memory is required, 155 the regex is partly compiled into this space, but the compiled parts are 156 discarded as soon as they can be, so that hopefully there will never be an 157 overrun. The code does, however, check for an overrun, which can occur for 158 pathological patterns. The size of the workspace depends on LINK_SIZE because 159 the length of compiled items varies with this. 
160 161 In the real compile phase, this workspace is not currently used. */ 162 163 #define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */ 164 165 #define C16_WORK_SIZE \ 166 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t)) 167 168 /* A uint32_t vector is used for caching information about the size of 169 capturing groups, to improve performance. A default is created on the stack of 170 this size. */ 171 172 #define GROUPINFO_DEFAULT_SIZE 256 173 174 /* The overrun tests check for a slightly smaller size so that they detect the 175 overrun before it actually does run off the end of the data block. */ 176 177 #define WORK_SIZE_SAFETY_MARGIN (100) 178 179 /* This value determines the size of the initial vector that is used for 180 remembering named groups during the pre-compile. It is allocated on the stack, 181 but if it is too small, it is expanded, in a similar way to the workspace. The 182 value is the number of slots in the list. */ 183 184 #define NAMED_GROUP_LIST_SIZE 20 185 186 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector 187 of uint32_t. For short patterns this lives on the stack, with this size. Heap 188 memory is used for longer patterns. */ 189 190 #define PARSED_PATTERN_DEFAULT_SIZE 1024 191 192 /* Maximum length value to check against when making sure that the variable 193 that holds the compiled pattern length does not overflow. We make it a bit less 194 than INT_MAX to allow for adding in group terminating code units, so that we 195 don't have to check them every time. */ 196 197 #define OFLOW_MAX (INT_MAX - 20) 198 199 /* Code values for parsed patterns, which are stored in a vector of 32-bit 200 unsigned ints. Values less than META_END are literal data values. The coding 201 for identifying the item is in the top 16-bits, leaving 16 bits for the 202 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF 203 macros are used to manipulate parsed pattern elements. 
204 205 NOTE: When these definitions are changed, the table of extra lengths for each 206 code (meta_extra_lengths, just below) must be updated to remain in step. */ 207 208 #define META_END 0x80000000u /* End of pattern */ 209 210 #define META_ALT 0x80010000u /* alternation */ 211 #define META_ATOMIC 0x80020000u /* atomic group */ 212 #define META_BACKREF 0x80030000u /* Back ref */ 213 #define META_BACKREF_BYNAME 0x80040000u /* \k'name' */ 214 #define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */ 215 #define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */ 216 #define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */ 217 #define META_CAPTURE 0x80080000u /* Capturing parenthesis */ 218 #define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */ 219 #define META_CLASS 0x800a0000u /* start non-empty class */ 220 #define META_CLASS_EMPTY 0x800b0000u /* empty class */ 221 #define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */ 222 #define META_CLASS_END 0x800d0000u /* end of non-empty class */ 223 #define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */ 224 #define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */ 225 #define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */ 226 #define META_COND_NAME 0x80110000u /* (?(<name>)... */ 227 #define META_COND_NUMBER 0x80120000u /* (?(digits)... */ 228 #define META_COND_RNAME 0x80130000u /* (?(R&name)... */ 229 #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ 230 #define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */ 231 #define META_DOLLAR 0x80160000u /* $ metacharacter */ 232 #define META_DOT 0x80170000u /* . 
metacharacter */ 233 #define META_ESCAPE 0x80180000u /* \d and friends */ 234 #define META_KET 0x80190000u /* closing parenthesis */ 235 #define META_NOCAPTURE 0x801a0000u /* no capture parens */ 236 #define META_OPTIONS 0x801b0000u /* (?i) and friends */ 237 #define META_POSIX 0x801c0000u /* POSIX class item */ 238 #define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */ 239 #define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */ 240 #define META_RANGE_LITERAL 0x801f0000u /* range defined literally */ 241 #define META_RECURSE 0x80200000u /* Recursion */ 242 #define META_RECURSE_BYNAME 0x80210000u /* (?&name) */ 243 244 /* These must be kept together to make it easy to check that an assertion 245 is present where expected in a conditional group. */ 246 247 #define META_LOOKAHEAD 0x80220000u /* (?= */ 248 #define META_LOOKAHEADNOT 0x80230000u /* (?! */ 249 #define META_LOOKBEHIND 0x80240000u /* (?<= */ 250 #define META_LOOKBEHINDNOT 0x80250000u /* (?<! */ 251 252 /* These must be kept in this order, with consecutive values, and the _ARG 253 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument 254 versions. */ 255 256 #define META_MARK 0x80260000u /* (*MARK) */ 257 #define META_ACCEPT 0x80270000u /* (*ACCEPT) */ 258 #define META_FAIL 0x80280000u /* (*FAIL) */ 259 #define META_COMMIT 0x80290000u /* These */ 260 #define META_COMMIT_ARG 0x802a0000u /* pairs */ 261 #define META_PRUNE 0x802b0000u /* must */ 262 #define META_PRUNE_ARG 0x802c0000u /* be */ 263 #define META_SKIP 0x802d0000u /* kept */ 264 #define META_SKIP_ARG 0x802e0000u /* in */ 265 #define META_THEN 0x802f0000u /* this */ 266 #define META_THEN_ARG 0x80300000u /* order */ 267 268 /* These must be kept in groups of adjacent 3 values, and all together. */ 269 270 #define META_ASTERISK 0x80310000u /* * */ 271 #define META_ASTERISK_PLUS 0x80320000u /* *+ */ 272 #define META_ASTERISK_QUERY 0x80330000u /* *? 
*/ 273 #define META_PLUS 0x80340000u /* + */ 274 #define META_PLUS_PLUS 0x80350000u /* ++ */ 275 #define META_PLUS_QUERY 0x80360000u /* +? */ 276 #define META_QUERY 0x80370000u /* ? */ 277 #define META_QUERY_PLUS 0x80380000u /* ?+ */ 278 #define META_QUERY_QUERY 0x80390000u /* ?? */ 279 #define META_MINMAX 0x803a0000u /* {n,m} repeat */ 280 #define META_MINMAX_PLUS 0x803b0000u /* {n,m}+ repeat */ 281 #define META_MINMAX_QUERY 0x803c0000u /* {n,m}? repeat */ 282 283 #define META_FIRST_QUANTIFIER META_ASTERISK 284 #define META_LAST_QUANTIFIER META_MINMAX_QUERY 285 286 /* Table of extra lengths for each of the meta codes. Must be kept in step with 287 the definitions above. For some items these values are a basic length to which 288 a variable amount has to be added. */ 289 290 static unsigned char meta_extra_lengths[] = { 291 0, /* META_END */ 292 0, /* META_ALT */ 293 0, /* META_ATOMIC */ 294 0, /* META_BACKREF - more if group is >= 10 */ 295 1+SIZEOFFSET, /* META_BACKREF_BYNAME */ 296 1, /* META_BIGVALUE */ 297 3, /* META_CALLOUT_NUMBER */ 298 3+SIZEOFFSET, /* META_CALLOUT_STRING */ 299 0, /* META_CAPTURE */ 300 0, /* META_CIRCUMFLEX */ 301 0, /* META_CLASS */ 302 0, /* META_CLASS_EMPTY */ 303 0, /* META_CLASS_EMPTY_NOT */ 304 0, /* META_CLASS_END */ 305 0, /* META_CLASS_NOT */ 306 0, /* META_COND_ASSERT */ 307 SIZEOFFSET, /* META_COND_DEFINE */ 308 1+SIZEOFFSET, /* META_COND_NAME */ 309 1+SIZEOFFSET, /* META_COND_NUMBER */ 310 1+SIZEOFFSET, /* META_COND_RNAME */ 311 1+SIZEOFFSET, /* META_COND_RNUMBER */ 312 3, /* META_COND_VERSION */ 313 0, /* META_DOLLAR */ 314 0, /* META_DOT */ 315 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */ 316 0, /* META_KET */ 317 0, /* META_NOCAPTURE */ 318 1, /* META_OPTIONS */ 319 1, /* META_POSIX */ 320 1, /* META_POSIX_NEG */ 321 0, /* META_RANGE_ESCAPED */ 322 0, /* META_RANGE_LITERAL */ 323 SIZEOFFSET, /* META_RECURSE */ 324 1+SIZEOFFSET, /* META_RECURSE_BYNAME */ 325 0, /* META_LOOKAHEAD */ 326 0, /* META_LOOKAHEADNOT 
*/ 327 SIZEOFFSET, /* META_LOOKBEHIND */ 328 SIZEOFFSET, /* META_LOOKBEHINDNOT */ 329 1, /* META_MARK - plus the string length */ 330 0, /* META_ACCEPT */ 331 0, /* META_FAIL */ 332 0, /* META_COMMIT */ 333 1, /* META_COMMIT_ARG - plus the string length */ 334 0, /* META_PRUNE */ 335 1, /* META_PRUNE_ARG - plus the string length */ 336 0, /* META_SKIP */ 337 1, /* META_SKIP_ARG - plus the string length */ 338 0, /* META_THEN */ 339 1, /* META_THEN_ARG - plus the string length */ 340 0, /* META_ASTERISK */ 341 0, /* META_ASTERISK_PLUS */ 342 0, /* META_ASTERISK_QUERY */ 343 0, /* META_PLUS */ 344 0, /* META_PLUS_PLUS */ 345 0, /* META_PLUS_QUERY */ 346 0, /* META_QUERY */ 347 0, /* META_QUERY_PLUS */ 348 0, /* META_QUERY_QUERY */ 349 2, /* META_MINMAX */ 350 2, /* META_MINMAX_PLUS */ 351 2 /* META_MINMAX_QUERY */ 352 }; 353 354 /* Types for skipping parts of a parsed pattern. */ 355 356 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET }; 357 358 /* Macro for setting individual bits in class bitmaps. It took some 359 experimenting to figure out how to stop gcc 5.3.0 from warning with 360 -Wconversion. This version gets a warning: 361 362 #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7)) 363 364 Let's hope the apparently less efficient version isn't actually so bad if the 365 compiler is clever with identical subexpressions. */ 366 367 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1 << ((b)&7))) 368 369 /* Private flags added to firstcu and reqcu. */ 370 371 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */ 372 #define REQ_VARY (1 << 1) /* reqcu followed non-literal item */ 373 /* Negative values for the firstcu and reqcu flags */ 374 #define REQ_UNSET (-2) /* Not yet found anything */ 375 #define REQ_NONE (-1) /* Found not fixed char */ 376 377 /* These flags are used in the groupinfo vector. 
*/ 378 379 #define GI_SET_FIXED_LENGTH 0x80000000u 380 #define GI_NOT_FIXED_LENGTH 0x40000000u 381 #define GI_FIXED_LENGTH_MASK 0x0000ffffu 382 383 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC 384 and is fast (a good compiler can turn it into a subtraction and unsigned 385 comparison). */ 386 387 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) 388 389 /* Table to identify hex digits. The tables in chartables are dependent on the 390 locale, and may mark arbitrary characters as digits. We want to recognize only 391 0-9, a-z, and A-Z as hex digits, which is why we have a private table here. It 392 costs 256 bytes, but it is a lot faster than doing character value tests (at 393 least in some simple cases I timed), and in some applications one wants PCRE2 394 to compile efficiently as well as match efficiently. The value in the table is 395 the binary hex digit value, or 0xff for non-hex digits. */ 396 397 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in 398 UTF-8 mode. */ 399 400 #ifndef EBCDIC 401 static const uint8_t xdigitab[] = 402 { 403 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */ 404 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ 405 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */ 406 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ 407 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */ 408 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */ 409 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */ 410 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? 
*/ 411 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */ 412 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */ 413 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */ 414 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */ 415 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */ 416 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */ 417 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */ 418 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */ 419 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */ 420 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */ 421 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */ 422 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */ 423 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */ 424 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */ 425 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */ 426 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ 427 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */ 428 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 2ff-207 */ 429 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */ 430 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */ 431 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */ 432 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */ 433 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */ 434 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */ 435 436 #else 437 438 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. 
*/ 439 440 static const uint8_t xdigitab[] = 441 { 442 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */ 443 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ 444 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */ 445 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ 446 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */ 447 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */ 448 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */ 449 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */ 450 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */ 451 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */ 452 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */ 453 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */ 454 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */ 455 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */ 456 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */ 457 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */ 458 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */ 459 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */ 460 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */ 461 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */ 462 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */ 463 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */ 464 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */ 465 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ 466 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */ 467 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */ 468 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */ 469 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */ 470 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */ 471 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */ 472 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */ 473 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */ 474 #endif /* EBCDIC */ 475 476 477 /* Table for 
handling alphanumeric escaped characters. Positive returns are 478 simple data values; negative values are for special things like \d and so on. 479 Zero means further processing is needed (for things like \x), or the escape is 480 invalid. */ 481 482 /* This is the "normal" table for ASCII systems or for EBCDIC systems running 483 in UTF-8 mode. It runs from '0' to 'z'. */ 484 485 #ifndef EBCDIC 486 #define ESCAPES_FIRST CHAR_0 487 #define ESCAPES_LAST CHAR_z 488 #define UPPER_CASE(c) (c-32) 489 490 static const short int escapes[] = { 491 0, 0, 492 0, 0, 493 0, 0, 494 0, 0, 495 0, 0, 496 CHAR_COLON, CHAR_SEMICOLON, 497 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, 498 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, 499 CHAR_COMMERCIAL_AT, -ESC_A, 500 -ESC_B, -ESC_C, 501 -ESC_D, -ESC_E, 502 0, -ESC_G, 503 -ESC_H, 0, 504 0, -ESC_K, 505 0, 0, 506 -ESC_N, 0, 507 -ESC_P, -ESC_Q, 508 -ESC_R, -ESC_S, 509 0, 0, 510 -ESC_V, -ESC_W, 511 -ESC_X, 0, 512 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, 513 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, 514 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, 515 CHAR_GRAVE_ACCENT, CHAR_BEL, 516 -ESC_b, 0, 517 -ESC_d, CHAR_ESC, 518 CHAR_FF, 0, 519 -ESC_h, 0, 520 0, -ESC_k, 521 0, 0, 522 CHAR_LF, 0, 523 -ESC_p, 0, 524 CHAR_CR, -ESC_s, 525 CHAR_HT, 0, 526 -ESC_v, -ESC_w, 527 0, 0, 528 -ESC_z 529 }; 530 531 #else 532 533 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. 534 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code 535 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a 536 because it is defined as 'a', which of course picks up the ASCII value. 
*/ 537 538 #if 'a' == 0x81 /* Check for a real EBCDIC environment */ 539 #define ESCAPES_FIRST CHAR_a 540 #define ESCAPES_LAST CHAR_9 541 #define UPPER_CASE(c) (c+64) 542 #else /* Testing in an ASCII environment */ 543 #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */ 544 #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */ 545 #define UPPER_CASE(c) (c-32) 546 #endif 547 548 static const short int escapes[] = { 549 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0, 550 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0, 551 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p, 552 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0, 553 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0, 554 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0, 555 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 556 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', 557 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, 558 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0, 559 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P, 560 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0, 561 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X, 562 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0, 563 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, 564 /* F8 */ 0, 0 565 }; 566 567 /* We also need a table of characters that may follow \c in an EBCDIC 568 environment for characters 0-31. */ 569 570 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; 571 572 #endif /* EBCDIC */ 573 574 575 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is 576 searched linearly. Put all the names into a single string, in order to reduce 577 the number of relocations when a shared library is dynamically linked. The 578 string is built from string macros so that it works in UTF-8 mode on EBCDIC 579 platforms. 
*/ 580 581 typedef struct verbitem { 582 unsigned int len; /* Length of verb name */ 583 uint32_t meta; /* Base META_ code */ 584 int has_arg; /* Argument requirement */ 585 } verbitem; 586 587 static const char verbnames[] = 588 "\0" /* Empty name is a shorthand for MARK */ 589 STRING_MARK0 590 STRING_ACCEPT0 591 STRING_F0 592 STRING_FAIL0 593 STRING_COMMIT0 594 STRING_PRUNE0 595 STRING_SKIP0 596 STRING_THEN; 597 598 static const verbitem verbs[] = { 599 { 0, META_MARK, +1 }, /* > 0 => must have an argument */ 600 { 4, META_MARK, +1 }, 601 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */ 602 { 1, META_FAIL, -1 }, 603 { 4, META_FAIL, -1 }, 604 { 6, META_COMMIT, 0 }, 605 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */ 606 { 4, META_SKIP, 0 }, 607 { 4, META_THEN, 0 } 608 }; 609 610 static const int verbcount = sizeof(verbs)/sizeof(verbitem); 611 612 /* Verb opcodes, indexed by their META code offset from META_MARK. */ 613 614 static const uint32_t verbops[] = { 615 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE, 616 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG }; 617 618 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */ 619 620 static uint32_t chartypeoffset[] = { 621 OP_STAR - OP_STAR, OP_STARI - OP_STAR, 622 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR }; 623 624 /* Tables of names of POSIX character classes and their lengths. The names are 625 now all in a single string, to reduce the number of relocations when a shared 626 library is dynamically loaded. The list of lengths is terminated by a zero 627 length entry. The first three must be alpha, lower, upper, as this is assumed 628 for handling case independence. The indices for graph, print, and punct are 629 needed, so identify them. 
*/ 630 631 static const char posix_names[] = 632 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 633 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 634 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 635 STRING_word0 STRING_xdigit; 636 637 static const uint8_t posix_name_lengths[] = { 638 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; 639 640 #define PC_GRAPH 8 641 #define PC_PRINT 9 642 #define PC_PUNCT 10 643 644 /* Table of class bit maps for each POSIX class. Each class is formed from a 645 base map, with an optional addition or removal of another map. Then, for some 646 classes, there is some additional tweaking: for [:blank:] the vertical space 647 characters are removed, and for [:alpha:] and [:alnum:] the underscore 648 character is removed. The triples in the table consist of the base map offset, 649 second map offset or -1 if no second map, and a non-negative value for map 650 addition or a negative value for map subtraction (if there are two maps). The 651 absolute value of the third field has these meanings: 0 => no tweaking, 1 => 652 remove vertical space characters, 2 => remove underscore. */ 653 654 static const int posix_class_maps[] = { 655 cbit_word, cbit_digit, -2, /* alpha */ 656 cbit_lower, -1, 0, /* lower */ 657 cbit_upper, -1, 0, /* upper */ 658 cbit_word, -1, 2, /* alnum - word without underscore */ 659 cbit_print, cbit_cntrl, 0, /* ascii */ 660 cbit_space, -1, 1, /* blank - a GNU extension */ 661 cbit_cntrl, -1, 0, /* cntrl */ 662 cbit_digit, -1, 0, /* digit */ 663 cbit_graph, -1, 0, /* graph */ 664 cbit_print, -1, 0, /* print */ 665 cbit_punct, -1, 0, /* punct */ 666 cbit_space, -1, 0, /* space */ 667 cbit_word, -1, 0, /* word - a Perl extension */ 668 cbit_xdigit,-1, 0 /* xdigit */ 669 }; 670 671 #ifdef SUPPORT_UNICODE 672 673 /* The POSIX class Unicode property substitutes that are used in UCP mode must 674 be in the order of the POSIX class names, defined above. 
*/ 675 676 static int posix_substitutes[] = { 677 PT_GC, ucp_L, /* alpha */ 678 PT_PC, ucp_Ll, /* lower */ 679 PT_PC, ucp_Lu, /* upper */ 680 PT_ALNUM, 0, /* alnum */ 681 -1, 0, /* ascii, treat as non-UCP */ 682 -1, 1, /* blank, treat as \h */ 683 PT_PC, ucp_Cc, /* cntrl */ 684 PT_PC, ucp_Nd, /* digit */ 685 PT_PXGRAPH, 0, /* graph */ 686 PT_PXPRINT, 0, /* print */ 687 PT_PXPUNCT, 0, /* punct */ 688 PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */ 689 PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */ 690 -1, 0 /* xdigit, treat as non-UCP */ 691 }; 692 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t))) 693 #endif /* SUPPORT_UNICODE */ 694 695 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset 696 are allowed. */ 697 698 #define PUBLIC_LITERAL_COMPILE_OPTIONS \ 699 (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \ 700 PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_NO_START_OPTIMIZE| \ 701 PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF) 702 703 #define PUBLIC_COMPILE_OPTIONS \ 704 (PUBLIC_LITERAL_COMPILE_OPTIONS| \ 705 PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ 706 PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \ 707 PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \ 708 PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ 709 PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ 710 PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY) 711 712 #define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \ 713 (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD) 714 715 #define PUBLIC_COMPILE_EXTRA_OPTIONS \ 716 (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \ 717 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) 718 719 /* Compile time error code numbers. They are given names so that they can more 720 easily be tracked. 
When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94 };

/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported.
*/ 740 741 enum { PSO_OPT, /* Value is an option bit */ 742 PSO_FLG, /* Value is a flag bit */ 743 PSO_NL, /* Value is a newline type */ 744 PSO_BSR, /* Value is a \R type */ 745 PSO_LIMH, /* Read integer value for heap limit */ 746 PSO_LIMM, /* Read integer value for match limit */ 747 PSO_LIMD }; /* Read integer value for depth limit */ 748 749 typedef struct pso { 750 const uint8_t *name; 751 uint16_t length; 752 uint16_t type; 753 uint32_t value; 754 } pso; 755 756 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */ 757 758 static pso pso_list[] = { 759 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, 760 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, 761 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, 762 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, 763 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, 764 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, 765 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, 766 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, 767 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, 768 { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 }, 769 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, 770 { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 }, 771 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 }, 772 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, 773 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, 774 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF }, 775 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, 776 { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL }, 777 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, 778 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, 779 
{ (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } 780 }; 781 782 /* This table is used when converting repeating opcodes into possessified 783 versions as a result of an explicit possessive quantifier such as ++. A zero 784 value means there is no possessified version - in those cases the item in 785 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT 786 because all relevant opcodes are less than that. */ 787 788 static const uint8_t opcode_possessify[] = { 789 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ 790 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ 791 792 0, /* NOTI */ 793 OP_POSSTAR, 0, /* STAR, MINSTAR */ 794 OP_POSPLUS, 0, /* PLUS, MINPLUS */ 795 OP_POSQUERY, 0, /* QUERY, MINQUERY */ 796 OP_POSUPTO, 0, /* UPTO, MINUPTO */ 797 0, /* EXACT */ 798 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ 799 800 OP_POSSTARI, 0, /* STARI, MINSTARI */ 801 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ 802 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ 803 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ 804 0, /* EXACTI */ 805 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ 806 807 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ 808 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ 809 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ 810 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ 811 0, /* NOTEXACT */ 812 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ 813 814 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ 815 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ 816 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ 817 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ 818 0, /* NOTEXACTI */ 819 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ 820 821 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ 822 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */ 823 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ 824 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */ 825 0, /* TYPEEXACT */ 826 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ 827 828 
OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ 829 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ 830 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ 831 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ 832 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */ 833 834 0, 0, 0, /* CLASS, NCLASS, XCLASS */ 835 0, 0, /* REF, REFI */ 836 0, 0, /* DNREF, DNREFI */ 837 0, 0 /* RECURSE, CALLOUT */ 838 }; 839 840 841 #ifdef DEBUG_SHOW_PARSED 842 /************************************************* 843 * Show the parsed pattern for debugging * 844 *************************************************/ 845 846 /* For debugging the pre-scan, this code, which outputs the parsed data vector, 847 can be enabled. */ 848 849 static void show_parsed(compile_block *cb) 850 { 851 uint32_t *pptr = cb->parsed_pattern; 852 853 for (;;) 854 { 855 int max, min; 856 PCRE2_SIZE offset; 857 uint32_t i; 858 uint32_t length; 859 uint32_t meta_arg = META_DATA(*pptr); 860 861 fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr); 862 863 if (*pptr < META_END) 864 { 865 if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr); 866 pptr++; 867 } 868 869 else switch (META_CODE(*pptr++)) 870 { 871 default: 872 fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n"); 873 return; 874 875 case META_END: 876 fprintf(stderr, "META_END\n"); 877 return; 878 879 case META_CAPTURE: 880 fprintf(stderr, "META_CAPTURE %d", meta_arg); 881 break; 882 883 case META_RECURSE: 884 GETOFFSET(offset, pptr); 885 fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset); 886 break; 887 888 case META_BACKREF: 889 if (meta_arg < 10) 890 offset = cb->small_ref_offset[meta_arg]; 891 else 892 GETOFFSET(offset, pptr); 893 fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset); 894 break; 895 896 case META_ESCAPE: 897 if (meta_arg == ESC_P || meta_arg == ESC_p) 898 { 899 uint32_t ptype = *pptr >> 16; 900 uint32_t pvalue = *pptr++ & 0xffff; 901 fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 
'P':'p', 902 ptype, pvalue); 903 } 904 else 905 { 906 uint32_t cc; 907 /* There's just one escape we might have here that isn't negated in the 908 escapes table. */ 909 if (meta_arg == ESC_g) cc = CHAR_g; 910 else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++) 911 { 912 if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break; 913 } 914 if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK; 915 fprintf(stderr, "META \\%c", cc); 916 } 917 break; 918 919 case META_MINMAX: 920 min = *pptr++; 921 max = *pptr++; 922 if (max != REPEAT_UNLIMITED) 923 fprintf(stderr, "META {%d,%d}", min, max); 924 else 925 fprintf(stderr, "META {%d,}", min); 926 break; 927 928 case META_MINMAX_QUERY: 929 min = *pptr++; 930 max = *pptr++; 931 if (max != REPEAT_UNLIMITED) 932 fprintf(stderr, "META {%d,%d}?", min, max); 933 else 934 fprintf(stderr, "META {%d,}?", min); 935 break; 936 937 case META_MINMAX_PLUS: 938 min = *pptr++; 939 max = *pptr++; 940 if (max != REPEAT_UNLIMITED) 941 fprintf(stderr, "META {%d,%d}+", min, max); 942 else 943 fprintf(stderr, "META {%d,}+", min); 944 break; 945 946 case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break; 947 case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break; 948 case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break; 949 case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break; 950 case META_DOT: fprintf(stderr, "META_DOT"); break; 951 case META_ASTERISK: fprintf(stderr, "META *"); break; 952 case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break; 953 case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break; 954 case META_PLUS: fprintf(stderr, "META +"); break; 955 case META_PLUS_QUERY: fprintf(stderr, "META +?"); break; 956 case META_PLUS_PLUS: fprintf(stderr, "META ++"); break; 957 case META_QUERY: fprintf(stderr, "META ?"); break; 958 case META_QUERY_QUERY: fprintf(stderr, "META ??"); break; 959 case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break; 960 961 case META_ATOMIC: 
fprintf(stderr, "META (?>"); break; 962 case META_NOCAPTURE: fprintf(stderr, "META (?:"); break; 963 case META_LOOKAHEAD: fprintf(stderr, "META (?="); break; 964 case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break; 965 case META_KET: fprintf(stderr, "META )"); break; 966 case META_ALT: fprintf(stderr, "META | %d", meta_arg); break; 967 968 case META_CLASS: fprintf(stderr, "META ["); break; 969 case META_CLASS_NOT: fprintf(stderr, "META [^"); break; 970 case META_CLASS_END: fprintf(stderr, "META ]"); break; 971 case META_CLASS_EMPTY: fprintf(stderr, "META []"); break; 972 case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break; 973 974 case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break; 975 case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break; 976 977 case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break; 978 case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break; 979 980 case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break; 981 case META_FAIL: fprintf(stderr, "META (*FAIL)"); break; 982 case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break; 983 case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break; 984 case META_SKIP: fprintf(stderr, "META (*SKIP)"); break; 985 case META_THEN: fprintf(stderr, "META (*THEN)"); break; 986 987 case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break; 988 989 case META_LOOKBEHIND: 990 fprintf(stderr, "META (?<= %d offset=", meta_arg); 991 GETOFFSET(offset, pptr); 992 fprintf(stderr, "%zd", offset); 993 break; 994 995 case META_LOOKBEHINDNOT: 996 fprintf(stderr, "META (?<! 
%d offset=", meta_arg); 997 GETOFFSET(offset, pptr); 998 fprintf(stderr, "%zd", offset); 999 break; 1000 1001 case META_CALLOUT_NUMBER: 1002 fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0], 1003 pptr[1]); 1004 pptr += 3; 1005 break; 1006 1007 case META_CALLOUT_STRING: 1008 { 1009 uint32_t patoffset = *pptr++; /* Offset of next pattern item */ 1010 uint32_t patlength = *pptr++; /* Length of next pattern item */ 1011 fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++); 1012 GETOFFSET(offset, pptr); 1013 fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength); 1014 } 1015 break; 1016 1017 case META_RECURSE_BYNAME: 1018 fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++); 1019 GETOFFSET(offset, pptr); 1020 fprintf(stderr, "%zd", offset); 1021 break; 1022 1023 case META_BACKREF_BYNAME: 1024 fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++); 1025 GETOFFSET(offset, pptr); 1026 fprintf(stderr, "%zd", offset); 1027 break; 1028 1029 case META_COND_NUMBER: 1030 fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]); 1031 GETOFFSET(offset, pptr); 1032 fprintf(stderr, "%zd", offset); 1033 pptr++; 1034 break; 1035 1036 case META_COND_DEFINE: 1037 fprintf(stderr, "META (?(DEFINE) offset="); 1038 GETOFFSET(offset, pptr); 1039 fprintf(stderr, "%zd", offset); 1040 break; 1041 1042 case META_COND_VERSION: 1043 fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">="); 1044 fprintf(stderr, "%d.", *pptr++); 1045 fprintf(stderr, "%d)", *pptr++); 1046 break; 1047 1048 case META_COND_NAME: 1049 fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++); 1050 GETOFFSET(offset, pptr); 1051 fprintf(stderr, "%zd", offset); 1052 break; 1053 1054 case META_COND_RNAME: 1055 fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++); 1056 GETOFFSET(offset, pptr); 1057 fprintf(stderr, "%zd", offset); 1058 break; 1059 1060 /* This is kept as a name, because it might be. 
*/ 1061 1062 case META_COND_RNUMBER: 1063 fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++); 1064 GETOFFSET(offset, pptr); 1065 fprintf(stderr, "%zd", offset); 1066 break; 1067 1068 case META_MARK: 1069 fprintf(stderr, "META (*MARK:"); 1070 goto SHOWARG; 1071 1072 case META_COMMIT_ARG: 1073 fprintf(stderr, "META (*COMMIT:"); 1074 goto SHOWARG; 1075 1076 case META_PRUNE_ARG: 1077 fprintf(stderr, "META (*PRUNE:"); 1078 goto SHOWARG; 1079 1080 case META_SKIP_ARG: 1081 fprintf(stderr, "META (*SKIP:"); 1082 goto SHOWARG; 1083 1084 case META_THEN_ARG: 1085 fprintf(stderr, "META (*THEN:"); 1086 SHOWARG: 1087 length = *pptr++; 1088 for (i = 0; i < length; i++) 1089 { 1090 uint32_t cc = *pptr++; 1091 if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc); 1092 else fprintf(stderr, "\\x{%x}", cc); 1093 } 1094 fprintf(stderr, ") length=%u", length); 1095 break; 1096 } 1097 fprintf(stderr, "\n"); 1098 } 1099 return; 1100 } 1101 #endif /* DEBUG_SHOW_PARSED */ 1102 1103 1104 1105 /************************************************* 1106 * Copy compiled code * 1107 *************************************************/ 1108 1109 /* Compiled JIT code cannot be copied, so the new compiled block has no 1110 associated JIT data. */ 1111 1112 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION 1113 pcre2_code_copy(const pcre2_code *code) 1114 { 1115 PCRE2_SIZE* ref_count; 1116 pcre2_code *newcode; 1117 1118 if (code == NULL) return NULL; 1119 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); 1120 if (newcode == NULL) return NULL; 1121 memcpy(newcode, code, code->blocksize); 1122 newcode->executable_jit = NULL; 1123 1124 /* If the code is one that has been deserialized, increment the reference count 1125 in the decoded tables. 
*/ 1126 1127 if ((code->flags & PCRE2_DEREF_TABLES) != 0) 1128 { 1129 ref_count = (PCRE2_SIZE *)(code->tables + tables_length); 1130 (*ref_count)++; 1131 } 1132 1133 return newcode; 1134 } 1135 1136 1137 1138 /************************************************* 1139 * Copy compiled code and character tables * 1140 *************************************************/ 1141 1142 /* Compiled JIT code cannot be copied, so the new compiled block has no 1143 associated JIT data. This version of code_copy also makes a separate copy of 1144 the character tables. */ 1145 1146 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION 1147 pcre2_code_copy_with_tables(const pcre2_code *code) 1148 { 1149 PCRE2_SIZE* ref_count; 1150 pcre2_code *newcode; 1151 uint8_t *newtables; 1152 1153 if (code == NULL) return NULL; 1154 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); 1155 if (newcode == NULL) return NULL; 1156 memcpy(newcode, code, code->blocksize); 1157 newcode->executable_jit = NULL; 1158 1159 newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE), 1160 code->memctl.memory_data); 1161 if (newtables == NULL) 1162 { 1163 code->memctl.free((void *)newcode, code->memctl.memory_data); 1164 return NULL; 1165 } 1166 memcpy(newtables, code->tables, tables_length); 1167 ref_count = (PCRE2_SIZE *)(newtables + tables_length); 1168 *ref_count = 1; 1169 1170 newcode->tables = newtables; 1171 newcode->flags |= PCRE2_DEREF_TABLES; 1172 return newcode; 1173 } 1174 1175 1176 1177 /************************************************* 1178 * Free compiled code * 1179 *************************************************/ 1180 1181 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 1182 pcre2_code_free(pcre2_code *code) 1183 { 1184 PCRE2_SIZE* ref_count; 1185 1186 if (code != NULL) 1187 { 1188 if (code->executable_jit != NULL) 1189 PRIV(jit_free)(code->executable_jit, &code->memctl); 1190 1191 if ((code->flags & PCRE2_DEREF_TABLES) != 0) 1192 { 1193 /* Decoded tables belong to the 
codes after deserialization, and they must 1194 be freed when there are no more reference to them. The *ref_count should 1195 always be > 0. */ 1196 1197 ref_count = (PCRE2_SIZE *)(code->tables + tables_length); 1198 if (*ref_count > 0) 1199 { 1200 (*ref_count)--; 1201 if (*ref_count == 0) 1202 code->memctl.free((void *)code->tables, code->memctl.memory_data); 1203 } 1204 } 1205 1206 code->memctl.free(code, code->memctl.memory_data); 1207 } 1208 } 1209 1210 1211 1212 /************************************************* 1213 * Read a number, possibly signed * 1214 *************************************************/ 1215 1216 /* This function is used to read numbers in the pattern. The initial pointer 1217 must be the sign or first digit of the number. When relative values (introduced 1218 by + or -) are allowed, they are relative group numbers, and the result must be 1219 greater than zero. 1220 1221 Arguments: 1222 ptrptr points to the character pointer variable 1223 ptrend points to the end of the input string 1224 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this 1225 max_value the largest number allowed 1226 max_error the error to give for an over-large number 1227 intptr where to put the result 1228 errcodeptr where to put an error code 1229 1230 Returns: TRUE - a number was read 1231 FALSE - errorcode == 0 => no number was found 1232 errorcode != 0 => an error occurred 1233 */ 1234 1235 static BOOL 1236 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign, 1237 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr) 1238 { 1239 int sign = 0; 1240 uint32_t n = 0; 1241 PCRE2_SPTR ptr = *ptrptr; 1242 BOOL yield = FALSE; 1243 1244 *errorcodeptr = 0; 1245 1246 if (allow_sign >= 0 && ptr < ptrend) 1247 { 1248 if (*ptr == CHAR_PLUS) 1249 { 1250 sign = +1; 1251 max_value -= allow_sign; 1252 ptr++; 1253 } 1254 else if (*ptr == CHAR_MINUS) 1255 { 1256 sign = -1; 1257 ptr++; 1258 } 1259 } 1260 1261 if (ptr >= ptrend || 
!IS_DIGIT(*ptr)) return FALSE; 1262 while (ptr < ptrend && IS_DIGIT(*ptr)) 1263 { 1264 n = n * 10 + *ptr++ - CHAR_0; 1265 if (n > max_value) 1266 { 1267 *errorcodeptr = max_error; 1268 goto EXIT; 1269 } 1270 } 1271 1272 if (allow_sign >= 0 && sign != 0) 1273 { 1274 if (n == 0) 1275 { 1276 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */ 1277 goto EXIT; 1278 } 1279 1280 if (sign > 0) n += allow_sign; 1281 else if ((int)n > allow_sign) 1282 { 1283 *errorcodeptr = ERR15; /* Non-existent subpattern */ 1284 goto EXIT; 1285 } 1286 else n = allow_sign + 1 - n; 1287 } 1288 1289 yield = TRUE; 1290 1291 EXIT: 1292 *intptr = n; 1293 *ptrptr = ptr; 1294 return yield; 1295 } 1296 1297 1298 1299 /************************************************* 1300 * Read repeat counts * 1301 *************************************************/ 1302 1303 /* Read an item of the form {n,m} and return the values if non-NULL pointers 1304 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a 1305 larger value is used for "unlimited". We have to use signed arguments for 1306 read_number() because it is capable of returning a signed value. 
1307 1308 Arguments: 1309 ptrptr points to pointer to character after'{' 1310 ptrend pointer to end of input 1311 minp if not NULL, pointer to int for min 1312 maxp if not NULL, pointer to int for max (-1 if no max) 1313 returned as -1 if no max 1314 errorcodeptr points to error code variable 1315 1316 Returns: FALSE if not a repeat quantifier, errorcode set zero 1317 FALSE on error, with errorcode set non-zero 1318 TRUE on success, with pointer updated to point after '}' 1319 */ 1320 1321 static BOOL 1322 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp, 1323 uint32_t *maxp, int *errorcodeptr) 1324 { 1325 PCRE2_SPTR p = *ptrptr; 1326 BOOL yield = FALSE; 1327 int32_t min = 0; 1328 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */ 1329 1330 /* NB read_number() initializes the error code to zero. The only error is for a 1331 number that is too big. */ 1332 1333 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr)) 1334 goto EXIT; 1335 1336 if (p >= ptrend) goto EXIT; 1337 1338 if (*p == CHAR_RIGHT_CURLY_BRACKET) 1339 { 1340 p++; 1341 max = min; 1342 } 1343 1344 else 1345 { 1346 if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT; 1347 if (*p != CHAR_RIGHT_CURLY_BRACKET) 1348 { 1349 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, 1350 errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) 1351 goto EXIT; 1352 if (max < min) 1353 { 1354 *errorcodeptr = ERR4; 1355 goto EXIT; 1356 } 1357 } 1358 p++; 1359 } 1360 1361 yield = TRUE; 1362 if (minp != NULL) *minp = (uint32_t)min; 1363 if (maxp != NULL) *maxp = (uint32_t)max; 1364 1365 /* Update the pattern pointer on success, or after an error, but not when 1366 the result is "not a repeat quantifier". 
*/ 1367 1368 EXIT: 1369 if (yield || *errorcodeptr != 0) *ptrptr = p; 1370 return yield; 1371 1372 1373 1374 } 1375 1376 1377 1378 /************************************************* 1379 * Handle escapes * 1380 *************************************************/ 1381 1382 /* This function is called when a \ has been encountered. It either returns a 1383 positive value for a simple escape such as \d, or 0 for a data character, which 1384 is placed in chptr. A backreference to group n is returned as negative n. On 1385 entry, ptr is pointing at the character after \. On exit, it points after the 1386 final code unit of the escape sequence. 1387 1388 This function is also called from pcre2_substitute() to handle escape sequences 1389 in replacement strings. In this case, the cb argument is NULL, and in the case 1390 of escapes that have further processing, only sequences that define a data 1391 character are recognised. The isclass argument is not relevant; the options 1392 argument is the final value of the compiled pattern's options. 1393 1394 Arguments: 1395 ptrptr points to the input position pointer 1396 ptrend points to the end of the input 1397 chptr points to a returned data character 1398 errorcodeptr points to the errorcode variable (containing zero) 1399 options the current options bits 1400 isclass TRUE if inside a character class 1401 cb compile data block 1402 1403 Returns: zero => a data character 1404 positive => a special escape sequence 1405 negative => a numerical back reference 1406 on error, errorcodeptr is set non-zero 1407 */ 1408 1409 int 1410 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, 1411 int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb) 1412 { 1413 BOOL utf = (options & PCRE2_UTF) != 0; 1414 PCRE2_SPTR ptr = *ptrptr; 1415 uint32_t c, cc; 1416 int escape = 0; 1417 int i; 1418 1419 /* If backslash is at the end of the string, it's an error. 
*/ 1420 1421 if (ptr >= ptrend) 1422 { 1423 *errorcodeptr = ERR1; 1424 return 0; 1425 } 1426 1427 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ 1428 *errorcodeptr = 0; /* Be optimistic */ 1429 1430 /* Non-alphanumerics are literals, so we just leave the value in c. An initial 1431 value test saves a memory lookup for code points outside the alphanumeric 1432 range. Otherwise, do a table lookup. A non-zero result is something that can be 1433 returned immediately. Otherwise further processing is required. */ 1434 1435 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ 1436 1437 else if ((i = escapes[c - ESCAPES_FIRST]) != 0) 1438 { 1439 if (i > 0) c = (uint32_t)i; else /* Positive is a data character */ 1440 { 1441 escape = -i; /* Else return a special escape */ 1442 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X)) 1443 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ 1444 1445 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical 1446 Unicode code points, as well as plain \N for "not newline". PCRE does not 1447 support \N{name}. However, it does support quantification such as \N{2,3}, 1448 so if \N{ is not followed by U+dddd we check for a quantifier. */ 1449 1450 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) 1451 { 1452 PCRE2_SPTR p = ptr + 1; 1453 1454 /* \N{U+ can be handled by the \x{ code. However, this construction is 1455 not valid in EBCDIC environments because it specifies a Unicode 1456 character, not a codepoint in the local code. For example \N{U+0041} 1457 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode 1458 casing semantics for the entire pattern, so allow it only in UTF (i.e. 1459 Unicode) mode. 
*/ 1460 1461 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS) 1462 { 1463 #ifdef EBCDIC 1464 *errorcodeptr = ERR93; 1465 #else 1466 if (utf) 1467 { 1468 ptr = p + 1; 1469 escape = 0; /* Not a fancy escape after all */ 1470 goto COME_FROM_NU; 1471 } 1472 else *errorcodeptr = ERR93; 1473 #endif 1474 } 1475 1476 /* Give an error if what follows is not a quantifier, but don't override 1477 an error set by the quantifier reader (e.g. number overflow). */ 1478 1479 else 1480 { 1481 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) && 1482 *errorcodeptr == 0) 1483 *errorcodeptr = ERR37; 1484 } 1485 } 1486 } 1487 } 1488 1489 /* Escapes that need further processing, including those that are unknown. 1490 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u 1491 when BSUX is set). */ 1492 1493 else 1494 { 1495 PCRE2_SPTR oldptr; 1496 BOOL overflow; 1497 int s; 1498 1499 /* Filter calls from pcre2_substitute(). */ 1500 1501 if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x && 1502 (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0)) 1503 { 1504 *errorcodeptr = ERR3; 1505 return 0; 1506 } 1507 1508 switch (c) 1509 { 1510 /* A number of Perl escapes are not handled by PCRE. We give an explicit 1511 error. */ 1512 1513 case CHAR_F: 1514 case CHAR_l: 1515 case CHAR_L: 1516 *errorcodeptr = ERR37; 1517 break; 1518 1519 /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated 1520 specially, \u must be followed by four hex digits. Otherwise it is a 1521 lowercase u letter. 
*/ 1522 1523 case CHAR_u: 1524 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else 1525 { 1526 uint32_t xc; 1527 if (ptrend - ptr < 4) break; /* Less than 4 chars */ 1528 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ 1529 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ 1530 cc = (cc << 4) | xc; 1531 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ 1532 cc = (cc << 4) | xc; 1533 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ 1534 c = (cc << 4) | xc; 1535 ptr += 4; 1536 if (utf) 1537 { 1538 if (c > 0x10ffffU) *errorcodeptr = ERR77; 1539 else 1540 if (c >= 0xd800 && c <= 0xdfff && 1541 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) 1542 *errorcodeptr = ERR73; 1543 } 1544 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; 1545 } 1546 break; 1547 1548 /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an 1549 upper case letter. */ 1550 1551 case CHAR_U: 1552 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; 1553 break; 1554 1555 /* In a character class, \g is just a literal "g". Outside a character 1556 class, \g must be followed by one of a number of specific things: 1557 1558 (1) A number, either plain or braced. If positive, it is an absolute 1559 backreference. If negative, it is a relative backreference. This is a Perl 1560 5.10 feature. 1561 1562 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This 1563 is part of Perl's movement towards a unified syntax for back references. As 1564 this is synonymous with \k{name}, we fudge it up by pretending it really 1565 was \k{name}. 1566 1567 (3) For Oniguruma compatibility we also support \g followed by a name or a 1568 number either in angle brackets or in single quotes. However, these are 1569 (possibly recursive) subroutine calls, _not_ backreferences. We return 1570 the ESC_g code. 
1571 1572 Summary: Return a negative number for a numerical back reference, ESC_k for 1573 a named back reference, and ESC_g for a named or numbered subroutine call. 1574 */ 1575 1576 case CHAR_g: 1577 if (isclass) break; 1578 1579 if (ptr >= ptrend) 1580 { 1581 *errorcodeptr = ERR57; 1582 break; 1583 } 1584 1585 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE) 1586 { 1587 escape = ESC_g; 1588 break; 1589 } 1590 1591 /* If there is a brace delimiter, try to read a numerical reference. If 1592 there isn't one, assume we have a name and treat it as \k. */ 1593 1594 if (*ptr == CHAR_LEFT_CURLY_BRACKET) 1595 { 1596 PCRE2_SPTR p = ptr + 1; 1597 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, 1598 errorcodeptr)) 1599 { 1600 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */ 1601 break; 1602 } 1603 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) 1604 { 1605 *errorcodeptr = ERR57; 1606 break; 1607 } 1608 ptr = p + 1; 1609 } 1610 1611 /* Read an undelimited number */ 1612 1613 else 1614 { 1615 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, 1616 errorcodeptr)) 1617 { 1618 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */ 1619 break; 1620 } 1621 } 1622 1623 if (s <= 0) 1624 { 1625 *errorcodeptr = ERR15; 1626 break; 1627 } 1628 1629 escape = -s; 1630 break; 1631 1632 /* The handling of escape sequences consisting of a string of digits 1633 starting with one that is not zero is not straightforward. Perl has changed 1634 over the years. Nowadays \g{} for backreferences and \o{} for octal are 1635 recommended to avoid the ambiguities in the old syntax. 1636 1637 Outside a character class, the digits are read as a decimal number. If the 1638 number is less than 10, or if there are that many previous extracting left 1639 brackets, it is a back reference. Otherwise, up to three octal digits are 1640 read to form an escaped character code. 
Thus \123 is likely to be octal 123 1641 (cf \0123, which is octal 012 followed by the literal 3). 1642 1643 Inside a character class, \ followed by a digit is always either a literal 1644 8 or 9 or an octal number. */ 1645 1646 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: 1647 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 1648 1649 if (!isclass) 1650 { 1651 oldptr = ptr; 1652 ptr--; /* Back to the digit */ 1653 if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s, 1654 errorcodeptr)) 1655 break; 1656 1657 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x 1658 are octal escapes if there are not that many previous captures. */ 1659 1660 if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount) 1661 { 1662 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61; 1663 else escape = -s; /* Indicates a back reference */ 1664 break; 1665 } 1666 ptr = oldptr; /* Put the pointer back and fall through */ 1667 } 1668 1669 /* Handle a digit following \ when the number is not a back reference, or 1670 we are within a character class. If the first digit is 8 or 9, Perl used to 1671 generate a binary zero and then treat the digit as a following literal. At 1672 least by Perl 5.18 this changed so as not to insert the binary zero. */ 1673 1674 if (c >= CHAR_8) break; 1675 1676 /* Fall through */ 1677 1678 /* \0 always starts an octal number, but we may drop through to here with a 1679 larger first octal digit. The original code used just to take the least 1680 significant 8 bits of octal numbers (I think this is what early Perls used 1681 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, 1682 but no more than 3 octal digits. 
*/ 1683 1684 case CHAR_0: 1685 c -= CHAR_0; 1686 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) 1687 c = c * 8 + *ptr++ - CHAR_0; 1688 #if PCRE2_CODE_UNIT_WIDTH == 8 1689 if (!utf && c > 0xff) *errorcodeptr = ERR51; 1690 #endif 1691 break; 1692 1693 /* \o is a relatively new Perl feature, supporting a more general way of 1694 specifying character codes in octal. The only supported form is \o{ddd}. */ 1695 1696 case CHAR_o: 1697 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET) 1698 { 1699 ptr--; 1700 *errorcodeptr = ERR55; 1701 } 1702 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) 1703 *errorcodeptr = ERR78; 1704 else 1705 { 1706 c = 0; 1707 overflow = FALSE; 1708 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) 1709 { 1710 cc = *ptr++; 1711 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ 1712 #if PCRE2_CODE_UNIT_WIDTH == 32 1713 if (c >= 0x20000000l) { overflow = TRUE; break; } 1714 #endif 1715 c = (c << 3) + (cc - CHAR_0); 1716 #if PCRE2_CODE_UNIT_WIDTH == 8 1717 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } 1718 #elif PCRE2_CODE_UNIT_WIDTH == 16 1719 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } 1720 #elif PCRE2_CODE_UNIT_WIDTH == 32 1721 if (utf && c > 0x10ffffU) { overflow = TRUE; break; } 1722 #endif 1723 } 1724 if (overflow) 1725 { 1726 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; 1727 *errorcodeptr = ERR34; 1728 } 1729 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) 1730 { 1731 if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL || 1732 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)) 1733 { 1734 ptr--; 1735 *errorcodeptr = ERR73; 1736 } 1737 } 1738 else 1739 { 1740 ptr--; 1741 *errorcodeptr = ERR64; 1742 } 1743 } 1744 break; 1745 1746 /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by 1747 two hexadecimal digits. Otherwise it is a lowercase x letter. 
*/ 1748 1749 case CHAR_x: 1750 if ((options & PCRE2_ALT_BSUX) != 0) 1751 { 1752 uint32_t xc; 1753 if (ptrend - ptr < 2) break; /* Less than 2 characters */ 1754 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ 1755 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ 1756 c = (cc << 4) | xc; 1757 ptr += 2; 1758 } /* End PCRE2_ALT_BSUX handling */ 1759 1760 /* Handle \x in Perl's style. \x{ddd} is a character number which can be 1761 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex 1762 digits. If not, { used to be treated as a data character. However, Perl 1763 seems to read hex digits up to the first non-such, and ignore the rest, so 1764 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE 1765 now gives an error. */ 1766 1767 else 1768 { 1769 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) 1770 { 1771 #ifndef EBCDIC 1772 COME_FROM_NU: 1773 #endif 1774 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) 1775 { 1776 *errorcodeptr = ERR78; 1777 break; 1778 } 1779 c = 0; 1780 overflow = FALSE; 1781 1782 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff) 1783 { 1784 ptr++; 1785 if (c == 0 && cc == 0) continue; /* Leading zeroes */ 1786 #if PCRE2_CODE_UNIT_WIDTH == 32 1787 if (c >= 0x10000000l) { overflow = TRUE; break; } 1788 #endif 1789 c = (c << 4) | cc; 1790 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) 1791 { 1792 overflow = TRUE; 1793 break; 1794 } 1795 } 1796 1797 if (overflow) 1798 { 1799 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++; 1800 *errorcodeptr = ERR34; 1801 } 1802 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) 1803 { 1804 if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL || 1805 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)) 1806 { 1807 ptr--; 1808 *errorcodeptr = ERR73; 1809 } 1810 } 1811 1812 /* If the sequence of hex digits does not end with '}', give an error. 
1813 We used just to recognize this construct and fall through to the normal 1814 \x handling, but nowadays Perl gives an error, which seems much more 1815 sensible, so we do too. */ 1816 1817 else 1818 { 1819 ptr--; 1820 *errorcodeptr = ERR67; 1821 } 1822 } /* End of \x{} processing */ 1823 1824 /* Read a up to two hex digits after \x */ 1825 1826 else 1827 { 1828 c = 0; 1829 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ 1830 ptr++; 1831 c = cc; 1832 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ 1833 ptr++; 1834 c = (c << 4) | cc; 1835 } /* End of \xdd handling */ 1836 } /* End of Perl-style \x handling */ 1837 break; 1838 1839 /* The handling of \c is different in ASCII and EBCDIC environments. In an 1840 ASCII (or Unicode) environment, an error is given if the character 1841 following \c is not a printable ASCII character. Otherwise, the following 1842 character is upper-cased if it is a letter, and after that the 0x40 bit is 1843 flipped. The result is the value of the escape. 1844 1845 In an EBCDIC environment the handling of \c is compatible with the 1846 specification in the perlebcdic document. The following character must be 1847 a letter or one of small number of special characters. These provide a 1848 means of defining the character values 0-31. 1849 1850 For testing the EBCDIC handling of \c in an ASCII environment, recognize 1851 the EBCDIC value of 'c' explicitly. */ 1852 1853 #if defined EBCDIC && 'a' != 0x81 1854 case 0x83: 1855 #else 1856 case CHAR_c: 1857 #endif 1858 if (ptr >= ptrend) 1859 { 1860 *errorcodeptr = ERR2; 1861 break; 1862 } 1863 c = *ptr; 1864 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); 1865 1866 /* Handle \c in an ASCII/Unicode environment. 
*/ 1867 1868 #ifndef EBCDIC /* ASCII/UTF-8 coding */ 1869 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ 1870 { 1871 *errorcodeptr = ERR68; 1872 break; 1873 } 1874 c ^= 0x40; 1875 1876 /* Handle \c in an EBCDIC environment. The special case \c? is converted to 1877 255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC 1878 encoding. (This is the way Perl indicates that it handles \c?.) The other 1879 valid sequences correspond to a list of specific characters. */ 1880 1881 #else 1882 if (c == CHAR_QUESTION_MARK) 1883 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; 1884 else 1885 { 1886 for (i = 0; i < 32; i++) 1887 { 1888 if (c == ebcdic_escape_c[i]) break; 1889 } 1890 if (i < 32) c = i; else *errorcodeptr = ERR68; 1891 } 1892 #endif /* EBCDIC */ 1893 1894 ptr++; 1895 break; 1896 1897 /* Any other alphanumeric following \ is an error. Perl gives an error only 1898 if in warning mode, but PCRE doesn't have a warning mode. */ 1899 1900 default: 1901 *errorcodeptr = ERR3; 1902 *ptrptr = ptr - 1; /* Point to the character at fault */ 1903 return 0; 1904 } 1905 } 1906 1907 /* Set the pointer to the next character before returning. */ 1908 1909 *ptrptr = ptr; 1910 *chptr = c; 1911 return escape; 1912 } 1913 1914 1915 1916 #ifdef SUPPORT_UNICODE 1917 /************************************************* 1918 * Handle \P and \p * 1919 *************************************************/ 1920 1921 /* This function is called after \P or \p has been encountered, provided that 1922 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the 1923 contents of ptrptr are pointing after the P or p. On exit, it is left pointing 1924 after the final code unit of the escape sequence. 
1925 1926 Arguments: 1927 ptrptr the pattern position pointer 1928 negptr a boolean that is set TRUE for negation else FALSE 1929 ptypeptr an unsigned int that is set to the type value 1930 pdataptr an unsigned int that is set to the detailed property value 1931 errorcodeptr the error code variable 1932 cb the compile data 1933 1934 Returns: TRUE if the type value was found, or FALSE for an invalid type 1935 */ 1936 1937 static BOOL 1938 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr, 1939 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb) 1940 { 1941 PCRE2_UCHAR c; 1942 PCRE2_SIZE i, bot, top; 1943 PCRE2_SPTR ptr = *ptrptr; 1944 PCRE2_UCHAR name[32]; 1945 1946 if (ptr >= cb->end_pattern) goto ERROR_RETURN; 1947 c = *ptr++; 1948 *negptr = FALSE; 1949 1950 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for 1951 negation. */ 1952 1953 if (c == CHAR_LEFT_CURLY_BRACKET) 1954 { 1955 if (ptr >= cb->end_pattern) goto ERROR_RETURN; 1956 if (*ptr == CHAR_CIRCUMFLEX_ACCENT) 1957 { 1958 *negptr = TRUE; 1959 ptr++; 1960 } 1961 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) 1962 { 1963 if (ptr >= cb->end_pattern) goto ERROR_RETURN; 1964 c = *ptr++; 1965 if (c == CHAR_NUL) goto ERROR_RETURN; 1966 if (c == CHAR_RIGHT_CURLY_BRACKET) break; 1967 name[i] = c; 1968 } 1969 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; 1970 name[i] = 0; 1971 } 1972 1973 /* Otherwise there is just one following character, which must be an ASCII 1974 letter. */ 1975 1976 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) 1977 { 1978 name[0] = c; 1979 name[1] = 0; 1980 } 1981 else goto ERROR_RETURN; 1982 1983 *ptrptr = ptr; 1984 1985 /* Search for a recognized property name using binary chop. 
*/ 1986 1987 bot = 0; 1988 top = PRIV(utt_size); 1989 1990 while (bot < top) 1991 { 1992 int r; 1993 i = (bot + top) >> 1; 1994 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); 1995 if (r == 0) 1996 { 1997 *ptypeptr = PRIV(utt)[i].type; 1998 *pdataptr = PRIV(utt)[i].value; 1999 return TRUE; 2000 } 2001 if (r > 0) bot = i + 1; else top = i; 2002 } 2003 *errorcodeptr = ERR47; /* Unrecognized name */ 2004 return FALSE; 2005 2006 ERROR_RETURN: /* Malformed \P or \p */ 2007 *errorcodeptr = ERR46; 2008 *ptrptr = ptr; 2009 return FALSE; 2010 } 2011 #endif 2012 2013 2014 2015 /************************************************* 2016 * Check for POSIX class syntax * 2017 *************************************************/ 2018 2019 /* This function is called when the sequence "[:" or "[." or "[=" is 2020 encountered in a character class. It checks whether this is followed by a 2021 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we 2022 reach an unescaped ']' without the special preceding character, return FALSE. 2023 2024 Originally, this function only recognized a sequence of letters between the 2025 terminators, but it seems that Perl recognizes any sequence of characters, 2026 though of course unknown POSIX names are subsequently rejected. Perl gives an 2027 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE 2028 didn't consider this to be a POSIX class. Likewise for [:1234:]. 2029 2030 The problem in trying to be exactly like Perl is in the handling of escapes. We 2031 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX 2032 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code 2033 below handles the special cases \\ and \], but does not try to do any other 2034 escape processing. This makes it different from Perl for cases such as 2035 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does 2036 not recognize "l\ower". 
This is a lesser evil than not diagnosing bad classes 2037 when Perl does, I think. 2038 2039 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. 2040 It seems that the appearance of a nested POSIX class supersedes an apparent 2041 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or 2042 a digit. This is handled by returning FALSE if the start of a new group with 2043 the same terminator is encountered, since the next closing sequence must close 2044 the nested group, not the outer one. 2045 2046 In Perl, unescaped square brackets may also appear as part of class names. For 2047 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for 2048 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not 2049 seem right at all. PCRE does not allow closing square brackets in POSIX class 2050 names. 2051 2052 Arguments: 2053 ptr pointer to the character after the initial [ (colon, dot, equals) 2054 ptrend pointer to the end of the pattern 2055 endptr where to return a pointer to the terminating ':', '.', or '=' 2056 2057 Returns: TRUE or FALSE 2058 */ 2059 2060 static BOOL 2061 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr) 2062 { 2063 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ 2064 terminator = *ptr++; /* compiler warns about "non-constant" initializer. 
*/ 2065 2066 for (; ptrend - ptr >= 2; ptr++) 2067 { 2068 if (*ptr == CHAR_BACKSLASH && 2069 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH)) 2070 ptr++; 2071 2072 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || 2073 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; 2074 2075 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) 2076 { 2077 *endptr = ptr; 2078 return TRUE; 2079 } 2080 } 2081 2082 return FALSE; 2083 } 2084 2085 2086 2087 /************************************************* 2088 * Check POSIX class name * 2089 *************************************************/ 2090 2091 /* This function is called to check the name given in a POSIX-style class entry 2092 such as [:alnum:]. 2093 2094 Arguments: 2095 ptr points to the first letter 2096 len the length of the name 2097 2098 Returns: a value representing the name, or -1 if unknown 2099 */ 2100 2101 static int 2102 check_posix_name(PCRE2_SPTR ptr, int len) 2103 { 2104 const char *pn = posix_names; 2105 int yield = 0; 2106 while (posix_name_lengths[yield] != 0) 2107 { 2108 if (len == posix_name_lengths[yield] && 2109 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; 2110 pn += posix_name_lengths[yield] + 1; 2111 yield++; 2112 } 2113 return -1; 2114 } 2115 2116 2117 2118 /************************************************* 2119 * Read a subpattern or VERB name * 2120 *************************************************/ 2121 2122 /* This function is called from parse_regex() below whenever it needs to read 2123 the name of a subpattern or a (*VERB). The initial pointer must be to the 2124 character before the name. If that character is '*' we are reading a verb name. 2125 The pointer is updated to point after the name, for a VERB, or after tha name's 2126 terminator for a subpattern name. 
Returning both the offset and the name
pointer is redundant information, but some callers use one and some the other,
so it is simplest just to return both.

Arguments:
  ptrptr        points to the character pointer variable
  ptrend        points to the end of the input string
  terminator    the terminator of a subpattern name must be this
  offsetptr     where to put the offset from the start of the pattern
  nameptr       where to put a pointer to the name in the input
  namelenptr    where to put the length of the name
  errorcodeptr  where to put an error code
  cb            pointer to the compile data block

Returns:    TRUE if a name was read
            FALSE otherwise, with error code set
*/

static BOOL
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t terminator,
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
  int *errorcodeptr, compile_block *cb)
{
PCRE2_SPTR ptr = *ptrptr;
BOOL is_verb = (*ptr == CHAR_ASTERISK);
uint32_t allowed = is_verb? ctype_letter : ctype_word;
uint32_t namelen = 0;
int failure;   /* Set before every jump to FAILED */

/* Step over the introducing character; something must follow it. */

if (++ptr >= ptrend)
  {
  if (is_verb)
    failure = ERR60;   /* Verb not recognized or malformed */
  else
    failure = ERR62;   /* Subpattern name expected */
  goto FAILED;
  }

/* The start of the name is reported to the caller at this point, before the
name itself has been checked. */

*nameptr = ptr;
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);

if (IS_DIGIT(*ptr))
  {
  failure = ERR44;     /* Group name must not start with digit */
  goto FAILED;
  }

/* Scan code units that are below 256 and have the required ctype bit set,
giving up as soon as the name exceeds the maximum length. */

while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & allowed) != 0)
  {
  ptr++;
  if (++namelen > MAX_NAME_SIZE)
    {
    failure = ERR48;   /* Name is too long */
    goto FAILED;
    }
  }

/* Subpattern names must not be empty, and their terminator is checked here.
(What follows a verb name is checked separately.) */

if (!is_verb)
  {
  if (namelen == 0)
    {
    failure = ERR62;   /* Subpattern name expected */
    goto FAILED;
    }
  if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
    {
    failure = ERR42;   /* The required terminator is not present */
    goto FAILED;
    }
  ptr++;               /* Step past the terminator */
  }

*namelenptr = namelen;
*ptrptr = ptr;
return TRUE;

/* On failure the pointer is left where the problem was detected. */

FAILED:
*errorcodeptr = failure;
*ptrptr = ptr;
return FALSE;
}



/*************************************************
*          Manage callouts at start of cycle     *
*************************************************/

/* At the start of a new item in parse_regex() we are able to record the
details of the previous item in a prior callout, and also to set up an
automatic callout if enabled. Avoid having two adjacent automatic callouts,
which would otherwise happen for items such as \Q that contribute nothing to
the parsed pattern.

Arguments:
  ptr              current pattern pointer
  pcalloutptr      points to a pointer to previous callout, or NULL
  auto_callout     TRUE if auto_callouts are enabled
  parsed_pattern   the parsed pattern pointer
  cb               compile block

Returns: possibly updated parsed_pattern pointer.
2228 */ 2229 2230 static uint32_t * 2231 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout, 2232 uint32_t *parsed_pattern, compile_block *cb) 2233 { 2234 uint32_t *previous_callout = *pcalloutptr; 2235 2236 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr - 2237 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]); 2238 2239 if (!auto_callout) previous_callout = NULL; else 2240 { 2241 if (previous_callout == NULL || 2242 previous_callout != parsed_pattern - 4 || 2243 previous_callout[3] != 255) 2244 { 2245 previous_callout = parsed_pattern; /* Set up new automatic callout */ 2246 parsed_pattern += 4; 2247 previous_callout[0] = META_CALLOUT_NUMBER; 2248 previous_callout[2] = 0; 2249 previous_callout[3] = 255; 2250 } 2251 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); 2252 } 2253 2254 *pcalloutptr = previous_callout; 2255 return parsed_pattern; 2256 } 2257 2258 2259 2260 /************************************************* 2261 * Parse regex and identify named groups * 2262 *************************************************/ 2263 2264 /* This function is called first of all. It scans the pattern and does two 2265 things: (1) It identifies capturing groups and makes a table of named capturing 2266 groups so that information about them is fully available to both the compiling 2267 scans. (2) It writes a parsed version of the pattern with comments omitted and 2268 escapes processed into the parsed_pattern vector. 2269 2270 Arguments: 2271 ptr points to the start of the pattern 2272 options compiling dynamic options (may change during the scan) 2273 has_lookbehind points to a boolean, set TRUE if a lookbehind is found 2274 cb pointer to the compile data block 2275 2276 Returns: zero on success or a non-zero error code, with the 2277 error offset placed in the cb field 2278 */ 2279 2280 /* A structure and some flags for dealing with nested groups. 
*/ 2281 2282 typedef struct nest_save { 2283 uint16_t nest_depth; 2284 uint16_t reset_group; 2285 uint16_t max_group; 2286 uint16_t flags; 2287 uint32_t options; 2288 } nest_save; 2289 2290 #define NSF_RESET 0x0001u 2291 #define NSF_CONDASSERT 0x0002u 2292 2293 /* Options that are changeable within the pattern must be tracked during 2294 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing, 2295 but all must be tracked so that META_OPTIONS items set the correct values for 2296 the main compiling phase. */ 2297 2298 #define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \ 2299 PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \ 2300 PCRE2_UNGREEDY) 2301 2302 /* States used for analyzing ranges in character classes. The two OK values 2303 must be last. */ 2304 2305 enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL }; 2306 2307 /* Only in 32-bit mode can there be literals > META_END. A macros encapsulates 2308 the storing of literal values in the parsed pattern. */ 2309 2310 #if PCRE2_CODE_UNIT_WIDTH == 32 2311 #define PARSED_LITERAL(c, p) \ 2312 { \ 2313 if (c >= META_END) *p++ = META_BIGVALUE; \ 2314 *p++ = c; \ 2315 okquantifier = TRUE; \ 2316 } 2317 #else 2318 #define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE; 2319 #endif 2320 2321 /* Here's the actual function. 
*/ 2322 2323 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind, 2324 compile_block *cb) 2325 { 2326 uint32_t c; 2327 uint32_t delimiter; 2328 uint32_t namelen; 2329 uint32_t class_range_state; 2330 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */ 2331 uint32_t *previous_callout = NULL; 2332 uint32_t *parsed_pattern = cb->parsed_pattern; 2333 uint32_t *parsed_pattern_end = cb->parsed_pattern_end; 2334 uint32_t meta_quantifier = 0; 2335 uint32_t add_after_mark = 0; 2336 uint16_t nest_depth = 0; 2337 int after_manual_callout = 0; 2338 int expect_cond_assert = 0; 2339 int errorcode = 0; 2340 int escape; 2341 int i; 2342 BOOL inescq = FALSE; 2343 BOOL inverbname = FALSE; 2344 BOOL utf = (options & PCRE2_UTF) != 0; 2345 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0; 2346 BOOL isdupname; 2347 BOOL negate_class; 2348 BOOL okquantifier = FALSE; 2349 PCRE2_SPTR thisptr; 2350 PCRE2_SPTR name; 2351 PCRE2_SPTR ptrend = cb->end_pattern; 2352 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ 2353 named_group *ng; 2354 nest_save *top_nest, *end_nests; 2355 2356 /* Insert leading items for word and line matching (features provided for the 2357 benefit of pcre2grep). */ 2358 2359 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) 2360 { 2361 *parsed_pattern++ = META_CIRCUMFLEX; 2362 *parsed_pattern++ = META_NOCAPTURE; 2363 } 2364 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) 2365 { 2366 *parsed_pattern++ = META_ESCAPE + ESC_b; 2367 *parsed_pattern++ = META_NOCAPTURE; 2368 } 2369 2370 /* If the pattern is actually a literal string, process it separately to avoid 2371 cluttering up the main loop. 
*/ 2372 2373 if ((options & PCRE2_LITERAL) != 0) 2374 { 2375 while (ptr < ptrend) 2376 { 2377 if (parsed_pattern >= parsed_pattern_end) 2378 { 2379 errorcode = ERR63; /* Internal error (parsed pattern overflow) */ 2380 goto FAILED; 2381 } 2382 thisptr = ptr; 2383 GETCHARINCTEST(c, ptr); 2384 if (auto_callout) 2385 parsed_pattern = manage_callouts(thisptr, &previous_callout, 2386 auto_callout, parsed_pattern, cb); 2387 PARSED_LITERAL(c, parsed_pattern); 2388 } 2389 goto PARSED_END; 2390 } 2391 2392 /* Process a real regex which may contain meta-characters. */ 2393 2394 top_nest = NULL; 2395 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); 2396 2397 /* The size of the nest_save structure might not be a factor of the size of the 2398 workspace. Therefore we must round down end_nests so as to correctly avoid 2399 creating a nest_save that spans the end of the workspace. */ 2400 2401 end_nests = (nest_save *)((char *)end_nests - 2402 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save))); 2403 2404 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */ 2405 2406 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED; 2407 2408 /* Now scan the pattern */ 2409 2410 while (ptr < ptrend) 2411 { 2412 int prev_expect_cond_assert; 2413 uint32_t min_repeat, max_repeat; 2414 uint32_t set, unset, *optset; 2415 uint32_t terminator; 2416 uint32_t prev_meta_quantifier; 2417 BOOL prev_okquantifier; 2418 PCRE2_SPTR tempptr; 2419 PCRE2_SIZE offset; 2420 2421 if (parsed_pattern >= parsed_pattern_end) 2422 { 2423 errorcode = ERR63; /* Internal error (parsed pattern overflow) */ 2424 goto FAILED; 2425 } 2426 2427 if (nest_depth > cb->cx->parens_nest_limit) 2428 { 2429 errorcode = ERR19; 2430 goto FAILED; /* Parentheses too deeply nested */ 2431 } 2432 2433 /* Get next input character, save its position for callout handling. 
*/ 2434 2435 thisptr = ptr; 2436 GETCHARINCTEST(c, ptr); 2437 2438 /* Copy quoted literals until \E, allowing for the possibility of automatic 2439 callouts, except when processing a (*VERB) "name". */ 2440 2441 if (inescq) 2442 { 2443 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) 2444 { 2445 inescq = FALSE; 2446 ptr++; /* Skip E */ 2447 } 2448 else 2449 { 2450 if (expect_cond_assert > 0) /* A literal is not allowed if we are */ 2451 { /* expecting a conditional assertion, */ 2452 ptr--; /* but an empty \Q\E sequence is OK. */ 2453 errorcode = ERR28; 2454 goto FAILED; 2455 } 2456 if (!inverbname && after_manual_callout-- <= 0) 2457 parsed_pattern = manage_callouts(thisptr, &previous_callout, 2458 auto_callout, parsed_pattern, cb); 2459 PARSED_LITERAL(c, parsed_pattern); 2460 meta_quantifier = 0; 2461 } 2462 continue; /* Next character */ 2463 } 2464 2465 /* If we are processing the "name" part of a (*VERB:NAME) item, all 2466 characters up to the closing parenthesis are literals except when 2467 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q 2468 and \E and escaped characters are allowed (no character types such as \d). If 2469 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do 2470 this by not entering the special (*VERB:NAME) processing - they are then 2471 picked up below. Note that c is a character, not a code unit, so we must not 2472 use MAX_255 to test its size because MAX_255 tests code units and is assumed 2473 TRUE in 8-bit mode. 
*/ 2474 2475 if (inverbname && 2476 ( 2477 /* EITHER: not both options set */ 2478 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) != 2479 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) || 2480 #ifdef SUPPORT_UNICODE 2481 /* OR: character > 255 AND not Unicode Pattern White Space */ 2482 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) || 2483 #endif 2484 /* OR: not a # comment or isspace() white space */ 2485 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0 2486 #ifdef SUPPORT_UNICODE 2487 /* and not CHAR_NEL when Unicode is supported */ 2488 && c != CHAR_NEL 2489 #endif 2490 ))) 2491 { 2492 PCRE2_SIZE verbnamelength; 2493 2494 switch(c) 2495 { 2496 default: 2497 PARSED_LITERAL(c, parsed_pattern); 2498 break; 2499 2500 case CHAR_RIGHT_PARENTHESIS: 2501 inverbname = FALSE; 2502 okquantifier = FALSE; /* Was probably set by literals */ 2503 /* This is the length in characters */ 2504 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1); 2505 /* But the limit on the length is in code units */ 2506 if (ptr - verbnamestart - 1 > (int)MAX_MARK) 2507 { 2508 ptr--; 2509 errorcode = ERR76; 2510 goto FAILED; 2511 } 2512 *verblengthptr = (uint32_t)verbnamelength; 2513 2514 /* If this name was on a verb such as (*ACCEPT) which does not continue, 2515 a (*MARK) was generated for the name. We now add the original verb as the 2516 next item. 
*/ 2517 2518 if (add_after_mark != 0) 2519 { 2520 *parsed_pattern++ = add_after_mark; 2521 add_after_mark = 0; 2522 } 2523 break; 2524 2525 case CHAR_BACKSLASH: 2526 if ((options & PCRE2_ALT_VERBNAMES) != 0) 2527 { 2528 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, 2529 FALSE, cb); 2530 if (errorcode != 0) goto FAILED; 2531 } 2532 else escape = 0; /* Treat all as literal */ 2533 2534 switch(escape) 2535 { 2536 case 0: 2537 PARSED_LITERAL(c, parsed_pattern); 2538 break; 2539 2540 case ESC_Q: 2541 inescq = TRUE; 2542 break; 2543 2544 case ESC_E: /* Ignore */ 2545 break; 2546 2547 default: 2548 errorcode = ERR40; /* Invalid in verb name */ 2549 goto FAILED; 2550 } 2551 } 2552 continue; /* Next character in pattern */ 2553 } 2554 2555 /* Not a verb name character. At this point we must process everything that 2556 must not change the quantification state. This is mainly comments, but we 2557 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as 2558 A+, as in Perl. An isolated \E is ignored. */ 2559 2560 if (c == CHAR_BACKSLASH && ptr < ptrend) 2561 { 2562 if (*ptr == CHAR_Q || *ptr == CHAR_E) 2563 { 2564 inescq = *ptr == CHAR_Q; 2565 ptr++; 2566 continue; 2567 } 2568 } 2569 2570 /* Skip over whitespace and # comments in extended mode. Note that c is a 2571 character, not a code unit, so we must not use MAX_255 to test its size 2572 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The 2573 whitespace characters are those designated as "Pattern White Space" by 2574 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is 2575 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a 2576 subset of space characters that match \h and \v. 
*/ 2577 2578 if ((options & PCRE2_EXTENDED) != 0) 2579 { 2580 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue; 2581 #ifdef SUPPORT_UNICODE 2582 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue; 2583 #endif 2584 if (c == CHAR_NUMBER_SIGN) 2585 { 2586 while (ptr < ptrend) 2587 { 2588 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ 2589 { /* IS_NEWLINE sets cb->nllen. */ 2590 ptr += cb->nllen; 2591 break; 2592 } 2593 ptr++; 2594 #ifdef SUPPORT_UNICODE 2595 if (utf) FORWARDCHARTEST(ptr, ptrend); 2596 #endif 2597 } 2598 continue; /* Next character in pattern */ 2599 } 2600 } 2601 2602 /* Skip over bracketed comments */ 2603 2604 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 && 2605 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) 2606 { 2607 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS); 2608 if (ptr >= ptrend) 2609 { 2610 errorcode = ERR18; /* A special error for missing ) in a comment */ 2611 goto FAILED; /* to make it easier to debug. */ 2612 } 2613 ptr++; 2614 continue; /* Next character in pattern */ 2615 } 2616 2617 /* If the next item is not a quantifier, fill in length of any previous 2618 callout and create an auto callout if required. */ 2619 2620 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK && 2621 (c != CHAR_LEFT_CURLY_BRACKET || 2622 (tempptr = ptr, 2623 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode)))) 2624 { 2625 if (after_manual_callout-- <= 0) 2626 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout, 2627 parsed_pattern, cb); 2628 } 2629 2630 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an 2631 assertion, possibly preceded by a callout. If the value is 1, we have just 2632 had the callout and expect an assertion. There must be at least 3 more 2633 characters in all cases. 
When expect_cond_assert is 2, we know that the 2634 current character is an opening parenthesis, as otherwise we wouldn't be 2635 here. However, when it is 1, we need to check, and it's easiest just to check 2636 always. Note that expect_cond_assert may be negative, since all callouts just 2637 decrement it. */ 2638 2639 if (expect_cond_assert > 0) 2640 { 2641 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 && 2642 ptr[0] == CHAR_QUESTION_MARK; 2643 if (ok) switch(ptr[1]) 2644 { 2645 case CHAR_C: 2646 ok = expect_cond_assert == 2; 2647 break; 2648 2649 case CHAR_EQUALS_SIGN: 2650 case CHAR_EXCLAMATION_MARK: 2651 break; 2652 2653 case CHAR_LESS_THAN_SIGN: 2654 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK; 2655 break; 2656 2657 default: 2658 ok = FALSE; 2659 } 2660 2661 if (!ok) 2662 { 2663 ptr--; /* Adjust error offset */ 2664 errorcode = ERR28; 2665 goto FAILED; 2666 } 2667 } 2668 2669 /* Remember whether we are expecting a conditional assertion, and set the 2670 default for this item. */ 2671 2672 prev_expect_cond_assert = expect_cond_assert; 2673 expect_cond_assert = 0; 2674 2675 /* Remember quantification status for the previous significant item, then set 2676 default for this item. */ 2677 2678 prev_okquantifier = okquantifier; 2679 prev_meta_quantifier = meta_quantifier; 2680 okquantifier = FALSE; 2681 meta_quantifier = 0; 2682 2683 /* If the previous significant item was a quantifier, adjust the parsed code 2684 if there is a following modifier. The base meta value is always followed by 2685 the PLUS and QUERY values, in that order. We do this here rather than after 2686 reading a quantifier so that intervening comments and /x whitespace can be 2687 ignored without having to replicate code. */ 2688 2689 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS)) 2690 { 2691 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] = 2692 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)? 
2693 0x00020000u : 0x00010000u); 2694 continue; /* Next character in pattern */ 2695 } 2696 2697 2698 /* Process the next item in the main part of a pattern. */ 2699 2700 switch(c) 2701 { 2702 default: /* Non-special character */ 2703 PARSED_LITERAL(c, parsed_pattern); 2704 break; 2705 2706 2707 /* ---- Escape sequence ---- */ 2708 2709 case CHAR_BACKSLASH: 2710 tempptr = ptr; 2711 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, 2712 FALSE, cb); 2713 if (errorcode != 0) 2714 { 2715 ESCAPE_FAILED: 2716 if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) 2717 goto FAILED; 2718 ptr = tempptr; 2719 if (ptr >= ptrend) c = CHAR_BACKSLASH; else 2720 { 2721 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ 2722 } 2723 escape = 0; /* Treat as literal character */ 2724 } 2725 2726 /* The escape was a data escape or literal character. */ 2727 2728 if (escape == 0) 2729 { 2730 PARSED_LITERAL(c, parsed_pattern); 2731 } 2732 2733 /* The escape was a back (or forward) reference. We keep the offset in 2734 order to give a more useful diagnostic for a bad forward reference. For 2735 references to groups numbered less than 10 we can't use more than two items 2736 in parsed_pattern because they may be just two characters in the input (and 2737 in a 64-bit world an offset may need two elements). So for them, the offset 2738 of the first occurrent is held in a special vector. */ 2739 2740 else if (escape < 0) 2741 { 2742 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1); 2743 escape = -escape; 2744 *parsed_pattern++ = META_BACKREF | (uint32_t)escape; 2745 if (escape < 10) 2746 { 2747 if (cb->small_ref_offset[escape] == PCRE2_UNSET) 2748 cb->small_ref_offset[escape] = offset; 2749 } 2750 else 2751 { 2752 PUTOFFSET(offset, parsed_pattern); 2753 } 2754 okquantifier = TRUE; 2755 } 2756 2757 /* The escape was a character class such as \d etc. or other special 2758 escape indicator such as \A or \X. 
Most of them generate just a single 2759 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit 2760 value. They are supported only when Unicode is available. The type and 2761 value are packed into a single 32-bit value so that the whole sequences 2762 uses only two elements in the parsed_vector. This is because the same 2763 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is 2764 set. 2765 2766 There are also some cases where the escape sequence is followed by a name: 2767 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name> 2768 and \g'name' are subroutine calls by name; \g{name} is a synonym for 2769 \k{name}. Note that \g<number> and \g'number' are handled by check_escape() 2770 and returned as a negative value (handled above). A name is coded as an 2771 offset into the pattern and a length. */ 2772 2773 else switch (escape) 2774 { 2775 case ESC_C: 2776 #ifdef NEVER_BACKSLASH_C 2777 errorcode = ERR85; 2778 goto ESCAPE_FAILED; 2779 #else 2780 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0) 2781 { 2782 errorcode = ERR83; 2783 goto ESCAPE_FAILED; 2784 } 2785 #endif 2786 okquantifier = TRUE; 2787 *parsed_pattern++ = META_ESCAPE + escape; 2788 break; 2789 2790 case ESC_X: 2791 #ifndef SUPPORT_UNICODE 2792 errorcode = ERR45; /* Supported only with Unicode support */ 2793 goto ESCAPE_FAILED; 2794 #endif 2795 case ESC_H: 2796 case ESC_h: 2797 case ESC_N: 2798 case ESC_R: 2799 case ESC_V: 2800 case ESC_v: 2801 okquantifier = TRUE; 2802 *parsed_pattern++ = META_ESCAPE + escape; 2803 break; 2804 2805 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */ 2806 *parsed_pattern++ = META_ESCAPE + escape; 2807 break; 2808 2809 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set 2810 without Unicode support because it is checked when pcre2_compile() is 2811 called. 
*/ 2812 2813 case ESC_d: 2814 case ESC_D: 2815 case ESC_s: 2816 case ESC_S: 2817 case ESC_w: 2818 case ESC_W: 2819 okquantifier = TRUE; 2820 if ((options & PCRE2_UCP) == 0) 2821 { 2822 *parsed_pattern++ = META_ESCAPE + escape; 2823 } 2824 else 2825 { 2826 *parsed_pattern++ = META_ESCAPE + 2827 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? 2828 ESC_p : ESC_P); 2829 switch(escape) 2830 { 2831 case ESC_d: 2832 case ESC_D: 2833 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; 2834 break; 2835 2836 case ESC_s: 2837 case ESC_S: 2838 *parsed_pattern++ = PT_SPACE << 16; 2839 break; 2840 2841 case ESC_w: 2842 case ESC_W: 2843 *parsed_pattern++ = PT_WORD << 16; 2844 break; 2845 } 2846 } 2847 break; 2848 2849 /* Unicode property matching */ 2850 2851 case ESC_P: 2852 case ESC_p: 2853 #ifdef SUPPORT_UNICODE 2854 { 2855 BOOL negated; 2856 uint16_t ptype = 0, pdata = 0; 2857 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) 2858 goto ESCAPE_FAILED; 2859 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; 2860 *parsed_pattern++ = META_ESCAPE + escape; 2861 *parsed_pattern++ = (ptype << 16) | pdata; 2862 okquantifier = TRUE; 2863 } 2864 #else 2865 errorcode = ERR45; 2866 goto ESCAPE_FAILED; 2867 #endif 2868 break; /* End \P and \p */ 2869 2870 /* When \g is used with quotes or angle brackets as delimiters, it is a 2871 numerical or named subroutine call, and control comes here. When used 2872 with brace delimiters it is a numberical back reference and does not come 2873 here because check_escape() returns it directly as a reference. \k is 2874 always a named back reference. */ 2875 2876 case ESC_g: 2877 case ESC_k: 2878 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET && 2879 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE)) 2880 { 2881 errorcode = (escape == ESC_g)? ERR57 : ERR69; 2882 goto ESCAPE_FAILED; 2883 } 2884 terminator = (*ptr == CHAR_LESS_THAN_SIGN)? 2885 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? 
2886 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; 2887 2888 /* For a non-braced \g, check for a numerical recursion. */ 2889 2890 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET) 2891 { 2892 PCRE2_SPTR p = ptr + 1; 2893 2894 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, 2895 &errorcode)) 2896 { 2897 if (p >= ptrend || *p != terminator) 2898 { 2899 errorcode = ERR57; 2900 goto ESCAPE_FAILED; 2901 } 2902 ptr = p; 2903 goto SET_RECURSION; 2904 } 2905 if (errorcode != 0) goto ESCAPE_FAILED; 2906 } 2907 2908 /* Not a numerical recursion */ 2909 2910 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, 2911 &errorcode, cb)) goto ESCAPE_FAILED; 2912 2913 /* \k and \g when used with braces are back references, whereas \g used 2914 with quotes or angle brackets is a recursion */ 2915 2916 *parsed_pattern++ = 2917 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)? 2918 META_BACKREF_BYNAME : META_RECURSE_BYNAME; 2919 *parsed_pattern++ = namelen; 2920 2921 PUTOFFSET(offset, parsed_pattern); 2922 okquantifier = TRUE; 2923 break; /* End special escape processing */ 2924 } 2925 break; /* End escape sequence processing */ 2926 2927 2928 /* ---- Single-character special items ---- */ 2929 2930 case CHAR_CIRCUMFLEX_ACCENT: 2931 *parsed_pattern++ = META_CIRCUMFLEX; 2932 break; 2933 2934 case CHAR_DOLLAR_SIGN: 2935 *parsed_pattern++ = META_DOLLAR; 2936 break; 2937 2938 case CHAR_DOT: 2939 *parsed_pattern++ = META_DOT; 2940 okquantifier = TRUE; 2941 break; 2942 2943 2944 /* ---- Single-character quantifiers ---- */ 2945 2946 case CHAR_ASTERISK: 2947 meta_quantifier = META_ASTERISK; 2948 goto CHECK_QUANTIFIER; 2949 2950 case CHAR_PLUS: 2951 meta_quantifier = META_PLUS; 2952 goto CHECK_QUANTIFIER; 2953 2954 case CHAR_QUESTION_MARK: 2955 meta_quantifier = META_QUERY; 2956 goto CHECK_QUANTIFIER; 2957 2958 2959 /* ---- Potential {n,m} quantifier ---- */ 2960 2961 case CHAR_LEFT_CURLY_BRACKET: 2962 if (!read_repeat_counts(&ptr, 
ptrend, &min_repeat, &max_repeat, 2963 &errorcode)) 2964 { 2965 if (errorcode != 0) goto FAILED; /* Error in quantifier. */ 2966 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */ 2967 break; /* No more quantifier processing */ 2968 } 2969 meta_quantifier = META_MINMAX; 2970 /* Fall through */ 2971 2972 2973 /* ---- Quantifier post-processing ---- */ 2974 2975 /* Check that a quantifier is allowed after the previous item. */ 2976 2977 CHECK_QUANTIFIER: 2978 if (!prev_okquantifier) 2979 { 2980 errorcode = ERR9; 2981 goto FAILED_BACK; 2982 } 2983 2984 /* Now we can put the quantifier into the parsed pattern vector. At this 2985 stage, we have only the basic quantifier. The check for a following + or ? 2986 modifier happens at the top of the loop, after any intervening comments 2987 have been removed. */ 2988 2989 *parsed_pattern++ = meta_quantifier; 2990 if (c == CHAR_LEFT_CURLY_BRACKET) 2991 { 2992 *parsed_pattern++ = min_repeat; 2993 *parsed_pattern++ = max_repeat; 2994 } 2995 break; 2996 2997 2998 /* ---- Character class ---- */ 2999 3000 case CHAR_LEFT_SQUARE_BRACKET: 3001 okquantifier = TRUE; 3002 3003 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is 3004 used for "start of word" and "end of word". As these are otherwise illegal 3005 sequences, we don't break anything by recognizing them. They are replaced 3006 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are 3007 erroneous and are handled by the normal code below. 
*/ 3008 3009 if (ptrend - ptr >= 6 && 3010 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 || 3011 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0)) 3012 { 3013 *parsed_pattern++ = META_ESCAPE + ESC_b; 3014 3015 if (ptr[2] == CHAR_LESS_THAN_SIGN) 3016 { 3017 *parsed_pattern++ = META_LOOKAHEAD; 3018 } 3019 else 3020 { 3021 *parsed_pattern++ = META_LOOKBEHIND; 3022 *has_lookbehind = TRUE; 3023 3024 /* The offset is used only for the "non-fixed length" error; this won't 3025 occur here, so just store zero. */ 3026 3027 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern); 3028 } 3029 3030 if ((options & PCRE2_UCP) == 0) 3031 *parsed_pattern++ = META_ESCAPE + ESC_w; 3032 else 3033 { 3034 *parsed_pattern++ = META_ESCAPE + ESC_p; 3035 *parsed_pattern++ = PT_WORD << 16; 3036 } 3037 *parsed_pattern++ = META_KET; 3038 ptr += 6; 3039 break; 3040 } 3041 3042 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if 3043 they are encountered at the top level, so we'll do that too. */ 3044 3045 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT || 3046 *ptr == CHAR_EQUALS_SIGN) && 3047 check_posix_syntax(ptr, ptrend, &tempptr)) 3048 { 3049 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13; 3050 goto FAILED; 3051 } 3052 3053 /* Process a regular character class. If the first character is '^', set 3054 the negation flag. If the first few characters (either before or after ^) 3055 are \Q\E or \E or space or tab in extended-more mode, we skip them too. 3056 This makes for compatibility with Perl. 
*/ 3057 3058 negate_class = FALSE; 3059 while (ptr < ptrend) 3060 { 3061 GETCHARINCTEST(c, ptr); 3062 if (c == CHAR_BACKSLASH) 3063 { 3064 if (ptr < ptrend && *ptr == CHAR_E) ptr++; 3065 else if (ptrend - ptr >= 3 && 3066 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0) 3067 ptr += 3; 3068 else 3069 break; 3070 } 3071 else if ((options & PCRE2_EXTENDED_MORE) != 0 && 3072 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */ 3073 continue; 3074 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) 3075 negate_class = TRUE; 3076 else break; 3077 } 3078 3079 /* Now the real contents of the class; c has the first "real" character. 3080 Empty classes are permitted only if the option is set. */ 3081 3082 if (c == CHAR_RIGHT_SQUARE_BRACKET && 3083 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) 3084 { 3085 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY; 3086 break; /* End of class processing */ 3087 } 3088 3089 /* Process a non-empty class. */ 3090 3091 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS; 3092 class_range_state = RANGE_NO; 3093 3094 /* In an EBCDIC environment, Perl treats alphabetic ranges specially 3095 because there are holes in the encoding, and simply using the range A-Z 3096 (for example) would include the characters in the holes. This applies only 3097 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z] 3098 in this respect. In order to accommodate this, we keep track of whether 3099 character values are literal or not, and a state variable for handling 3100 ranges. 
*/ 3101 3102 /* Loop for the contents of the class */ 3103 3104 for (;;) 3105 { 3106 BOOL char_is_literal = TRUE; 3107 3108 /* Inside \Q...\E everything is literal except \E */ 3109 3110 if (inescq) 3111 { 3112 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) 3113 { 3114 inescq = FALSE; /* Reset literal state */ 3115 ptr++; /* Skip the 'E' */ 3116 goto CLASS_CONTINUE; 3117 } 3118 goto CLASS_LITERAL; 3119 } 3120 3121 /* Skip over space and tab (only) in extended-more mode. */ 3122 3123 if ((options & PCRE2_EXTENDED_MORE) != 0 && 3124 (c == CHAR_SPACE || c == CHAR_HT)) 3125 goto CLASS_CONTINUE; 3126 3127 /* Handle POSIX class names. Perl allows a negation extension of the 3128 form [:^name:]. A square bracket that doesn't match the syntax is 3129 treated as a literal. We also recognize the POSIX constructions 3130 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 3131 5.6 and 5.8 do. */ 3132 3133 if (c == CHAR_LEFT_SQUARE_BRACKET && 3134 ptrend - ptr >= 3 && 3135 (*ptr == CHAR_COLON || *ptr == CHAR_DOT || 3136 *ptr == CHAR_EQUALS_SIGN) && 3137 check_posix_syntax(ptr, ptrend, &tempptr)) 3138 { 3139 BOOL posix_negate = FALSE; 3140 int posix_class; 3141 3142 /* Perl treats a hyphen before a POSIX class as a literal, not the 3143 start of a range. However, it gives a warning in its warning mode. PCRE 3144 does not have a warning mode, so we give an error, because this is 3145 likely an error on the user's part. 
*/ 3146 3147 if (class_range_state == RANGE_STARTED) 3148 { 3149 errorcode = ERR50; 3150 goto FAILED; 3151 } 3152 3153 if (*ptr != CHAR_COLON) 3154 { 3155 errorcode = ERR13; 3156 goto FAILED_BACK; 3157 } 3158 3159 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT) 3160 { 3161 posix_negate = TRUE; 3162 ptr++; 3163 } 3164 3165 posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); 3166 if (posix_class < 0) 3167 { 3168 errorcode = ERR30; 3169 goto FAILED; 3170 } 3171 ptr = tempptr + 2; 3172 3173 /* Perl treats a hyphen after a POSIX class as a literal, not the 3174 start of a range. However, it gives a warning in its warning mode 3175 unless the hyphen is the last character in the class. PCRE does not 3176 have a warning mode, so we give an error, because this is likely an 3177 error on the user's part. */ 3178 3179 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && 3180 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) 3181 { 3182 errorcode = ERR50; 3183 goto FAILED; 3184 } 3185 3186 /* Set "a hyphen is not the start of a range" for the -] case, and also 3187 in case the POSIX class is followed by \E or \Q\E (possibly repeated - 3188 fuzzers do that kind of thing) and *then* a hyphen. This causes that 3189 hyphen to be treated as a literal. I don't think it's worth setting up 3190 special apparatus to do otherwise. */ 3191 3192 class_range_state = RANGE_NO; 3193 3194 /* When PCRE2_UCP is set, some of the POSIX classes are converted to 3195 use Unicode properties \p or \P or, in one case, \h or \H. The 3196 substitutes table has two values per class, containing the type and 3197 value of a \p or \P item. The special cases are specified with a 3198 negative type: a non-zero value causes \h or \H to be used, and a zero 3199 value falls through to behave like a non-UCP POSIX class. 
*/ 3200 3201 #ifdef SUPPORT_UNICODE 3202 if ((options & PCRE2_UCP) != 0) 3203 { 3204 int ptype = posix_substitutes[2*posix_class]; 3205 int pvalue = posix_substitutes[2*posix_class + 1]; 3206 if (ptype >= 0) 3207 { 3208 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p); 3209 *parsed_pattern++ = (ptype << 16) | pvalue; 3210 goto CLASS_CONTINUE; 3211 } 3212 3213 if (pvalue != 0) 3214 { 3215 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h); 3216 goto CLASS_CONTINUE; 3217 } 3218 3219 /* Fall through */ 3220 } 3221 #endif /* SUPPORT_UNICODE */ 3222 3223 /* Non-UCP POSIX class */ 3224 3225 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX; 3226 *parsed_pattern++ = posix_class; 3227 } 3228 3229 /* Handle potential start of range */ 3230 3231 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED) 3232 { 3233 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)? 3234 META_RANGE_LITERAL : META_RANGE_ESCAPED; 3235 class_range_state = RANGE_STARTED; 3236 } 3237 3238 /* Handle a literal character */ 3239 3240 else if (c != CHAR_BACKSLASH) 3241 { 3242 CLASS_LITERAL: 3243 if (class_range_state == RANGE_STARTED) 3244 { 3245 if (c == parsed_pattern[-2]) /* Optimize one-char range */ 3246 parsed_pattern--; 3247 else if (parsed_pattern[-2] > c) /* Check range is in order */ 3248 { 3249 errorcode = ERR8; 3250 goto FAILED_BACK; 3251 } 3252 else 3253 { 3254 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL) 3255 parsed_pattern[-1] = META_RANGE_ESCAPED; 3256 PARSED_LITERAL(c, parsed_pattern); 3257 } 3258 class_range_state = RANGE_NO; 3259 } 3260 else /* Potential start of range */ 3261 { 3262 class_range_state = char_is_literal? 
3263 RANGE_OK_LITERAL : RANGE_OK_ESCAPED; 3264 PARSED_LITERAL(c, parsed_pattern); 3265 } 3266 } 3267 3268 /* Handle escapes in a class */ 3269 3270 else 3271 { 3272 tempptr = ptr; 3273 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, 3274 options, TRUE, cb); 3275 if (errorcode != 0) 3276 { 3277 CLASS_ESCAPE_FAILED: 3278 if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) 3279 goto FAILED; 3280 ptr = tempptr; 3281 if (ptr >= ptrend) c = CHAR_BACKSLASH; else 3282 { 3283 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ 3284 } 3285 escape = 0; /* Treat as literal character */ 3286 } 3287 3288 if (escape == 0) /* Escaped character code point is in c */ 3289 { 3290 char_is_literal = FALSE; 3291 goto CLASS_LITERAL; 3292 } 3293 3294 /* These three escapes do not alter the class range state. */ 3295 3296 if (escape == ESC_b) 3297 { 3298 c = CHAR_BS; /* \b is backspace in a class */ 3299 char_is_literal = FALSE; 3300 goto CLASS_LITERAL; 3301 } 3302 3303 else if (escape == ESC_Q) 3304 { 3305 inescq = TRUE; /* Enter literal mode */ 3306 goto CLASS_CONTINUE; 3307 } 3308 3309 else if (escape == ESC_E) /* Ignore orphan \E */ 3310 goto CLASS_CONTINUE; 3311 3312 /* The second part of a range can be a single-character escape 3313 sequence (detected above), but not any of the other escapes. Perl 3314 treats a hyphen as a literal in such circumstances. However, in Perl's 3315 warning mode, a warning is given, so PCRE now faults it, as it is 3316 almost certainly a mistake on the user's part. */ 3317 3318 if (class_range_state == RANGE_STARTED) 3319 { 3320 errorcode = ERR50; 3321 goto CLASS_ESCAPE_FAILED; 3322 } 3323 3324 /* Of the remaining escapes, only those that define characters are 3325 allowed in a class. None may start a range. 
*/ 3326 3327 class_range_state = RANGE_NO; 3328 switch(escape) 3329 { 3330 case ESC_N: 3331 errorcode = ERR71; /* Not supported in a class */ 3332 goto CLASS_ESCAPE_FAILED; 3333 3334 case ESC_H: 3335 case ESC_h: 3336 case ESC_V: 3337 case ESC_v: 3338 *parsed_pattern++ = META_ESCAPE + escape; 3339 break; 3340 3341 /* These escapes are converted to Unicode property tests when 3342 PCRE2_UCP is set. */ 3343 3344 case ESC_d: 3345 case ESC_D: 3346 case ESC_s: 3347 case ESC_S: 3348 case ESC_w: 3349 case ESC_W: 3350 if ((options & PCRE2_UCP) == 0) 3351 { 3352 *parsed_pattern++ = META_ESCAPE + escape; 3353 } 3354 else 3355 { 3356 *parsed_pattern++ = META_ESCAPE + 3357 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? 3358 ESC_p : ESC_P); 3359 switch(escape) 3360 { 3361 case ESC_d: 3362 case ESC_D: 3363 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; 3364 break; 3365 3366 case ESC_s: 3367 case ESC_S: 3368 *parsed_pattern++ = PT_SPACE << 16; 3369 break; 3370 3371 case ESC_w: 3372 case ESC_W: 3373 *parsed_pattern++ = PT_WORD << 16; 3374 break; 3375 } 3376 } 3377 break; 3378 3379 /* Explicit Unicode property matching */ 3380 3381 case ESC_P: 3382 case ESC_p: 3383 #ifdef SUPPORT_UNICODE 3384 { 3385 BOOL negated; 3386 uint16_t ptype = 0, pdata = 0; 3387 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) 3388 goto FAILED; 3389 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; 3390 *parsed_pattern++ = META_ESCAPE + escape; 3391 *parsed_pattern++ = (ptype << 16) | pdata; 3392 } 3393 #else 3394 errorcode = ERR45; 3395 goto CLASS_ESCAPE_FAILED; 3396 #endif 3397 break; /* End \P and \p */ 3398 3399 default: /* All others are not allowed in a class */ 3400 errorcode = ERR7; 3401 ptr--; 3402 goto CLASS_ESCAPE_FAILED; 3403 } 3404 3405 /* Perl gives a warning unless a following hyphen is the last character 3406 in the class. PCRE throws an error. 
*/ 3407 3408 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && 3409 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) 3410 { 3411 errorcode = ERR50; 3412 goto FAILED; 3413 } 3414 } 3415 3416 /* Proceed to next thing in the class. */ 3417 3418 CLASS_CONTINUE: 3419 if (ptr >= ptrend) 3420 { 3421 errorcode = ERR6; /* Missing terminating ']' */ 3422 goto FAILED; 3423 } 3424 GETCHARINCTEST(c, ptr); 3425 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; 3426 } /* End of class-processing loop */ 3427 3428 if (class_range_state == RANGE_STARTED) 3429 { 3430 parsed_pattern[-1] = CHAR_MINUS; 3431 class_range_state = RANGE_NO; 3432 } 3433 3434 *parsed_pattern++ = META_CLASS_END; 3435 break; /* End of character class */ 3436 3437 3438 /* ---- Opening parenthesis ---- */ 3439 3440 case CHAR_LEFT_PARENTHESIS: 3441 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; 3442 3443 /* If ( is not followed by ? it is either a capture or a special verb. */ 3444 3445 if (*ptr != CHAR_QUESTION_MARK) 3446 { 3447 const char *vn; 3448 3449 /* Handle capturing brackets (or non-capturing if auto-capture is turned 3450 off). */ 3451 3452 if (*ptr != CHAR_ASTERISK) 3453 { 3454 nest_depth++; 3455 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) 3456 { 3457 cb->bracount++; 3458 *parsed_pattern++ = META_CAPTURE | cb->bracount; 3459 } 3460 else *parsed_pattern++ = META_NOCAPTURE; 3461 } 3462 3463 3464 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */ 3465 3466 /* Do nothing for (*) so it gives a "bad quantifier" error rather than 3467 "(*MARK) must have an argument". 
*/ 3468 3469 else if (ptrend - ptr > 1 && ptr[1] != CHAR_RIGHT_PARENTHESIS) 3470 { 3471 vn = verbnames; 3472 if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode, 3473 cb)) goto FAILED; 3474 if (ptr >= ptrend || (*ptr != CHAR_COLON && 3475 *ptr != CHAR_RIGHT_PARENTHESIS)) 3476 { 3477 errorcode = ERR60; /* Malformed */ 3478 goto FAILED; 3479 } 3480 3481 /* Scan the table of verb names */ 3482 3483 for (i = 0; i < verbcount; i++) 3484 { 3485 if (namelen == verbs[i].len && 3486 PRIV(strncmp_c8)(name, vn, namelen) == 0) 3487 break; 3488 vn += verbs[i].len + 1; 3489 } 3490 3491 if (i >= verbcount) 3492 { 3493 errorcode = ERR60; /* Verb not recognized */ 3494 goto FAILED; 3495 } 3496 3497 /* An empty argument is treated as no argument. */ 3498 3499 if (*ptr == CHAR_COLON && ptr + 1 < ptrend && 3500 ptr[1] == CHAR_RIGHT_PARENTHESIS) 3501 ptr++; /* Advance to the closing parens */ 3502 3503 /* Check for mandatory non-empty argument; this is (*MARK) */ 3504 3505 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON) 3506 { 3507 errorcode = ERR66; 3508 goto FAILED; 3509 } 3510 3511 /* It appears that Perl allows any characters whatsoever, other than a 3512 closing parenthesis, to appear in arguments ("names"), so we no longer 3513 insist on letters, digits, and underscores. Perl does not, however, do 3514 any interpretation within arguments, and has no means of including a 3515 closing parenthesis. PCRE supports escape processing but only when it 3516 is requested by an option. We set inverbname TRUE here, and let the 3517 main loop take care of this so that escape and \x processing is done by 3518 the main code above. 
*/ 3519 3520 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */ 3521 { 3522 /* Some optional arguments can be treated as a preceding (*MARK) */ 3523 3524 if (verbs[i].has_arg < 0) 3525 { 3526 add_after_mark = verbs[i].meta; 3527 *parsed_pattern++ = META_MARK; 3528 } 3529 3530 /* The remaining verbs with arguments (except *MARK) need a different 3531 opcode. */ 3532 3533 else 3534 { 3535 *parsed_pattern++ = verbs[i].meta + 3536 ((verbs[i].meta != META_MARK)? 0x00010000u:0); 3537 } 3538 3539 /* Set up for reading the name in the main loop. */ 3540 3541 verblengthptr = parsed_pattern++; 3542 verbnamestart = ptr; 3543 inverbname = TRUE; 3544 } 3545 else /* No verb "name" argument */ 3546 { 3547 *parsed_pattern++ = verbs[i].meta; 3548 } 3549 } /* End of (*VERB) handling */ 3550 break; /* Done with this parenthesis */ 3551 } /* End of groups that don't start with (? */ 3552 3553 3554 /* ---- Items starting (? ---- */ 3555 3556 /* The type of item is determined by what follows (?. Handle (?| and option 3557 changes under "default" because both need a new block on the nest stack. 3558 Comments starting with (?# are handled above. Note that there is some 3559 ambiguity about the sequence (?- because if a digit follows it's a relative 3560 recursion or subroutine call whereas otherwise it's an option unsetting. */ 3561 3562 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; 3563 3564 switch(*ptr) 3565 { 3566 default: 3567 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1])) 3568 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */ 3569 3570 /* We now have either (?| or a (possibly empty) option setting, 3571 optionally followed by a non-capturing group. 
*/ 3572 3573 nest_depth++; 3574 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); 3575 else if (++top_nest >= end_nests) 3576 { 3577 errorcode = ERR84; 3578 goto FAILED; 3579 } 3580 top_nest->nest_depth = nest_depth; 3581 top_nest->flags = 0; 3582 top_nest->options = options & PARSE_TRACKED_OPTIONS; 3583 3584 /* Start of non-capturing group that resets the capture count for each 3585 branch. */ 3586 3587 if (*ptr == CHAR_VERTICAL_LINE) 3588 { 3589 top_nest->reset_group = (uint16_t)cb->bracount; 3590 top_nest->max_group = (uint16_t)cb->bracount; 3591 top_nest->flags |= NSF_RESET; 3592 cb->external_flags |= PCRE2_DUPCAPUSED; 3593 *parsed_pattern++ = META_NOCAPTURE; 3594 ptr++; 3595 } 3596 3597 /* Scan for options imnsxJU to be set or unset. */ 3598 3599 else 3600 { 3601 BOOL hyphenok = TRUE; 3602 uint32_t oldoptions = options; 3603 3604 top_nest->reset_group = 0; 3605 top_nest->max_group = 0; 3606 set = unset = 0; 3607 optset = &set; 3608 3609 /* ^ at the start unsets imnsx and disables the subsequent use of - */ 3610 3611 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT) 3612 { 3613 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| 3614 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE); 3615 hyphenok = FALSE; 3616 ptr++; 3617 } 3618 3619 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS && 3620 *ptr != CHAR_COLON) 3621 { 3622 switch (*ptr++) 3623 { 3624 case CHAR_MINUS: 3625 if (!hyphenok) 3626 { 3627 errorcode = ERR94; 3628 ptr--; /* Correct the offset */ 3629 goto FAILED; 3630 } 3631 optset = &unset; 3632 hyphenok = FALSE; 3633 break; 3634 3635 case CHAR_J: /* Record that it changed in the external options */ 3636 *optset |= PCRE2_DUPNAMES; 3637 cb->external_flags |= PCRE2_JCHANGED; 3638 break; 3639 3640 case CHAR_i: *optset |= PCRE2_CASELESS; break; 3641 case CHAR_m: *optset |= PCRE2_MULTILINE; break; 3642 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break; 3643 case CHAR_s: *optset |= PCRE2_DOTALL; break; 3644 case 
CHAR_U: *optset |= PCRE2_UNGREEDY; break; 3645 3646 /* If x appears twice it sets the extended extended option. */ 3647 3648 case CHAR_x: 3649 *optset |= PCRE2_EXTENDED; 3650 if (ptr < ptrend && *ptr == CHAR_x) 3651 { 3652 *optset |= PCRE2_EXTENDED_MORE; 3653 ptr++; 3654 } 3655 break; 3656 3657 default: 3658 errorcode = ERR11; 3659 ptr--; /* Correct the offset */ 3660 goto FAILED; 3661 } 3662 } 3663 3664 /* If we are setting extended without extended-more, ensure that any 3665 existing extended-more gets unset. Also, unsetting extended must also 3666 unset extended-more. */ 3667 3668 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED || 3669 (unset & PCRE2_EXTENDED) != 0) 3670 unset |= PCRE2_EXTENDED_MORE; 3671 3672 options = (options | set) & (~unset); 3673 3674 /* If the options ended with ')' this is not the start of a nested 3675 group with option changes, so the options change at this level. 3676 In this case, if the previous level set up a nest block, discard the 3677 one we have just created. Otherwise adjust it for the previous level. 3678 If the options ended with ':' we are starting a non-capturing group, 3679 possibly with an options setting. */ 3680 3681 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; 3682 if (*ptr++ == CHAR_RIGHT_PARENTHESIS) 3683 { 3684 nest_depth--; /* This is not a nested group after all. */ 3685 if (top_nest > (nest_save *)(cb->start_workspace) && 3686 (top_nest-1)->nest_depth == nest_depth) top_nest--; 3687 else top_nest->nest_depth = nest_depth; 3688 } 3689 else *parsed_pattern++ = META_NOCAPTURE; 3690 3691 /* If nothing changed, no need to record. */ 3692 3693 if (options != oldoptions) 3694 { 3695 *parsed_pattern++ = META_OPTIONS; 3696 *parsed_pattern++ = options; 3697 } 3698 } /* End options processing */ 3699 break; /* End default case after (? 
*/ 3700 3701 3702 /* ---- Python syntax support ---- */ 3703 3704 case CHAR_P: 3705 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; 3706 3707 /* (?P<name> is the same as (?<name>, which defines a named group. */ 3708 3709 if (*ptr == CHAR_LESS_THAN_SIGN) 3710 { 3711 terminator = CHAR_GREATER_THAN_SIGN; 3712 goto DEFINE_NAME; 3713 } 3714 3715 /* (?P>name) is the same as (?&name), which is a recursion or subroutine 3716 call. */ 3717 3718 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME; 3719 3720 /* (?P=name) is the same as \k<name>, a back reference by name. Anything 3721 else after (?P is an error. */ 3722 3723 if (*ptr != CHAR_EQUALS_SIGN) 3724 { 3725 errorcode = ERR41; 3726 goto FAILED; 3727 } 3728 if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name, 3729 &namelen, &errorcode, cb)) goto FAILED; 3730 *parsed_pattern++ = META_BACKREF_BYNAME; 3731 *parsed_pattern++ = namelen; 3732 PUTOFFSET(offset, parsed_pattern); 3733 okquantifier = TRUE; 3734 break; /* End of (?P processing */ 3735 3736 3737 /* ---- Recursion/subroutine calls by number ---- */ 3738 3739 case CHAR_R: 3740 i = 0; /* (?R) == (?R0) */ 3741 ptr++; 3742 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) 3743 { 3744 errorcode = ERR58; 3745 goto FAILED; 3746 } 3747 goto SET_RECURSION; 3748 3749 /* An item starting (?- followed by a digit comes here via the "default" 3750 case because (?- followed by a non-digit is an options setting. */ 3751 3752 case CHAR_PLUS: 3753 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1])) 3754 { 3755 errorcode = ERR29; /* Missing number */ 3756 goto FAILED; 3757 } 3758 /* Fall through */ 3759 3760 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: 3761 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 3762 RECURSION_BYNUMBER: 3763 if (!read_number(&ptr, ptrend, 3764 (IS_DIGIT(*ptr))? 
-1:(int)(cb->bracount), /* + and - are relative */ 3765 MAX_GROUP_NUMBER, ERR61, 3766 &i, &errorcode)) goto FAILED; 3767 if (i < 0) /* NB (?0) is permitted */ 3768 { 3769 errorcode = ERR15; /* Unknown group */ 3770 goto FAILED_BACK; 3771 } 3772 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) 3773 goto UNCLOSED_PARENTHESIS; 3774 3775 SET_RECURSION: 3776 *parsed_pattern++ = META_RECURSE | (uint32_t)i; 3777 offset = (PCRE2_SIZE)(ptr - cb->start_pattern); 3778 ptr++; 3779 PUTOFFSET(offset, parsed_pattern); 3780 okquantifier = TRUE; 3781 break; /* End of recursive call by number handling */ 3782 3783 3784 /* ---- Recursion/subroutine calls by name ---- */ 3785 3786 case CHAR_AMPERSAND: 3787 RECURSE_BY_NAME: 3788 if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name, 3789 &namelen, &errorcode, cb)) goto FAILED; 3790 *parsed_pattern++ = META_RECURSE_BYNAME; 3791 *parsed_pattern++ = namelen; 3792 PUTOFFSET(offset, parsed_pattern); 3793 okquantifier = TRUE; 3794 break; 3795 3796 /* ---- Callout with numerical or string argument ---- */ 3797 3798 case CHAR_C: 3799 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; 3800 3801 /* If the previous item was a condition starting (?(? an assertion, 3802 optionally preceded by a callout, is expected. This is checked later on, 3803 during actual compilation. However we need to identify this kind of 3804 assertion in this pass because it must not be qualified. The value of 3805 expect_cond_assert is set to 2 after (?(? is processed. We decrement it 3806 for a callout - still leaving a positive value that identifies the 3807 assertion. Multiple callouts or any other items will make it zero or 3808 less, which doesn't matter because they will cause an error later. */ 3809 3810 expect_cond_assert = prev_expect_cond_assert - 1; 3811 3812 /* If previous_callout is not NULL, it means this follows a previous 3813 callout. 
If it was a manual callout, do nothing; this means its "length 3814 of next pattern item" field will remain zero. If it was an automatic 3815 callout, abolish it. */ 3816 3817 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 && 3818 previous_callout == parsed_pattern - 4 && 3819 parsed_pattern[-1] == 255) 3820 parsed_pattern = previous_callout; 3821 3822 /* Save for updating next pattern item length, and skip one item before 3823 completing. */ 3824 3825 previous_callout = parsed_pattern; 3826 after_manual_callout = 1; 3827 3828 /* Handle a string argument; specific delimiter is required. */ 3829 3830 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr)) 3831 { 3832 PCRE2_SIZE calloutlength; 3833 PCRE2_SPTR startptr = ptr; 3834 3835 delimiter = 0; 3836 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) 3837 { 3838 if (*ptr == PRIV(callout_start_delims)[i]) 3839 { 3840 delimiter = PRIV(callout_end_delims)[i]; 3841 break; 3842 } 3843 } 3844 if (delimiter == 0) 3845 { 3846 errorcode = ERR82; 3847 goto FAILED; 3848 } 3849 3850 *parsed_pattern = META_CALLOUT_STRING; 3851 parsed_pattern += 3; /* Skip pattern info */ 3852 3853 for (;;) 3854 { 3855 if (++ptr >= ptrend) 3856 { 3857 errorcode = ERR81; 3858 ptr = startptr; /* To give a more useful message */ 3859 goto FAILED; 3860 } 3861 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter)) 3862 break; 3863 } 3864 3865 calloutlength = (PCRE2_SIZE)(ptr - startptr); 3866 if (calloutlength > UINT32_MAX) 3867 { 3868 errorcode = ERR72; 3869 goto FAILED; 3870 } 3871 *parsed_pattern++ = (uint32_t)calloutlength; 3872 offset = (PCRE2_SIZE)(startptr - cb->start_pattern); 3873 PUTOFFSET(offset, parsed_pattern); 3874 } 3875 3876 /* Handle a callout with an optional numerical argument, which must be 3877 less than or equal to 255. A missing argument gives 0. 
*/ 3878 3879 else 3880 { 3881 int n = 0; 3882 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */ 3883 parsed_pattern += 3; /* Skip pattern info */ 3884 while (ptr < ptrend && IS_DIGIT(*ptr)) 3885 { 3886 n = n * 10 + *ptr++ - CHAR_0; 3887 if (n > 255) 3888 { 3889 errorcode = ERR38; 3890 goto FAILED; 3891 } 3892 } 3893 *parsed_pattern++ = n; 3894 } 3895 3896 /* Both formats must have a closing parenthesis */ 3897 3898 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) 3899 { 3900 errorcode = ERR39; 3901 goto FAILED; 3902 } 3903 ptr++; 3904 3905 /* Remember the offset to the next item in the pattern, and set a default 3906 length. This should get updated after the next item is read. */ 3907 3908 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); 3909 previous_callout[2] = 0; 3910 break; /* End callout */ 3911 3912 3913 /* ---- Conditional group ---- */ 3914 3915 /* A condition can be an assertion, a number (referring to a numbered 3916 group's having been set), a name (referring to a named group), or 'R', 3917 referring to overall recursion. R<digits> and R&name are also permitted 3918 for recursion state tests. Numbers may be preceded by + or - to specify a 3919 relative group number. 3920 3921 There are several syntaxes for testing a named group: (?(name)) is used 3922 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')). 3923 3924 There are two unfortunate ambiguities. 'R' can be the recursive thing or 3925 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be 3926 the Perl DEFINE feature or the Python named test. We look for a name 3927 first; if not found, we try the other case. 3928 3929 For compatibility with auto-callouts, we allow a callout to be specified 3930 before a condition that is an assertion. */ 3931 3932 case CHAR_LEFT_PARENTHESIS: 3933 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; 3934 nest_depth++; 3935 3936 /* If the next character is ? 
there must be an assertion next (optionally 3937 preceded by a callout). We do not check this here, but instead we set 3938 expect_cond_assert to 2. If this is still greater than zero (callouts 3939 decrement it) when the next assertion is read, it will be marked as a 3940 condition that must not be repeated. A value greater than zero also 3941 causes checking that an assertion (possibly with callout) follows. */ 3942 3943 if (*ptr == CHAR_QUESTION_MARK) 3944 { 3945 *parsed_pattern++ = META_COND_ASSERT; 3946 ptr--; /* Pull pointer back to the opening parenthesis. */ 3947 expect_cond_assert = 2; 3948 break; /* End of conditional */ 3949 } 3950 3951 /* Handle (?([+-]number)... */ 3952 3953 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, 3954 &errorcode)) 3955 { 3956 if (i <= 0) 3957 { 3958 errorcode = ERR15; 3959 goto FAILED; 3960 } 3961 *parsed_pattern++ = META_COND_NUMBER; 3962 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); 3963 PUTOFFSET(offset, parsed_pattern); 3964 *parsed_pattern++ = i; 3965 } 3966 else if (errorcode != 0) goto FAILED; /* Number too big */ 3967 3968 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */ 3969 3970 else if (ptrend - ptr >= 10 && 3971 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && 3972 ptr[7] != CHAR_RIGHT_PARENTHESIS) 3973 { 3974 uint32_t ge = 0; 3975 int major = 0; 3976 int minor = 0; 3977 3978 ptr += 7; 3979 if (*ptr == CHAR_GREATER_THAN_SIGN) 3980 { 3981 ge = 1; 3982 ptr++; 3983 } 3984 3985 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT 3986 references its argument twice. 
*/ 3987 3988 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) 3989 goto BAD_VERSION_CONDITION; 3990 3991 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode)) 3992 goto FAILED; 3993 3994 if (ptr >= ptrend) goto BAD_VERSION_CONDITION; 3995 if (*ptr == CHAR_DOT) 3996 { 3997 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION; 3998 minor = (*ptr++ - CHAR_0) * 10; 3999 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0; 4000 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) 4001 goto BAD_VERSION_CONDITION; 4002 } 4003 4004 *parsed_pattern++ = META_COND_VERSION; 4005 *parsed_pattern++ = ge; 4006 *parsed_pattern++ = major; 4007 *parsed_pattern++ = minor; 4008 } 4009 4010 /* All the remaining cases now require us to read a name. We cannot at 4011 this stage distinguish ambiguous cases such as (?(R12) which might be a 4012 recursion test by number or a name, because the named groups have not yet 4013 all been identified. Those cases are treated as names, but given a 4014 different META code. */ 4015 4016 else 4017 { 4018 BOOL was_r_ampersand = FALSE; 4019 4020 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND) 4021 { 4022 terminator = CHAR_RIGHT_PARENTHESIS; 4023 was_r_ampersand = TRUE; 4024 ptr++; 4025 } 4026 else if (*ptr == CHAR_LESS_THAN_SIGN) 4027 terminator = CHAR_GREATER_THAN_SIGN; 4028 else if (*ptr == CHAR_APOSTROPHE) 4029 terminator = CHAR_APOSTROPHE; 4030 else 4031 { 4032 terminator = CHAR_RIGHT_PARENTHESIS; 4033 ptr--; /* Point to char before name */ 4034 } 4035 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, 4036 &errorcode, cb)) goto FAILED; 4037 4038 /* Handle (?(R&name) */ 4039 4040 if (was_r_ampersand) 4041 { 4042 *parsed_pattern = META_COND_RNAME; 4043 ptr--; /* Back to closing parens */ 4044 } 4045 4046 /* Handle (?(name). If the name is "DEFINE" we identify it with a 4047 special code. Likewise if the name consists of R followed only by 4048 digits. 
Otherwise, handle it like a quoted name. */ 4049 4050 else if (terminator == CHAR_RIGHT_PARENTHESIS) 4051 { 4052 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) 4053 *parsed_pattern = META_COND_DEFINE; 4054 else 4055 { 4056 for (i = 1; i < (int)namelen; i++) 4057 if (!IS_DIGIT(name[i])) break; 4058 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)? 4059 META_COND_RNUMBER : META_COND_NAME; 4060 } 4061 ptr--; /* Back to closing parens */ 4062 } 4063 4064 /* Handle (?('name') or (?(<name>) */ 4065 4066 else *parsed_pattern = META_COND_NAME; 4067 4068 /* All these cases except DEFINE end with the name length and offset; 4069 DEFINE just has an offset (for the "too many branches" error). */ 4070 4071 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen; 4072 PUTOFFSET(offset, parsed_pattern); 4073 } /* End cases that read a name */ 4074 4075 /* Check the closing parenthesis of the condition */ 4076 4077 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) 4078 { 4079 errorcode = ERR24; 4080 goto FAILED; 4081 } 4082 ptr++; 4083 break; /* End of condition processing */ 4084 4085 4086 /* ---- Atomic group ---- */ 4087 4088 case CHAR_GREATER_THAN_SIGN: 4089 *parsed_pattern++ = META_ATOMIC; 4090 nest_depth++; 4091 ptr++; 4092 break; 4093 4094 4095 /* ---- Lookahead assertions ---- */ 4096 4097 case CHAR_EQUALS_SIGN: 4098 *parsed_pattern++ = META_LOOKAHEAD; 4099 ptr++; 4100 goto POST_ASSERTION; 4101 4102 case CHAR_EXCLAMATION_MARK: 4103 *parsed_pattern++ = META_LOOKAHEADNOT; 4104 ptr++; 4105 goto POST_ASSERTION; 4106 4107 4108 /* ---- Lookbehind assertions ---- */ 4109 4110 /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the 4111 start of the name of a capturing group. 
*/ 4112 4113 case CHAR_LESS_THAN_SIGN: 4114 if (ptrend - ptr <= 1 || 4115 (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK)) 4116 { 4117 terminator = CHAR_GREATER_THAN_SIGN; 4118 goto DEFINE_NAME; 4119 } 4120 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? 4121 META_LOOKBEHIND : META_LOOKBEHINDNOT; 4122 *has_lookbehind = TRUE; 4123 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); 4124 PUTOFFSET(offset, parsed_pattern); 4125 ptr += 2; 4126 /* Fall through */ 4127 4128 /* If the previous item was a condition starting (?(? an assertion, 4129 optionally preceded by a callout, is expected. This is checked later on, 4130 during actual compilation. However we need to identify this kind of 4131 assertion in this pass because it must not be qualified. The value of 4132 expect_cond_assert is set to 2 after (?(? is processed. We decrement it 4133 for a callout - still leaving a positive value that identifies the 4134 assertion. Multiple callouts or any other items will make it zero or 4135 less, which doesn't matter because they will cause an error later. */ 4136 4137 POST_ASSERTION: 4138 nest_depth++; 4139 if (prev_expect_cond_assert > 0) 4140 { 4141 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); 4142 else if (++top_nest >= end_nests) 4143 { 4144 errorcode = ERR84; 4145 goto FAILED; 4146 } 4147 top_nest->nest_depth = nest_depth; 4148 top_nest->flags = NSF_CONDASSERT; 4149 top_nest->options = options & PARSE_TRACKED_OPTIONS; 4150 } 4151 break; 4152 4153 4154 /* ---- Define a named group ---- */ 4155 4156 /* A named group may be defined as (?'name') or (?<name>). In the latter 4157 case we jump to DEFINE_NAME from the disambiguation of (?< above with the 4158 terminator set to '>'. 
*/ 4159 4160 case CHAR_APOSTROPHE: 4161 terminator = CHAR_APOSTROPHE; /* Terminator */ 4162 4163 DEFINE_NAME: 4164 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, 4165 &errorcode, cb)) goto FAILED; 4166 4167 /* We have a name for this capturing group. It is also assigned a number, 4168 which is its primary means of identification. */ 4169 4170 cb->bracount++; 4171 *parsed_pattern++ = META_CAPTURE | cb->bracount; 4172 nest_depth++; 4173 4174 /* Check not too many names */ 4175 4176 if (cb->names_found >= MAX_NAME_COUNT) 4177 { 4178 errorcode = ERR49; 4179 goto FAILED; 4180 } 4181 4182 /* Adjust the entry size to accommodate the longest name found. */ 4183 4184 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size) 4185 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1); 4186 4187 /* Scan the list to check for duplicates. For duplicate names, if the 4188 number is the same, break the loop, which causes the name to be 4189 discarded; otherwise, if DUPNAMES is not set, give an error. 4190 If it is set, allow the name with a different number, but continue 4191 scanning in case this is a duplicate with the same number. For 4192 non-duplicate names, give an error if the number is duplicated. 
*/ 4193 4194 isdupname = FALSE; 4195 ng = cb->named_groups; 4196 for (i = 0; i < cb->names_found; i++, ng++) 4197 { 4198 if (namelen == ng->length && 4199 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0) 4200 { 4201 if (ng->number == cb->bracount) break; 4202 if ((options & PCRE2_DUPNAMES) == 0) 4203 { 4204 errorcode = ERR43; 4205 goto FAILED; 4206 } 4207 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */ 4208 cb->dupnames = TRUE; /* Duplicate names exist */ 4209 } 4210 else if (ng->number == cb->bracount) 4211 { 4212 errorcode = ERR65; 4213 goto FAILED; 4214 } 4215 } 4216 4217 if (i < cb->names_found) break; /* Ignore duplicate with same number */ 4218 4219 /* Increase the list size if necessary */ 4220 4221 if (cb->names_found >= cb->named_group_list_size) 4222 { 4223 uint32_t newsize = cb->named_group_list_size * 2; 4224 named_group *newspace = 4225 cb->cx->memctl.malloc(newsize * sizeof(named_group), 4226 cb->cx->memctl.memory_data); 4227 if (newspace == NULL) 4228 { 4229 errorcode = ERR21; 4230 goto FAILED; 4231 } 4232 4233 memcpy(newspace, cb->named_groups, 4234 cb->named_group_list_size * sizeof(named_group)); 4235 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE) 4236 cb->cx->memctl.free((void *)cb->named_groups, 4237 cb->cx->memctl.memory_data); 4238 cb->named_groups = newspace; 4239 cb->named_group_list_size = newsize; 4240 } 4241 4242 /* Add this name to the list */ 4243 4244 cb->named_groups[cb->names_found].name = name; 4245 cb->named_groups[cb->names_found].length = (uint16_t)namelen; 4246 cb->named_groups[cb->names_found].number = cb->bracount; 4247 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname; 4248 cb->names_found++; 4249 break; 4250 } /* End of (? switch */ 4251 break; /* End of ( handling */ 4252 4253 4254 /* ---- Branch terminators ---- */ 4255 4256 /* Alternation: reset the capture count if we are in a (?| group. 
*/ 4257 4258 case CHAR_VERTICAL_LINE: 4259 if (top_nest != NULL && top_nest->nest_depth == nest_depth && 4260 (top_nest->flags & NSF_RESET) != 0) 4261 { 4262 if (cb->bracount > top_nest->max_group) 4263 top_nest->max_group = (uint16_t)cb->bracount; 4264 cb->bracount = top_nest->reset_group; 4265 } 4266 *parsed_pattern++ = META_ALT; 4267 break; 4268 4269 /* End of group; reset the capture count to the maximum if we are in a (?| 4270 group and/or reset the options that are tracked during parsing. Disallow 4271 quantifier for a condition that is an assertion. */ 4272 4273 case CHAR_RIGHT_PARENTHESIS: 4274 okquantifier = TRUE; 4275 if (top_nest != NULL && top_nest->nest_depth == nest_depth) 4276 { 4277 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options; 4278 if ((top_nest->flags & NSF_RESET) != 0 && 4279 top_nest->max_group > cb->bracount) 4280 cb->bracount = top_nest->max_group; 4281 if ((top_nest->flags & NSF_CONDASSERT) != 0) 4282 okquantifier = FALSE; 4283 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; 4284 else top_nest--; 4285 } 4286 if (nest_depth == 0) /* Unmatched closing parenthesis */ 4287 { 4288 errorcode = ERR22; 4289 goto FAILED_BACK; 4290 } 4291 nest_depth--; 4292 *parsed_pattern++ = META_KET; 4293 break; 4294 } /* End of switch on pattern character */ 4295 } /* End of main character scan loop */ 4296 4297 /* End of pattern reached. Check for missing ) at the end of a verb name. */ 4298 4299 if (inverbname && ptr >= ptrend) 4300 { 4301 errorcode = ERR60; 4302 goto FAILED; 4303 } 4304 4305 /* Manage callout for the final item */ 4306 4307 PARSED_END: 4308 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout, 4309 parsed_pattern, cb); 4310 4311 /* Insert trailing items for word and line matching (features provided for the 4312 benefit of pcre2grep). 
*/ 4313 4314 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) 4315 { 4316 *parsed_pattern++ = META_KET; 4317 *parsed_pattern++ = META_DOLLAR; 4318 } 4319 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) 4320 { 4321 *parsed_pattern++ = META_KET; 4322 *parsed_pattern++ = META_ESCAPE + ESC_b; 4323 } 4324 4325 /* Terminate the parsed pattern, then return success if all groups are closed. 4326 Otherwise we have unclosed parentheses. */ 4327 4328 if (parsed_pattern >= parsed_pattern_end) 4329 { 4330 errorcode = ERR63; /* Internal error (parsed pattern overflow) */ 4331 goto FAILED; 4332 } 4333 4334 *parsed_pattern = META_END; 4335 if (nest_depth == 0) return 0; 4336 4337 UNCLOSED_PARENTHESIS: 4338 errorcode = ERR14; 4339 4340 /* Come here for all failures. */ 4341 4342 FAILED: 4343 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern); 4344 return errorcode; 4345 4346 /* Some errors need to indicate the previous character. */ 4347 4348 FAILED_BACK: 4349 ptr--; 4350 goto FAILED; 4351 4352 /* This failure happens several times. */ 4353 4354 BAD_VERSION_CONDITION: 4355 errorcode = ERR79; 4356 goto FAILED; 4357 } 4358 4359 4360 4361 /************************************************* 4362 * Find first significant opcode * 4363 *************************************************/ 4364 4365 /* This is called by several functions that scan a compiled expression looking 4366 for a fixed first character, or an anchoring opcode etc. It skips over things 4367 that do not influence this. For some calls, it makes sense to skip negative 4368 forward and all backward assertions, and also the \b assertion; for others it 4369 does not. 
4370 4371 Arguments: 4372 code pointer to the start of the group 4373 skipassert TRUE if certain assertions are to be skipped 4374 4375 Returns: pointer to the first significant opcode 4376 */ 4377 4378 static const PCRE2_UCHAR* 4379 first_significant_code(PCRE2_SPTR code, BOOL skipassert) 4380 { 4381 for (;;) 4382 { 4383 switch ((int)*code) 4384 { 4385 case OP_ASSERT_NOT: 4386 case OP_ASSERTBACK: 4387 case OP_ASSERTBACK_NOT: 4388 if (!skipassert) return code; 4389 do code += GET(code, 1); while (*code == OP_ALT); 4390 code += PRIV(OP_lengths)[*code]; 4391 break; 4392 4393 case OP_WORD_BOUNDARY: 4394 case OP_NOT_WORD_BOUNDARY: 4395 if (!skipassert) return code; 4396 /* Fall through */ 4397 4398 case OP_CALLOUT: 4399 case OP_CREF: 4400 case OP_DNCREF: 4401 case OP_RREF: 4402 case OP_DNRREF: 4403 case OP_FALSE: 4404 case OP_TRUE: 4405 code += PRIV(OP_lengths)[*code]; 4406 break; 4407 4408 case OP_CALLOUT_STR: 4409 code += GET(code, 1 + 2*LINK_SIZE); 4410 break; 4411 4412 case OP_SKIPZERO: 4413 code += 2 + GET(code, 2) + LINK_SIZE; 4414 break; 4415 4416 case OP_COND: 4417 case OP_SCOND: 4418 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */ 4419 code[GET(code, 1)] != OP_KET) /* More than one branch */ 4420 return code; 4421 code += GET(code, 1) + 1 + LINK_SIZE; 4422 break; 4423 4424 default: 4425 return code; 4426 } 4427 } 4428 /* Control never reaches here */ 4429 } 4430 4431 4432 4433 #ifdef SUPPORT_UNICODE 4434 /************************************************* 4435 * Get othercase range * 4436 *************************************************/ 4437 4438 /* This function is passed the start and end of a class range in UCP mode. It 4439 searches up the characters, looking for ranges of characters in the "other" 4440 case. Each call returns the next one, updating the start address. A character 4441 with multiple other cases is returned on its own with a special return value. 
4442 4443 Arguments: 4444 cptr points to starting character value; updated 4445 d end value 4446 ocptr where to put start of othercase range 4447 odptr where to put end of othercase range 4448 4449 Yield: -1 when no more 4450 0 when a range is returned 4451 >0 the CASESET offset for char with multiple other cases 4452 in this case, ocptr contains the original 4453 */ 4454 4455 static int 4456 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr, 4457 uint32_t *odptr) 4458 { 4459 uint32_t c, othercase, next; 4460 unsigned int co; 4461 4462 /* Find the first character that has an other case. If it has multiple other 4463 cases, return its case offset value. */ 4464 4465 for (c = *cptr; c <= d; c++) 4466 { 4467 if ((co = UCD_CASESET(c)) != 0) 4468 { 4469 *ocptr = c++; /* Character that has the set */ 4470 *cptr = c; /* Rest of input range */ 4471 return (int)co; 4472 } 4473 if ((othercase = UCD_OTHERCASE(c)) != c) break; 4474 } 4475 4476 if (c > d) return -1; /* Reached end of range */ 4477 4478 /* Found a character that has a single other case. Search for the end of the 4479 range, which is either the end of the input range, or a character that has zero 4480 or more than one other cases. */ 4481 4482 *ocptr = othercase; 4483 next = othercase + 1; 4484 4485 for (++c; c <= d; c++) 4486 { 4487 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break; 4488 next++; 4489 } 4490 4491 *odptr = next - 1; /* End of othercase range */ 4492 *cptr = c; /* Rest of input range */ 4493 return 0; 4494 } 4495 #endif /* SUPPORT_UNICODE */ 4496 4497 4498 4499 /************************************************* 4500 * Add a character or range to a class (internal) * 4501 *************************************************/ 4502 4503 /* This function packages up the logic of adding a character or range of 4504 characters to a class. The character values in the arguments will be within the 4505 valid values for the current mode (8-bit, 16-bit, UTF, etc). 
This function is 4506 called only from within the "add to class" group of functions, some of which 4507 are recursive and mutually recursive. The external entry point is 4508 add_to_class(). 4509 4510 Arguments: 4511 classbits the bit map for characters < 256 4512 uchardptr points to the pointer for extra data 4513 options the options word 4514 cb compile data 4515 start start of range character 4516 end end of range character 4517 4518 Returns: the number of < 256 characters added 4519 the pointer to extra data is updated 4520 */ 4521 4522 static unsigned int 4523 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, 4524 uint32_t options, compile_block *cb, uint32_t start, uint32_t end) 4525 { 4526 uint32_t c; 4527 uint32_t classbits_end = (end <= 0xff ? end : 0xff); 4528 unsigned int n8 = 0; 4529 4530 /* If caseless matching is required, scan the range and process alternate 4531 cases. In Unicode, there are 8-bit characters that have alternate cases that 4532 are greater than 255 and vice-versa. Sometimes we can just extend the original 4533 range. */ 4534 4535 if ((options & PCRE2_CASELESS) != 0) 4536 { 4537 #ifdef SUPPORT_UNICODE 4538 if ((options & PCRE2_UTF) != 0) 4539 { 4540 int rc; 4541 uint32_t oc, od; 4542 4543 options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ 4544 c = start; 4545 4546 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) 4547 { 4548 /* Handle a single character that has more than one other case. */ 4549 4550 if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb, 4551 PRIV(ucd_caseless_sets) + rc, oc); 4552 4553 /* Do nothing if the other case range is within the original range. */ 4554 4555 else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue; 4556 4557 /* Extend the original range if there is overlap, noting that if oc < c, we 4558 can't have od > end because a subrange is always shorter than the basic 4559 range. 
Otherwise, use a recursive call to add the additional range. */ 4560 4561 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ 4562 else if (od > end && oc <= end + 1) 4563 { 4564 end = od; /* Extend upwards */ 4565 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); 4566 } 4567 else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od); 4568 } 4569 } 4570 else 4571 #endif /* SUPPORT_UNICODE */ 4572 4573 /* Not UTF mode */ 4574 4575 for (c = start; c <= classbits_end; c++) 4576 { 4577 SETBIT(classbits, cb->fcc[c]); 4578 n8++; 4579 } 4580 } 4581 4582 /* Now handle the originally supplied range. Adjust the final value according 4583 to the bit length - this means that the same lists of (e.g.) horizontal spaces 4584 can be used in all cases. */ 4585 4586 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) 4587 end = MAX_NON_UTF_CHAR; 4588 4589 if (start > cb->class_range_start && end < cb->class_range_end) return n8; 4590 4591 /* Use the bitmap for characters < 256. Otherwise use extra data.*/ 4592 4593 for (c = start; c <= classbits_end; c++) 4594 { 4595 /* Regardless of start, c will always be <= 255. */ 4596 SETBIT(classbits, c); 4597 n8++; 4598 } 4599 4600 #ifdef SUPPORT_WIDE_CHARS 4601 if (start <= 0xff) start = 0xff + 1; 4602 4603 if (end >= start) 4604 { 4605 PCRE2_UCHAR *uchardata = *uchardptr; 4606 4607 #ifdef SUPPORT_UNICODE 4608 if ((options & PCRE2_UTF) != 0) 4609 { 4610 if (start < end) 4611 { 4612 *uchardata++ = XCL_RANGE; 4613 uchardata += PRIV(ord2utf)(start, uchardata); 4614 uchardata += PRIV(ord2utf)(end, uchardata); 4615 } 4616 else if (start == end) 4617 { 4618 *uchardata++ = XCL_SINGLE; 4619 uchardata += PRIV(ord2utf)(start, uchardata); 4620 } 4621 } 4622 else 4623 #endif /* SUPPORT_UNICODE */ 4624 4625 /* Without UTF support, character values are constrained by the bit length, 4626 and can only be > 256 for 16-bit and 32-bit libraries. 
*/ 4627 4628 #if PCRE2_CODE_UNIT_WIDTH == 8 4629 {} 4630 #else 4631 if (start < end) 4632 { 4633 *uchardata++ = XCL_RANGE; 4634 *uchardata++ = start; 4635 *uchardata++ = end; 4636 } 4637 else if (start == end) 4638 { 4639 *uchardata++ = XCL_SINGLE; 4640 *uchardata++ = start; 4641 } 4642 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ 4643 *uchardptr = uchardata; /* Updata extra data pointer */ 4644 } 4645 #else /* SUPPORT_WIDE_CHARS */ 4646 (void)uchardptr; /* Avoid compiler warning */ 4647 #endif /* SUPPORT_WIDE_CHARS */ 4648 4649 return n8; /* Number of 8-bit characters */ 4650 } 4651 4652 4653 4654 #ifdef SUPPORT_UNICODE 4655 /************************************************* 4656 * Add a list of characters to a class (internal) * 4657 *************************************************/ 4658 4659 /* This function is used for adding a list of case-equivalent characters to a 4660 class when in UTF mode. This function is called only from within 4661 add_to_class_internal(), with which it is mutually recursive. 4662 4663 Arguments: 4664 classbits the bit map for characters < 256 4665 uchardptr points to the pointer for extra data 4666 options the options word 4667 cb contains pointers to tables etc. 
4668 p points to row of 32-bit values, terminated by NOTACHAR 4669 except character to omit; this is used when adding lists of 4670 case-equivalent characters to avoid including the one we 4671 already know about 4672 4673 Returns: the number of < 256 characters added 4674 the pointer to extra data is updated 4675 */ 4676 4677 static unsigned int 4678 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, 4679 uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except) 4680 { 4681 unsigned int n8 = 0; 4682 while (p[0] < NOTACHAR) 4683 { 4684 unsigned int n = 0; 4685 if (p[0] != except) 4686 { 4687 while(p[n+1] == p[0] + n + 1) n++; 4688 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); 4689 } 4690 p += n + 1; 4691 } 4692 return n8; 4693 } 4694 #endif 4695 4696 4697 4698 /************************************************* 4699 * External entry point for add range to class * 4700 *************************************************/ 4701 4702 /* This function sets the overall range so that the internal functions can try 4703 to avoid duplication when handling case-independence. 
4704 4705 Arguments: 4706 classbits the bit map for characters < 256 4707 uchardptr points to the pointer for extra data 4708 options the options word 4709 cb compile data 4710 start start of range character 4711 end end of range character 4712 4713 Returns: the number of < 256 characters added 4714 the pointer to extra data is updated 4715 */ 4716 4717 static unsigned int 4718 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, 4719 compile_block *cb, uint32_t start, uint32_t end) 4720 { 4721 cb->class_range_start = start; 4722 cb->class_range_end = end; 4723 return add_to_class_internal(classbits, uchardptr, options, cb, start, end); 4724 } 4725 4726 4727 /************************************************* 4728 * External entry point for add list to class * 4729 *************************************************/ 4730 4731 /* This function is used for adding a list of horizontal or vertical whitespace 4732 characters to a class. The list must be in order so that ranges of characters 4733 can be detected and handled appropriately. This function sets the overall range 4734 so that the internal functions can try to avoid duplication when handling 4735 case-independence. 4736 4737 Arguments: 4738 classbits the bit map for characters < 256 4739 uchardptr points to the pointer for extra data 4740 options the options word 4741 cb contains pointers to tables etc. 
4742 p points to row of 32-bit values, terminated by NOTACHAR 4743 except character to omit; this is used when adding lists of 4744 case-equivalent characters to avoid including the one we 4745 already know about 4746 4747 Returns: the number of < 256 characters added 4748 the pointer to extra data is updated 4749 */ 4750 4751 static unsigned int 4752 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, 4753 compile_block *cb, const uint32_t *p, unsigned int except) 4754 { 4755 unsigned int n8 = 0; 4756 while (p[0] < NOTACHAR) 4757 { 4758 unsigned int n = 0; 4759 if (p[0] != except) 4760 { 4761 while(p[n+1] == p[0] + n + 1) n++; 4762 cb->class_range_start = p[0]; 4763 cb->class_range_end = p[n]; 4764 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); 4765 } 4766 p += n + 1; 4767 } 4768 return n8; 4769 } 4770 4771 4772 4773 /************************************************* 4774 * Add characters not in a list to a class * 4775 *************************************************/ 4776 4777 /* This function is used for adding the complement of a list of horizontal or 4778 vertical whitespace to a class. The list must be in order. 4779 4780 Arguments: 4781 classbits the bit map for characters < 256 4782 uchardptr points to the pointer for extra data 4783 options the options word 4784 cb contains pointers to tables etc. 
4785 p points to row of 32-bit values, terminated by NOTACHAR 4786 4787 Returns: the number of < 256 characters added 4788 the pointer to extra data is updated 4789 */ 4790 4791 static unsigned int 4792 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, 4793 uint32_t options, compile_block *cb, const uint32_t *p) 4794 { 4795 BOOL utf = (options & PCRE2_UTF) != 0; 4796 unsigned int n8 = 0; 4797 if (p[0] > 0) 4798 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); 4799 while (p[0] < NOTACHAR) 4800 { 4801 while (p[1] == p[0] + 1) p++; 4802 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, 4803 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); 4804 p++; 4805 } 4806 return n8; 4807 } 4808 4809 4810 4811 /************************************************* 4812 * Find details of duplicate group names * 4813 *************************************************/ 4814 4815 /* This is called from compile_branch() when it needs to know the index and 4816 count of duplicates in the names table when processing named backreferences, 4817 either directly, or as conditions. 
4818 4819 Arguments: 4820 name points to the name 4821 length the length of the name 4822 indexptr where to put the index 4823 countptr where to put the count of duplicates 4824 errorcodeptr where to put an error code 4825 cb the compile block 4826 4827 Returns: TRUE if OK, FALSE if not, error code set 4828 */ 4829 4830 static BOOL 4831 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr, 4832 int *countptr, int *errorcodeptr, compile_block *cb) 4833 { 4834 uint32_t i, groupnumber; 4835 int count; 4836 PCRE2_UCHAR *slot = cb->name_table; 4837 4838 /* Find the first entry in the table */ 4839 4840 for (i = 0; i < cb->names_found; i++) 4841 { 4842 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 && 4843 slot[IMM2_SIZE+length] == 0) break; 4844 slot += cb->name_entry_size; 4845 } 4846 4847 /* This should not occur, because this function is called only when we know we 4848 have duplicate names. Give an internal error. */ 4849 4850 if (i >= cb->names_found) 4851 { 4852 *errorcodeptr = ERR53; 4853 cb->erroroffset = name - cb->start_pattern; 4854 return FALSE; 4855 } 4856 4857 /* Record the index and then see how many duplicates there are, updating the 4858 backref map and maximum back reference as we do. */ 4859 4860 *indexptr = i; 4861 count = 0; 4862 4863 for (;;) 4864 { 4865 count++; 4866 groupnumber = GET2(slot,0); 4867 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; 4868 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; 4869 if (++i >= cb->names_found) break; 4870 slot += cb->name_entry_size; 4871 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 || 4872 (slot+IMM2_SIZE)[length] != 0) break; 4873 } 4874 4875 *countptr = count; 4876 return TRUE; 4877 } 4878 4879 4880 4881 /************************************************* 4882 * Compile one branch * 4883 *************************************************/ 4884 4885 /* Scan the parsed pattern, compiling it into the a vector of PCRE2_UCHAR. 
If 4886 the options are changed during the branch, the pointer is used to change the 4887 external options bits. This function is used during the pre-compile phase when 4888 we are trying to find out the amount of memory needed, as well as during the 4889 real compile phase. The value of lengthptr distinguishes the two phases. 4890 4891 Arguments: 4892 optionsptr pointer to the option bits 4893 codeptr points to the pointer to the current code point 4894 pptrptr points to the current parsed pattern pointer 4895 errorcodeptr points to error code variable 4896 firstcuptr place to put the first required code unit 4897 firstcuflagsptr place to put the first code unit flags, or a negative number 4898 reqcuptr place to put the last required code unit 4899 reqcuflagsptr place to put the last required code unit flags, or a negative number 4900 bcptr points to current branch chain 4901 cb contains pointers to tables etc. 4902 lengthptr NULL during the real compile phase 4903 points to length accumulator during pre-compile phase 4904 4905 Returns: 0 There's been an error, *errorcodeptr is non-zero 4906 +1 Success, this branch must match at least one character 4907 -1 Success, this branch may match an empty string 4908 */ 4909 4910 static int 4911 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, 4912 int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr, 4913 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr, 4914 compile_block *cb, PCRE2_SIZE *lengthptr) 4915 { 4916 int bravalue = 0; 4917 int okreturn = -1; 4918 int group_return = 0; 4919 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */ 4920 uint32_t greedy_default, greedy_non_default; 4921 uint32_t repeat_type, op_type; 4922 uint32_t options = *optionsptr; /* May change dynamically */ 4923 uint32_t firstcu, reqcu; 4924 uint32_t zeroreqcu, zerofirstcu; 4925 uint32_t escape; 4926 uint32_t *pptr = *pptrptr; 4927 uint32_t meta, meta_arg; 4928 
int32_t firstcuflags, reqcuflags; 4929 int32_t zeroreqcuflags, zerofirstcuflags; 4930 int32_t req_caseopt, reqvary, tempreqvary; 4931 PCRE2_SIZE offset = 0; 4932 PCRE2_SIZE length_prevgroup = 0; 4933 PCRE2_UCHAR *code = *codeptr; 4934 PCRE2_UCHAR *last_code = code; 4935 PCRE2_UCHAR *orig_code = code; 4936 PCRE2_UCHAR *tempcode; 4937 PCRE2_UCHAR *previous = NULL; 4938 PCRE2_UCHAR op_previous; 4939 BOOL groupsetfirstcu = FALSE; 4940 BOOL matched_char = FALSE; 4941 BOOL previous_matched_char = FALSE; 4942 const uint8_t *cbits = cb->cbits; 4943 uint8_t classbits[32]; 4944 4945 /* We can fish out the UTF setting once and for all into a BOOL, but we must 4946 not do this for other options (e.g. PCRE2_EXTENDED) because they may change 4947 dynamically as we process the pattern. */ 4948 4949 #ifdef SUPPORT_UNICODE 4950 BOOL utf = (options & PCRE2_UTF) != 0; 4951 #else /* No UTF support */ 4952 BOOL utf = FALSE; 4953 #endif 4954 4955 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define 4956 class_uchardata always so that it can be passed to add_to_class() always, 4957 though it will not be used in non-UTF 8-bit cases. This avoids having to supply 4958 alternative calls for the different cases. */ 4959 4960 PCRE2_UCHAR *class_uchardata; 4961 #ifdef SUPPORT_WIDE_CHARS 4962 BOOL xclass; 4963 PCRE2_UCHAR *class_uchardata_base; 4964 #endif 4965 4966 /* Set up the default and non-default settings for greediness */ 4967 4968 greedy_default = ((options & PCRE2_UNGREEDY) != 0); 4969 greedy_non_default = greedy_default ^ 1; 4970 4971 /* Initialize no first unit, no required unit. REQ_UNSET means "no char 4972 matching encountered yet". It gets changed to REQ_NONE if we hit something that 4973 matches a non-fixed first unit; reqcu just remains unset if we never find one. 4974 4975 When we hit a repeat whose minimum is zero, we may have to adjust these values 4976 to take the zero repeat into account. 
This is implemented by setting them to 4977 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual 4978 item types that can be repeated set these backoff variables appropriately. */ 4979 4980 firstcu = reqcu = zerofirstcu = zeroreqcu = 0; 4981 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET; 4982 4983 /* The variable req_caseopt contains either the REQ_CASELESS value or zero, 4984 according to the current setting of the caseless flag. The REQ_CASELESS value 4985 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables 4986 to record the case status of the value. This is used only for ASCII characters. 4987 */ 4988 4989 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0; 4990 4991 /* Switch on next META item until the end of the branch */ 4992 4993 for (;; pptr++) 4994 { 4995 #ifdef SUPPORT_WIDE_CHARS 4996 BOOL xclass_has_prop; 4997 #endif 4998 BOOL negate_class; 4999 BOOL should_flip_negation; 5000 BOOL match_all_or_no_wide_chars; 5001 BOOL possessive_quantifier; 5002 BOOL note_group_empty; 5003 int class_has_8bitchar; 5004 int i; 5005 uint32_t mclength; 5006 uint32_t skipunits; 5007 uint32_t subreqcu, subfirstcu; 5008 uint32_t groupnumber; 5009 uint32_t verbarglen, verbculen; 5010 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */ 5011 open_capitem *oc; 5012 PCRE2_UCHAR mcbuffer[8]; 5013 5014 /* Get next META item in the pattern and its potential argument. */ 5015 5016 meta = META_CODE(*pptr); 5017 meta_arg = META_DATA(*pptr); 5018 5019 /* If we are in the pre-compile phase, accumulate the length used for the 5020 previous cycle of this loop, unless the next item is a quantifier. */ 5021 5022 if (lengthptr != NULL) 5023 { 5024 if (code > cb->start_workspace + cb->workspace_size - 5025 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ 5026 { 5027 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)? 
5028 ERR52 : ERR86; 5029 return 0; 5030 } 5031 5032 /* There is at least one situation where code goes backwards: this is the 5033 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier 5034 is processed, the whole class is eliminated. However, it is created first, 5035 so we have to allow memory for it. Therefore, don't ever reduce the length 5036 at this point. */ 5037 5038 if (code < last_code) code = last_code; 5039 5040 /* If the next thing is not a quantifier, we add the length of the previous 5041 item into the total, and reset the code pointer to the start of the 5042 workspace. Otherwise leave the previous item available to be quantified. */ 5043 5044 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) 5045 { 5046 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code)) 5047 { 5048 *errorcodeptr = ERR20; /* Integer overflow */ 5049 return 0; 5050 } 5051 *lengthptr += (PCRE2_SIZE)(code - orig_code); 5052 if (*lengthptr > MAX_PATTERN_SIZE) 5053 { 5054 *errorcodeptr = ERR20; /* Pattern is too large */ 5055 return 0; 5056 } 5057 code = orig_code; 5058 } 5059 5060 /* Remember where this code item starts so we can catch the "backwards" 5061 case above next time round. */ 5062 5063 last_code = code; 5064 } 5065 5066 /* Process the next parsed pattern item. If it is not a quantifier, remember 5067 where it starts so that it can be quantified when a quantifier follows. 5068 Checking for the legality of quantifiers happens in parse_regex(), except for 5069 a quantifier after an assertion that is a condition. 
*/ 5070 5071 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) 5072 { 5073 previous = code; 5074 if (matched_char) okreturn = 1; 5075 } 5076 5077 previous_matched_char = matched_char; 5078 matched_char = FALSE; 5079 note_group_empty = FALSE; 5080 skipunits = 0; /* Default value for most subgroups */ 5081 5082 switch(meta) 5083 { 5084 /* ===================================================================*/ 5085 /* The branch terminates at pattern end or | or ) */ 5086 5087 case META_END: 5088 case META_ALT: 5089 case META_KET: 5090 *firstcuptr = firstcu; 5091 *firstcuflagsptr = firstcuflags; 5092 *reqcuptr = reqcu; 5093 *reqcuflagsptr = reqcuflags; 5094 *codeptr = code; 5095 *pptrptr = pptr; 5096 return okreturn; 5097 5098 5099 /* ===================================================================*/ 5100 /* Handle single-character metacharacters. In multiline mode, ^ disables 5101 the setting of any following char as a first character. */ 5102 5103 case META_CIRCUMFLEX: 5104 if ((options & PCRE2_MULTILINE) != 0) 5105 { 5106 if (firstcuflags == REQ_UNSET) 5107 zerofirstcuflags = firstcuflags = REQ_NONE; 5108 *code++ = OP_CIRCM; 5109 } 5110 else *code++ = OP_CIRC; 5111 break; 5112 5113 case META_DOLLAR: 5114 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL; 5115 break; 5116 5117 /* There can never be a first char if '.' is first, whatever happens about 5118 repeats. The value of reqcu doesn't change either. */ 5119 5120 case META_DOT: 5121 matched_char = TRUE; 5122 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 5123 zerofirstcu = firstcu; 5124 zerofirstcuflags = firstcuflags; 5125 zeroreqcu = reqcu; 5126 zeroreqcuflags = reqcuflags; 5127 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; 5128 break; 5129 5130 5131 /* ===================================================================*/ 5132 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. 5133 Otherwise, an initial ']' is taken as a data character. 
When empty classes 5134 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must 5135 match any character, so generate OP_ALLANY. */ 5136 5137 case META_CLASS_EMPTY: 5138 case META_CLASS_EMPTY_NOT: 5139 matched_char = TRUE; 5140 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL; 5141 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 5142 zerofirstcu = firstcu; 5143 zerofirstcuflags = firstcuflags; 5144 break; 5145 5146 5147 /* ===================================================================*/ 5148 /* Non-empty character class. If the included characters are all < 256, we 5149 build a 32-byte bitmap of the permitted characters, except in the special 5150 case where there is only one such character. For negated classes, we build 5151 the map as usual, then invert it at the end. However, we use a different 5152 opcode so that data characters > 255 can be handled correctly. 5153 5154 If the class contains characters outside the 0-255 range, a different 5155 opcode is compiled. It may optionally have a bit map for characters < 256, 5156 but those above are are explicitly listed afterwards. A flag code unit 5157 tells whether the bitmap is present, and whether this is a negated class or 5158 not. */ 5159 5160 case META_CLASS_NOT: 5161 case META_CLASS: 5162 matched_char = TRUE; 5163 negate_class = meta == META_CLASS_NOT; 5164 5165 /* We can optimize the case of a single character in a class by generating 5166 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's 5167 negative. In the negative case there can be no first char if this item is 5168 first, whatever repeat count may follow. In the case of reqcu, save the 5169 previous value for reinstating. */ 5170 5171 /* NOTE: at present this optimization is not effective if the only 5172 character in a class in 32-bit, non-UCP mode has its top bit set. 
*/ 5173 5174 if (pptr[1] < META_END && pptr[2] == META_CLASS_END) 5175 { 5176 #ifdef SUPPORT_UNICODE 5177 uint32_t d; 5178 #endif 5179 uint32_t c = pptr[1]; 5180 5181 pptr += 2; /* Move on to class end */ 5182 if (meta == META_CLASS) /* A positive one-char class can be */ 5183 { /* handled as a normal literal character. */ 5184 meta = c; /* Set up the character */ 5185 goto NORMAL_CHAR_SET; 5186 } 5187 5188 /* Handle a negative one-character class */ 5189 5190 zeroreqcu = reqcu; 5191 zeroreqcuflags = reqcuflags; 5192 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 5193 zerofirstcu = firstcu; 5194 zerofirstcuflags = firstcuflags; 5195 5196 /* For caseless UTF mode, check whether this character has more than 5197 one other case. If so, generate a special OP_NOTPROP item instead of 5198 OP_NOTI. */ 5199 5200 #ifdef SUPPORT_UNICODE 5201 if (utf && (options & PCRE2_CASELESS) != 0 && 5202 (d = UCD_CASESET(c)) != 0) 5203 { 5204 *code++ = OP_NOTPROP; 5205 *code++ = PT_CLIST; 5206 *code++ = d; 5207 break; /* We are finished with this class */ 5208 } 5209 #endif 5210 /* Char has only one other case, or UCP not available */ 5211 5212 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT; 5213 code += PUTCHAR(c, code); 5214 break; /* We are finished with this class */ 5215 } /* End of 1-char optimization */ 5216 5217 /* Handle character classes that contain more than just one literal 5218 character. */ 5219 5220 /* If a non-extended class contains a negative special such as \S, we need 5221 to flip the negation flag at the end, so that support for characters > 255 5222 works correctly (they are all included in the class). An extended class may 5223 need to insert specific matching or non-matching code for wide characters. 5224 */ 5225 5226 should_flip_negation = match_all_or_no_wide_chars = FALSE; 5227 5228 /* Extended class (xclass) will be used when characters > 255 5229 might match. 
*/ 5230 5231 #ifdef SUPPORT_WIDE_CHARS 5232 xclass = FALSE; 5233 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ 5234 class_uchardata_base = class_uchardata; /* Save the start */ 5235 #endif 5236 5237 /* For optimization purposes, we track some properties of the class: 5238 class_has_8bitchar will be non-zero if the class contains at least one 5239 character with a code point less than 256; xclass_has_prop will be TRUE if 5240 Unicode property checks are present in the class. */ 5241 5242 class_has_8bitchar = 0; 5243 #ifdef SUPPORT_WIDE_CHARS 5244 xclass_has_prop = FALSE; 5245 #endif 5246 5247 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map 5248 in a temporary bit of memory, in case the class contains fewer than two 5249 8-bit characters because in that case the compiled code doesn't use the bit 5250 map. */ 5251 5252 memset(classbits, 0, 32 * sizeof(uint8_t)); 5253 5254 /* Process items until META_CLASS_END is reached. */ 5255 5256 while ((meta = *(++pptr)) != META_CLASS_END) 5257 { 5258 /* Handle POSIX classes such as [:alpha:] etc. */ 5259 5260 if (meta == META_POSIX || meta == META_POSIX_NEG) 5261 { 5262 BOOL local_negate = (meta == META_POSIX_NEG); 5263 int posix_class = *(++pptr); 5264 int taboffset, tabopt; 5265 uint8_t pbits[32]; 5266 5267 should_flip_negation = local_negate; /* Note negative special */ 5268 5269 /* If matching is caseless, upper and lower are converted to alpha. 5270 This relies on the fact that the class table starts with alpha, 5271 lower, upper as the first 3 entries. */ 5272 5273 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) 5274 posix_class = 0; 5275 5276 /* When PCRE2_UCP is set, some of the POSIX classes are converted to 5277 different escape sequences that use Unicode properties \p or \P. 5278 Others that are not available via \p or \P have to generate 5279 XCL_PROP/XCL_NOTPROP directly, which is done here. 
*/ 5280 5281 #ifdef SUPPORT_UNICODE 5282 if ((options & PCRE2_UCP) != 0) switch(posix_class) 5283 { 5284 case PC_GRAPH: 5285 case PC_PRINT: 5286 case PC_PUNCT: 5287 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; 5288 *class_uchardata++ = (PCRE2_UCHAR) 5289 ((posix_class == PC_GRAPH)? PT_PXGRAPH : 5290 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); 5291 *class_uchardata++ = 0; 5292 xclass_has_prop = TRUE; 5293 goto CONTINUE_CLASS; 5294 5295 /* For the other POSIX classes (ascii, xdigit) we are going to 5296 fall through to the non-UCP case and build a bit map for 5297 characters with code points less than 256. However, if we are in 5298 a negated POSIX class, characters with code points greater than 5299 255 must either all match or all not match, depending on whether 5300 the whole class is not or is negated. For example, for 5301 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]... 5302 they must not. 5303 5304 In the special case where there are no xclass items, this is 5305 automatically handled by the use of OP_CLASS or OP_NCLASS, but an 5306 explicit range is needed for OP_XCLASS. Setting a flag here 5307 causes the range to be generated later when it is known that 5308 OP_XCLASS is required. In the 8-bit library this is relevant only in 5309 utf mode, since no wide characters can exist otherwise. */ 5310 5311 default: 5312 #if PCRE2_CODE_UNIT_WIDTH == 8 5313 if (utf) 5314 #endif 5315 match_all_or_no_wide_chars |= local_negate; 5316 break; 5317 } 5318 #endif /* SUPPORT_UNICODE */ 5319 5320 /* In the non-UCP case, or when UCP makes no difference, we build the 5321 bit map for the POSIX class in a chunk of local store because we may 5322 be adding and subtracting from it, and we don't want to subtract bits 5323 that may be in the main map already. At the end we or the result into 5324 the bit map that is being built. 
*/ 5325 5326 posix_class *= 3; 5327 5328 /* Copy in the first table (always present) */ 5329 5330 memcpy(pbits, cbits + posix_class_maps[posix_class], 5331 32 * sizeof(uint8_t)); 5332 5333 /* If there is a second table, add or remove it as required. */ 5334 5335 taboffset = posix_class_maps[posix_class + 1]; 5336 tabopt = posix_class_maps[posix_class + 2]; 5337 5338 if (taboffset >= 0) 5339 { 5340 if (tabopt >= 0) 5341 for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset]; 5342 else 5343 for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset]; 5344 } 5345 5346 /* Now see if we need to remove any special characters. An option 5347 value of 1 removes vertical space and 2 removes underscore. */ 5348 5349 if (tabopt < 0) tabopt = -tabopt; 5350 if (tabopt == 1) pbits[1] &= ~0x3c; 5351 else if (tabopt == 2) pbits[11] &= 0x7f; 5352 5353 /* Add the POSIX table or its complement into the main table that is 5354 being built and we are done. */ 5355 5356 if (local_negate) 5357 for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i]; 5358 else 5359 for (i = 0; i < 32; i++) classbits[i] |= pbits[i]; 5360 5361 /* Every class contains at least one < 256 character. */ 5362 5363 class_has_8bitchar = 1; 5364 goto CONTINUE_CLASS; /* End of POSIX handling */ 5365 } 5366 5367 /* Other than POSIX classes, the only items we should encounter are 5368 \d-type escapes and literal characters (possibly as ranges). */ 5369 5370 if (meta == META_BIGVALUE) 5371 { 5372 meta = *(++pptr); 5373 goto CLASS_LITERAL; 5374 } 5375 5376 /* Any other non-literal must be an escape */ 5377 5378 if (meta >= META_END) 5379 { 5380 if (META_CODE(meta) != META_ESCAPE) 5381 { 5382 #ifdef DEBUG_SHOW_PARSED 5383 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x " 5384 "in character class\n", meta); 5385 #endif 5386 *errorcodeptr = ERR89; /* Internal error - unrecognized. */ 5387 return 0; 5388 } 5389 escape = META_DATA(meta); 5390 5391 /* Every class contains at least one < 256 character. 
*/ 5392 5393 class_has_8bitchar++; 5394 5395 switch(escape) 5396 { 5397 case ESC_d: 5398 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; 5399 break; 5400 5401 case ESC_D: 5402 should_flip_negation = TRUE; 5403 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit]; 5404 break; 5405 5406 case ESC_w: 5407 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; 5408 break; 5409 5410 case ESC_W: 5411 should_flip_negation = TRUE; 5412 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word]; 5413 break; 5414 5415 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl 5416 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was 5417 previously set by something earlier in the character class. 5418 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so 5419 we could just adjust the appropriate bit. From PCRE 8.34 we no 5420 longer treat \s and \S specially. */ 5421 5422 case ESC_s: 5423 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; 5424 break; 5425 5426 case ESC_S: 5427 should_flip_negation = TRUE; 5428 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space]; 5429 break; 5430 5431 /* When adding the horizontal or vertical space lists to a class, or 5432 their complements, disable PCRE2_CASELESS, because it justs wastes 5433 time, and in the "not-x" UTF cases can create unwanted duplicates in 5434 the XCLASS list (provoked by characters that have more than one other 5435 case and by both cases being in the same "not-x" sublist). 
*/ 5436 5437 case ESC_h: 5438 (void)add_list_to_class(classbits, &class_uchardata, 5439 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR); 5440 break; 5441 5442 case ESC_H: 5443 (void)add_not_list_to_class(classbits, &class_uchardata, 5444 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); 5445 break; 5446 5447 case ESC_v: 5448 (void)add_list_to_class(classbits, &class_uchardata, 5449 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR); 5450 break; 5451 5452 case ESC_V: 5453 (void)add_not_list_to_class(classbits, &class_uchardata, 5454 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); 5455 break; 5456 5457 /* If Unicode is not supported, \P and \p are not allowed and are 5458 faulted at parse time, so will never appear here. */ 5459 5460 #ifdef SUPPORT_UNICODE 5461 case ESC_p: 5462 case ESC_P: 5463 { 5464 uint32_t ptype = *(++pptr) >> 16; 5465 uint32_t pdata = *pptr & 0xffff; 5466 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; 5467 *class_uchardata++ = ptype; 5468 *class_uchardata++ = pdata; 5469 xclass_has_prop = TRUE; 5470 class_has_8bitchar--; /* Undo! */ 5471 } 5472 break; 5473 #endif 5474 } 5475 5476 goto CONTINUE_CLASS; 5477 } /* End handling \d-type escapes */ 5478 5479 /* A literal character may be followed by a range meta. At parse time 5480 there are checks for out-of-order characters, for ranges where the two 5481 characters are equal, and for hyphens that cannot indicate a range. At 5482 this point, therefore, no checking is needed. 
*/ 5483 5484 else 5485 { 5486 uint32_t c, d; 5487 5488 CLASS_LITERAL: 5489 c = d = meta; 5490 5491 /* Remember if \r or \n were explicitly used */ 5492 5493 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; 5494 5495 /* Process a character range */ 5496 5497 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED) 5498 { 5499 #ifdef EBCDIC 5500 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL); 5501 #endif 5502 pptr += 2; 5503 d = *pptr; 5504 if (d == META_BIGVALUE) d = *(++pptr); 5505 5506 /* Remember an explicit \r or \n, and add the range to the class. */ 5507 5508 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; 5509 5510 /* In an EBCDIC environment, Perl treats alphabetic ranges specially 5511 because there are holes in the encoding, and simply using the range 5512 A-Z (for example) would include the characters in the holes. This 5513 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ 5514 5515 #ifdef EBCDIC 5516 if (range_is_literal && 5517 (cb->ctypes[c] & ctype_letter) != 0 && 5518 (cb->ctypes[d] & ctype_letter) != 0 && 5519 (d <= CHAR_z) == (d <= CHAR_z)) 5520 { 5521 uint32_t uc = (d <= CHAR_z)? 0 : 64; 5522 uint32_t C = d - uc; 5523 uint32_t D = d - uc; 5524 5525 if (C <= CHAR_i) 5526 { 5527 class_has_8bitchar += 5528 add_to_class(classbits, &class_uchardata, options, cb, C + uc, 5529 ((D < CHAR_i)? D : CHAR_i) + uc); 5530 C = CHAR_j; 5531 } 5532 5533 if (C <= D && C <= CHAR_r) 5534 { 5535 class_has_8bitchar += 5536 add_to_class(classbits, &class_uchardata, options, cb, C + uc, 5537 ((D < CHAR_r)? 
D : CHAR_r) + uc); 5538 C = CHAR_s; 5539 } 5540 5541 if (C <= D) 5542 { 5543 class_has_8bitchar += 5544 add_to_class(classbits, &class_uchardata, options, cb, C + uc, 5545 D + uc); 5546 } 5547 } 5548 else 5549 #endif 5550 /* Not an EBCDIC special range */ 5551 5552 class_has_8bitchar += 5553 add_to_class(classbits, &class_uchardata, options, cb, c, d); 5554 goto CONTINUE_CLASS; /* Go get the next char in the class */ 5555 } /* End of range handling */ 5556 5557 5558 /* Handle a single character. */ 5559 5560 class_has_8bitchar += 5561 add_to_class(classbits, &class_uchardata, options, cb, meta, meta); 5562 } 5563 5564 /* Continue to the next item in the class. */ 5565 5566 CONTINUE_CLASS: 5567 5568 #ifdef SUPPORT_WIDE_CHARS 5569 /* If any wide characters or Unicode properties have been encountered, 5570 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length 5571 of the extra data and reset the pointer. This is so that very large 5572 classes that contain a zillion wide characters or Unicode property tests 5573 do not overwrite the workspace (which is on the stack). */ 5574 5575 if (class_uchardata > class_uchardata_base) 5576 { 5577 xclass = TRUE; 5578 if (lengthptr != NULL) 5579 { 5580 *lengthptr += class_uchardata - class_uchardata_base; 5581 class_uchardata = class_uchardata_base; 5582 } 5583 } 5584 #endif 5585 5586 continue; /* Needed to avoid error when not supporting wide chars */ 5587 } /* End of main class-processing loop */ 5588 5589 /* If this class is the first thing in the branch, there can be no first 5590 char setting, whatever the repeat count. Any reqcu setting must remain 5591 unchanged after any kind of repeat. 
*/ 5592 5593 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 5594 zerofirstcu = firstcu; 5595 zerofirstcuflags = firstcuflags; 5596 zeroreqcu = reqcu; 5597 zeroreqcuflags = reqcuflags; 5598 5599 /* If there are characters with values > 255, or Unicode property settings 5600 (\p or \P), we have to compile an extended class, with its own opcode, 5601 unless there were no property settings and there was a negated special such 5602 as \S in the class, and PCRE2_UCP is not set, because in that case all 5603 characters > 255 are in or not in the class, so any that were explicitly 5604 given as well can be ignored. 5605 5606 In the UCP case, if certain negated POSIX classes ([:^ascii:] or 5607 [^:xdigit:]) were present in a class, we either have to match or not match 5608 all wide characters (depending on whether the whole class is or is not 5609 negated). This requirement is indicated by match_all_or_no_wide_chars being 5610 true. We do this by including an explicit range, which works in both cases. 5611 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there 5612 cannot be any wide characters in 8-bit non-UTF mode. 5613 5614 When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit 5615 class where \S etc is present without PCRE2_UCP, causing an extended class 5616 to be compiled, we make sure that all characters > 255 are included by 5617 forcing match_all_or_no_wide_chars to be true. 5618 5619 If, when generating an xclass, there are no characters < 256, we can omit 5620 the bitmap in the actual compiled code. 
*/ 5621 5622 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ 5623 if (xclass && ( 5624 #ifdef SUPPORT_UNICODE 5625 (options & PCRE2_UCP) != 0 || 5626 #endif 5627 xclass_has_prop || !should_flip_negation)) 5628 { 5629 if (match_all_or_no_wide_chars || ( 5630 #if PCRE2_CODE_UNIT_WIDTH == 8 5631 utf && 5632 #endif 5633 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0)) 5634 { 5635 *class_uchardata++ = XCL_RANGE; 5636 if (utf) /* Will always be utf in the 8-bit library */ 5637 { 5638 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); 5639 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); 5640 } 5641 else /* Can only happen for the 16-bit & 32-bit libraries */ 5642 { 5643 #if PCRE2_CODE_UNIT_WIDTH == 16 5644 *class_uchardata++ = 0x100; 5645 *class_uchardata++ = 0xffffu; 5646 #elif PCRE2_CODE_UNIT_WIDTH == 32 5647 *class_uchardata++ = 0x100; 5648 *class_uchardata++ = 0xffffffffu; 5649 #endif 5650 } 5651 } 5652 *class_uchardata++ = XCL_END; /* Marks the end of extra data */ 5653 *code++ = OP_XCLASS; 5654 code += LINK_SIZE; 5655 *code = negate_class? XCL_NOT:0; 5656 if (xclass_has_prop) *code |= XCL_HASPROP; 5657 5658 /* If the map is required, move up the extra data to make room for it; 5659 otherwise just move the code pointer to the end of the extra data. 
*/ 5660 5661 if (class_has_8bitchar > 0) 5662 { 5663 *code++ |= XCL_MAP; 5664 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, 5665 CU2BYTES(class_uchardata - code)); 5666 if (negate_class && !xclass_has_prop) 5667 for (i = 0; i < 32; i++) classbits[i] = ~classbits[i]; 5668 memcpy(code, classbits, 32); 5669 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); 5670 } 5671 else code = class_uchardata; 5672 5673 /* Now fill in the complete length of the item */ 5674 5675 PUT(previous, 1, (int)(code - previous)); 5676 break; /* End of class handling */ 5677 } 5678 #endif /* SUPPORT_WIDE_CHARS */ 5679 5680 /* If there are no characters > 255, or they are all to be included or 5681 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the 5682 whole class was negated and whether there were negative specials such as \S 5683 (non-UCP) in the class. Then copy the 32-byte map into the code vector, 5684 negating it if necessary. */ 5685 5686 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; 5687 if (lengthptr == NULL) /* Save time in the pre-compile phase */ 5688 { 5689 if (negate_class) 5690 for (i = 0; i < 32; i++) classbits[i] = ~classbits[i]; 5691 memcpy(code, classbits, 32); 5692 } 5693 code += 32 / sizeof(PCRE2_UCHAR); 5694 break; /* End of class processing */ 5695 5696 5697 /* ===================================================================*/ 5698 /* Deal with (*VERB)s. */ 5699 5700 /* Check for open captures before ACCEPT and close those that are within 5701 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an 5702 assertion. In the first pass, just accumulate the length required; 5703 otherwise hitting (*ACCEPT) inside many nested parentheses can cause 5704 workspace overflow. Do not set firstcu after *ACCEPT. 
*/ 5705 5706 case META_ACCEPT: 5707 cb->had_accept = TRUE; 5708 for (oc = cb->open_caps; 5709 oc != NULL && oc->assert_depth >= cb->assert_depth; 5710 oc = oc->next) 5711 { 5712 if (lengthptr != NULL) 5713 { 5714 *lengthptr += CU2BYTES(1) + IMM2_SIZE; 5715 } 5716 else 5717 { 5718 *code++ = OP_CLOSE; 5719 PUT2INC(code, 0, oc->number); 5720 } 5721 } 5722 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; 5723 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 5724 break; 5725 5726 case META_PRUNE: 5727 case META_SKIP: 5728 cb->had_pruneorskip = TRUE; 5729 /* Fall through */ 5730 case META_COMMIT: 5731 case META_FAIL: 5732 *code++ = verbops[(meta - META_MARK) >> 16]; 5733 break; 5734 5735 case META_THEN: 5736 cb->external_flags |= PCRE2_HASTHEN; 5737 *code++ = OP_THEN; 5738 break; 5739 5740 /* Handle verbs with arguments. Arguments can be very long, especially in 5741 16- and 32-bit modes, and can overflow the workspace in the first pass. 5742 However, the argument length is constrained to be small enough to fit in 5743 one code unit. This check happens in parse_regex(). In the first pass, 5744 instead of putting the argument into memory, we just update the length 5745 counter and set up an empty argument. */ 5746 5747 case META_THEN_ARG: 5748 cb->external_flags |= PCRE2_HASTHEN; 5749 goto VERB_ARG; 5750 5751 case META_PRUNE_ARG: 5752 case META_SKIP_ARG: 5753 cb->had_pruneorskip = TRUE; 5754 /* Fall through */ 5755 case META_MARK: 5756 case META_COMMIT_ARG: 5757 VERB_ARG: 5758 *code++ = verbops[(meta - META_MARK) >> 16]; 5759 /* The length is in characters. 
*/ 5760 verbarglen = *(++pptr); 5761 verbculen = 0; 5762 tempcode = code++; 5763 for (i = 0; i < (int)verbarglen; i++) 5764 { 5765 meta = *(++pptr); 5766 #ifdef SUPPORT_UNICODE 5767 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else 5768 #endif 5769 { 5770 mclength = 1; 5771 mcbuffer[0] = meta; 5772 } 5773 if (lengthptr != NULL) *lengthptr += mclength; else 5774 { 5775 memcpy(code, mcbuffer, CU2BYTES(mclength)); 5776 code += mclength; 5777 verbculen += mclength; 5778 } 5779 } 5780 5781 *tempcode = verbculen; /* Fill in the code unit length */ 5782 *code++ = 0; /* Terminating zero */ 5783 break; 5784 5785 5786 /* ===================================================================*/ 5787 /* Handle options change. The new setting must be passed back for use in 5788 subsequent branches. Reset the greedy defaults and the case value for 5789 firstcu and reqcu. */ 5790 5791 case META_OPTIONS: 5792 *optionsptr = options = *(++pptr); 5793 greedy_default = ((options & PCRE2_UNGREEDY) != 0); 5794 greedy_non_default = greedy_default ^ 1; 5795 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0; 5796 break; 5797 5798 5799 /* ===================================================================*/ 5800 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous 5801 because it could be a numerical check on recursion, or a name check on a 5802 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that 5803 we can handle it either way. We first try for a name; if not found, process 5804 the number. 
*/ 5805 5806 case META_COND_RNUMBER: /* (?(Rdigits) */ 5807 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */ 5808 case META_COND_RNAME: /* (?(R&name) - test for recursion */ 5809 bravalue = OP_COND; 5810 { 5811 int count, index; 5812 PCRE2_SPTR name; 5813 named_group *ng = cb->named_groups; 5814 uint32_t length = *(++pptr); 5815 5816 GETPLUSOFFSET(offset, pptr); 5817 name = cb->start_pattern + offset; 5818 5819 /* In the first pass, the names generated in the pre-pass are available, 5820 but the main name table has not yet been created. Scan the list of names 5821 generated in the pre-pass in order to get a number and whether or not 5822 this name is duplicated. If it is not duplicated, we can handle it as a 5823 numerical group. */ 5824 5825 for (i = 0; i < cb->names_found; i++, ng++) 5826 { 5827 if (length == ng->length && 5828 PRIV(strncmp)(name, ng->name, length) == 0) 5829 { 5830 if (!ng->isdup) 5831 { 5832 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; 5833 PUT2(code, 2+LINK_SIZE, ng->number); 5834 if (ng->number > cb->top_backref) cb->top_backref = ng->number; 5835 skipunits = 1+IMM2_SIZE; 5836 goto GROUP_PROCESS_NOTE_EMPTY; 5837 } 5838 break; /* Found a duplicated name */ 5839 } 5840 } 5841 5842 /* If the name was not found we have a bad reference, unless we are 5843 dealing with R<digits>, which is treated as a recursion test by number. 5844 */ 5845 5846 if (i >= cb->names_found) 5847 { 5848 groupnumber = 0; 5849 if (meta == META_COND_RNUMBER) 5850 { 5851 for (i = 1; i < (int)length; i++) 5852 { 5853 groupnumber = groupnumber * 10 + name[i] - CHAR_0; 5854 if (groupnumber > MAX_GROUP_NUMBER) 5855 { 5856 *errorcodeptr = ERR61; 5857 cb->erroroffset = offset + i; 5858 return 0; 5859 } 5860 } 5861 } 5862 5863 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount) 5864 { 5865 *errorcodeptr = ERR15; 5866 cb->erroroffset = offset; 5867 return 0; 5868 } 5869 5870 /* (?Rdigits) treated as a recursion reference by number. 
A value of 5871 zero (which is the result of both (?R) and (?R0)) means "any", and is 5872 translated into RREF_ANY (which is 0xffff). */ 5873 5874 if (groupnumber == 0) groupnumber = RREF_ANY; 5875 code[1+LINK_SIZE] = OP_RREF; 5876 PUT2(code, 2+LINK_SIZE, groupnumber); 5877 skipunits = 1+IMM2_SIZE; 5878 goto GROUP_PROCESS_NOTE_EMPTY; 5879 } 5880 5881 /* A duplicated name was found. Note that if an R<digits> name is found 5882 (META_COND_RNUMBER), it is a reference test, not a recursion test. */ 5883 5884 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; 5885 5886 /* We have a duplicated name. In the compile pass we have to search the 5887 main table in order to get the index and count values. */ 5888 5889 count = 0; /* Values for first pass (avoids compiler warning) */ 5890 index = 0; 5891 if (lengthptr == NULL && !find_dupname_details(name, length, &index, 5892 &count, errorcodeptr, cb)) return 0; 5893 5894 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and 5895 insert appropriate data values. */ 5896 5897 code[1+LINK_SIZE]++; 5898 skipunits = 1+2*IMM2_SIZE; 5899 PUT2(code, 2+LINK_SIZE, index); 5900 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); 5901 } 5902 goto GROUP_PROCESS_NOTE_EMPTY; 5903 5904 /* The DEFINE condition is always false. It's internal groups may never 5905 be called, so matched_char must remain false, hence the jump to 5906 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */ 5907 5908 case META_COND_DEFINE: 5909 bravalue = OP_COND; 5910 GETPLUSOFFSET(offset, pptr); 5911 code[1+LINK_SIZE] = OP_DEFINE; 5912 skipunits = 1; 5913 goto GROUP_PROCESS; 5914 5915 /* Conditional test of a group's being set. 
*/ 5916 5917 case META_COND_NUMBER: 5918 bravalue = OP_COND; 5919 GETPLUSOFFSET(offset, pptr); 5920 groupnumber = *(++pptr); 5921 if (groupnumber > cb->bracount) 5922 { 5923 *errorcodeptr = ERR15; 5924 cb->erroroffset = offset; 5925 return 0; 5926 } 5927 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; 5928 offset -= 2; /* Point at initial ( for too many branches error */ 5929 code[1+LINK_SIZE] = OP_CREF; 5930 skipunits = 1+IMM2_SIZE; 5931 PUT2(code, 2+LINK_SIZE, groupnumber); 5932 goto GROUP_PROCESS_NOTE_EMPTY; 5933 5934 /* Test for the PCRE2 version. */ 5935 5936 case META_COND_VERSION: 5937 bravalue = OP_COND; 5938 if (pptr[1] > 0) 5939 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) || 5940 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))? 5941 OP_TRUE : OP_FALSE; 5942 else 5943 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])? 5944 OP_TRUE : OP_FALSE; 5945 skipunits = 1; 5946 pptr += 3; 5947 goto GROUP_PROCESS_NOTE_EMPTY; 5948 5949 /* The condition is an assertion, possibly preceded by a callout. */ 5950 5951 case META_COND_ASSERT: 5952 bravalue = OP_COND; 5953 goto GROUP_PROCESS_NOTE_EMPTY; 5954 5955 5956 /* ===================================================================*/ 5957 /* Handle all kinds of nested bracketed groups. The non-capturing, 5958 non-conditional cases are here; others come to GROUP_PROCESS via goto. */ 5959 5960 case META_LOOKAHEAD: 5961 bravalue = OP_ASSERT; 5962 cb->assert_depth += 1; 5963 goto GROUP_PROCESS; 5964 5965 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird 5966 thing to do, but Perl allows all assertions to be quantified, and when 5967 they contain capturing parentheses there may be a potential use for 5968 this feature. Not that that applies to a quantified (?!) but we allow 5969 it for uniformity. 
*/ 5970 5971 case META_LOOKAHEADNOT: 5972 if (pptr[1] == META_KET && 5973 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY)) 5974 { 5975 *code++ = OP_FAIL; 5976 pptr++; 5977 } 5978 else 5979 { 5980 bravalue = OP_ASSERT_NOT; 5981 cb->assert_depth += 1; 5982 goto GROUP_PROCESS; 5983 } 5984 break; 5985 5986 case META_LOOKBEHIND: 5987 bravalue = OP_ASSERTBACK; 5988 cb->assert_depth += 1; 5989 goto GROUP_PROCESS; 5990 5991 case META_LOOKBEHINDNOT: 5992 bravalue = OP_ASSERTBACK_NOT; 5993 cb->assert_depth += 1; 5994 goto GROUP_PROCESS; 5995 5996 case META_ATOMIC: 5997 bravalue = OP_ONCE; 5998 goto GROUP_PROCESS_NOTE_EMPTY; 5999 6000 case META_NOCAPTURE: 6001 bravalue = OP_BRA; 6002 /* Fall through */ 6003 6004 /* Process nested bracketed regex. The nesting depth is maintained for the 6005 benefit of the stackguard function. The test for too deep nesting is now 6006 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS; 6007 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take 6008 note of whether or not they may match an empty string. */ 6009 6010 GROUP_PROCESS_NOTE_EMPTY: 6011 note_group_empty = TRUE; 6012 6013 GROUP_PROCESS: 6014 cb->parens_depth += 1; 6015 *code = bravalue; 6016 pptr++; 6017 tempcode = code; 6018 tempreqvary = cb->req_varyopt; /* Save value before group */ 6019 length_prevgroup = 0; /* Initialize for pre-compile phase */ 6020 6021 if ((group_return = 6022 compile_regex( 6023 options, /* The option state */ 6024 &tempcode, /* Where to put code (updated) */ 6025 &pptr, /* Input pointer (updated) */ 6026 errorcodeptr, /* Where to put an error message */ 6027 skipunits, /* Skip over bracket number */ 6028 &subfirstcu, /* For possible first char */ 6029 &subfirstcuflags, 6030 &subreqcu, /* For possible last char */ 6031 &subreqcuflags, 6032 bcptr, /* Current branch chain */ 6033 cb, /* Compile data block */ 6034 (lengthptr == NULL)? 
NULL : /* Actual compile phase */ 6035 &length_prevgroup /* Pre-compile phase */ 6036 )) == 0) 6037 return 0; /* Error */ 6038 6039 cb->parens_depth -= 1; 6040 6041 /* If that was a non-conditional significant group (not an assertion, not a 6042 DEFINE) that matches at least one character, then the current item matches 6043 a character. Conditionals are handled below. */ 6044 6045 if (note_group_empty && bravalue != OP_COND && group_return > 0) 6046 matched_char = TRUE; 6047 6048 /* If we've just compiled an assertion, pop the assert depth. */ 6049 6050 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) 6051 cb->assert_depth -= 1; 6052 6053 /* At the end of compiling, code is still pointing to the start of the 6054 group, while tempcode has been updated to point past the end of the group. 6055 The parsed pattern pointer (pptr) is on the closing META_KET. 6056 6057 If this is a conditional bracket, check that there are no more than 6058 two branches in the group, or just one if it's a DEFINE group. We do this 6059 in the real compile phase, not in the pre-pass, where the whole group may 6060 not be available. */ 6061 6062 if (bravalue == OP_COND && lengthptr == NULL) 6063 { 6064 PCRE2_UCHAR *tc = code; 6065 int condcount = 0; 6066 6067 do { 6068 condcount++; 6069 tc += GET(tc,1); 6070 } 6071 while (*tc != OP_KET); 6072 6073 /* A DEFINE group is never obeyed inline (the "condition" is always 6074 false). It must have only one branch. Having checked this, change the 6075 opcode to OP_FALSE. */ 6076 6077 if (code[LINK_SIZE+1] == OP_DEFINE) 6078 { 6079 if (condcount > 1) 6080 { 6081 cb->erroroffset = offset; 6082 *errorcodeptr = ERR54; 6083 return 0; 6084 } 6085 code[LINK_SIZE+1] = OP_FALSE; 6086 bravalue = OP_DEFINE; /* A flag to suppress char handling below */ 6087 } 6088 6089 /* A "normal" conditional group. If there is just one branch, we must not 6090 make use of its firstcu or reqcu, because this is equivalent to an 6091 empty second branch. 
Also, it may match an empty string. If there are two 6092 branches, this item must match a character if the group must. */ 6093 6094 else 6095 { 6096 if (condcount > 2) 6097 { 6098 cb->erroroffset = offset; 6099 *errorcodeptr = ERR27; 6100 return 0; 6101 } 6102 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE; 6103 else if (group_return > 0) matched_char = TRUE; 6104 } 6105 } 6106 6107 /* In the pre-compile phase, update the length by the length of the group, 6108 less the brackets at either end. Then reduce the compiled code to just a 6109 set of non-capturing brackets so that it doesn't use much memory if it is 6110 duplicated by a quantifier.*/ 6111 6112 if (lengthptr != NULL) 6113 { 6114 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) 6115 { 6116 *errorcodeptr = ERR20; 6117 return 0; 6118 } 6119 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; 6120 code++; /* This already contains bravalue */ 6121 PUTINC(code, 0, 1 + LINK_SIZE); 6122 *code++ = OP_KET; 6123 PUTINC(code, 0, 1 + LINK_SIZE); 6124 break; /* No need to waste time with special character handling */ 6125 } 6126 6127 /* Otherwise update the main code pointer to the end of the group. */ 6128 6129 code = tempcode; 6130 6131 /* For a DEFINE group, required and first character settings are not 6132 relevant. */ 6133 6134 if (bravalue == OP_DEFINE) break; 6135 6136 /* Handle updating of the required and first code units for other types of 6137 group. Update for normal brackets of all kinds, and conditions with two 6138 branches (see code above). If the bracket is followed by a quantifier with 6139 zero repeat, we have to back off. Hence the definition of zeroreqcu and 6140 zerofirstcu outside the main loop so that they can be accessed for the back 6141 off. 
*/ 6142 6143 zeroreqcu = reqcu; 6144 zeroreqcuflags = reqcuflags; 6145 zerofirstcu = firstcu; 6146 zerofirstcuflags = firstcuflags; 6147 groupsetfirstcu = FALSE; 6148 6149 if (bravalue >= OP_ONCE) /* Not an assertion */ 6150 { 6151 /* If we have not yet set a firstcu in this branch, take it from the 6152 subpattern, remembering that it was set here so that a repeat of more 6153 than one can replicate it as reqcu if necessary. If the subpattern has 6154 no firstcu, set "none" for the whole branch. In both cases, a zero 6155 repeat forces firstcu to "none". */ 6156 6157 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET) 6158 { 6159 if (subfirstcuflags >= 0) 6160 { 6161 firstcu = subfirstcu; 6162 firstcuflags = subfirstcuflags; 6163 groupsetfirstcu = TRUE; 6164 } 6165 else firstcuflags = REQ_NONE; 6166 zerofirstcuflags = REQ_NONE; 6167 } 6168 6169 /* If firstcu was previously set, convert the subpattern's firstcu 6170 into reqcu if there wasn't one, using the vary flag that was in 6171 existence beforehand. */ 6172 6173 else if (subfirstcuflags >= 0 && subreqcuflags < 0) 6174 { 6175 subreqcu = subfirstcu; 6176 subreqcuflags = subfirstcuflags | tempreqvary; 6177 } 6178 6179 /* If the subpattern set a required code unit (or set a first code unit 6180 that isn't really the first code unit - see above), set it. */ 6181 6182 if (subreqcuflags >= 0) 6183 { 6184 reqcu = subreqcu; 6185 reqcuflags = subreqcuflags; 6186 } 6187 } 6188 6189 /* For a forward assertion, we take the reqcu, if set, provided that the 6190 group has also set a firstcu. This can be helpful if the pattern that 6191 follows the assertion doesn't set a different char. For example, it's 6192 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however 6193 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when 6194 the "real" "a" would then become a reqcu instead of a firstcu. 
This is 6195 overcome by a scan at the end if there's no firstcu, looking for an 6196 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means 6197 we must only take the reqcu when the group also set a firstcu. Otherwise, 6198 in that example, 'X' ends up set for both. */ 6199 6200 else if (bravalue == OP_ASSERT && subreqcuflags >= 0 && 6201 subfirstcuflags >= 0) 6202 { 6203 reqcu = subreqcu; 6204 reqcuflags = subreqcuflags; 6205 } 6206 6207 break; /* End of nested group handling */ 6208 6209 6210 /* ===================================================================*/ 6211 /* Handle named backreferences and recursions. */ 6212 6213 case META_BACKREF_BYNAME: 6214 case META_RECURSE_BYNAME: 6215 { 6216 int count, index; 6217 PCRE2_SPTR name; 6218 BOOL is_dupname = FALSE; 6219 named_group *ng = cb->named_groups; 6220 uint32_t length = *(++pptr); 6221 6222 GETPLUSOFFSET(offset, pptr); 6223 name = cb->start_pattern + offset; 6224 6225 /* In the first pass, the names generated in the pre-pass are available, 6226 but the main name table has not yet been created. Scan the list of names 6227 generated in the pre-pass in order to get a number and whether or not 6228 this name is duplicated. */ 6229 6230 groupnumber = 0; 6231 for (i = 0; i < cb->names_found; i++, ng++) 6232 { 6233 if (length == ng->length && 6234 PRIV(strncmp)(name, ng->name, length) == 0) 6235 { 6236 is_dupname = ng->isdup; 6237 groupnumber = ng->number; 6238 6239 /* For a recursion, that's all that is needed. We can now go to 6240 the code above that handles numerical recursion, applying it to 6241 the first group with the given name. */ 6242 6243 if (meta == META_RECURSE_BYNAME) 6244 { 6245 meta_arg = groupnumber; 6246 goto HANDLE_NUMERICAL_RECURSION; 6247 } 6248 6249 /* For a back reference, update the back reference map and the 6250 maximum back reference. Then, for each group, we must check to 6251 see if it is recursive, that is, it is inside the group that it 6252 references. 
A flag is set so that the group can be made atomic. 6253 */ 6254 6255 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; 6256 if (groupnumber > cb->top_backref) 6257 cb->top_backref = groupnumber; 6258 6259 for (oc = cb->open_caps; oc != NULL; oc = oc->next) 6260 { 6261 if (oc->number == groupnumber) 6262 { 6263 oc->flag = TRUE; 6264 break; 6265 } 6266 } 6267 } 6268 } 6269 6270 /* If the name was not found we have a bad reference. */ 6271 6272 if (groupnumber == 0) 6273 { 6274 *errorcodeptr = ERR15; 6275 cb->erroroffset = offset; 6276 return 0; 6277 } 6278 6279 /* If a back reference name is not duplicated, we can handle it as 6280 a numerical reference. */ 6281 6282 if (!is_dupname) 6283 { 6284 meta_arg = groupnumber; 6285 goto HANDLE_SINGLE_REFERENCE; 6286 } 6287 6288 /* If a back reference name is duplicated, we generate a different 6289 opcode to a numerical back reference. In the second pass we must 6290 search for the index and count in the final name table. */ 6291 6292 count = 0; /* Values for first pass (avoids compiler warning) */ 6293 index = 0; 6294 if (lengthptr == NULL && !find_dupname_details(name, length, &index, 6295 &count, errorcodeptr, cb)) return 0; 6296 6297 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 6298 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; 6299 PUT2INC(code, 0, index); 6300 PUT2INC(code, 0, count); 6301 } 6302 break; 6303 6304 6305 /* ===================================================================*/ 6306 /* Handle a numerical callout. */ 6307 6308 case META_CALLOUT_NUMBER: 6309 code[0] = OP_CALLOUT; 6310 PUT(code, 1, pptr[1]); /* Offset to next pattern item */ 6311 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ 6312 code[1 + 2*LINK_SIZE] = pptr[3]; 6313 pptr += 3; 6314 code += PRIV(OP_lengths)[OP_CALLOUT]; 6315 break; 6316 6317 6318 /* ===================================================================*/ 6319 /* Handle a callout with a string argument. 
In the pre-pass we just compute 6320 the length without generating anything. The length in pptr[3] includes both 6321 delimiters; in the actual compile only the first one is copied, but a 6322 terminating zero is added. Any doubled delimiters within the string make 6323 this an overestimate, but it is not worth bothering about. */ 6324 6325 case META_CALLOUT_STRING: 6326 if (lengthptr != NULL) 6327 { 6328 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE); 6329 pptr += 3; 6330 SKIPOFFSET(pptr); 6331 } 6332 6333 /* In the real compile we can copy the string. The starting delimiter is 6334 included so that the client can discover it if they want. We also pass the 6335 start offset to help a script language give better error messages. */ 6336 6337 else 6338 { 6339 PCRE2_SPTR pp; 6340 uint32_t delimiter; 6341 uint32_t length = pptr[3]; 6342 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); 6343 6344 code[0] = OP_CALLOUT_STR; 6345 PUT(code, 1, pptr[1]); /* Offset to next pattern item */ 6346 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ 6347 6348 pptr += 3; 6349 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */ 6350 pp = cb->start_pattern + offset; 6351 delimiter = *callout_string++ = *pp++; 6352 if (delimiter == CHAR_LEFT_CURLY_BRACKET) 6353 delimiter = CHAR_RIGHT_CURLY_BRACKET; 6354 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */ 6355 6356 /* The syntax of the pattern was checked in the parsing scan. The length 6357 includes both delimiters, but we have passed the opening one just above, 6358 so we reduce length before testing it. The test is for > 1 because we do 6359 not want to copy the final delimiter. This also ensures that pp[1] is 6360 accessible. 
*/ 6361 6362 while (--length > 1) 6363 { 6364 if (*pp == delimiter && pp[1] == delimiter) 6365 { 6366 *callout_string++ = delimiter; 6367 pp += 2; 6368 length--; 6369 } 6370 else *callout_string++ = *pp++; 6371 } 6372 *callout_string++ = CHAR_NUL; 6373 6374 /* Set the length of the entire item, the advance to its end. */ 6375 6376 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code)); 6377 code = callout_string; 6378 } 6379 break; 6380 6381 6382 /* ===================================================================*/ 6383 /* Handle repetition. The different types are all sorted out in the parsing 6384 pass. */ 6385 6386 case META_MINMAX_PLUS: 6387 case META_MINMAX_QUERY: 6388 case META_MINMAX: 6389 repeat_min = *(++pptr); 6390 repeat_max = *(++pptr); 6391 goto REPEAT; 6392 6393 case META_ASTERISK: 6394 case META_ASTERISK_PLUS: 6395 case META_ASTERISK_QUERY: 6396 repeat_min = 0; 6397 repeat_max = REPEAT_UNLIMITED; 6398 goto REPEAT; 6399 6400 case META_PLUS: 6401 case META_PLUS_PLUS: 6402 case META_PLUS_QUERY: 6403 repeat_min = 1; 6404 repeat_max = REPEAT_UNLIMITED; 6405 goto REPEAT; 6406 6407 case META_QUERY: 6408 case META_QUERY_PLUS: 6409 case META_QUERY_QUERY: 6410 repeat_min = 0; 6411 repeat_max = 1; 6412 6413 REPEAT: 6414 if (previous_matched_char && repeat_min > 0) matched_char = TRUE; 6415 6416 /* Remember whether this is a variable length repeat, and default to 6417 single-char opcodes. */ 6418 6419 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; 6420 op_type = 0; 6421 6422 /* If the repeat is {1} we can ignore it. */ 6423 6424 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; 6425 6426 /* Adjust first and required code units for a zero repeat. */ 6427 6428 if (repeat_min == 0) 6429 { 6430 firstcu = zerofirstcu; 6431 firstcuflags = zerofirstcuflags; 6432 reqcu = zeroreqcu; 6433 reqcuflags = zeroreqcuflags; 6434 } 6435 6436 /* Note the greediness and possessiveness. 
*/ 6437 6438 switch (meta) 6439 { 6440 case META_MINMAX_PLUS: 6441 case META_ASTERISK_PLUS: 6442 case META_PLUS_PLUS: 6443 case META_QUERY_PLUS: 6444 repeat_type = 0; /* Force greedy */ 6445 possessive_quantifier = TRUE; 6446 break; 6447 6448 case META_MINMAX_QUERY: 6449 case META_ASTERISK_QUERY: 6450 case META_PLUS_QUERY: 6451 case META_QUERY_QUERY: 6452 repeat_type = greedy_non_default; 6453 possessive_quantifier = FALSE; 6454 break; 6455 6456 default: 6457 repeat_type = greedy_default; 6458 possessive_quantifier = FALSE; 6459 break; 6460 } 6461 6462 /* Save start of previous item, in case we have to move it up in order to 6463 insert something before it, and remember what it was. */ 6464 6465 tempcode = previous; 6466 op_previous = *previous; 6467 6468 /* Now handle repetition for the different types of item. */ 6469 6470 switch (op_previous) 6471 { 6472 /* If previous was a character or negated character match, abolish the 6473 item and generate a repeat item instead. If a char item has a minimum of 6474 more than one, ensure that it is set in reqcu - it might not be if a 6475 sequence such as x{3} is the first thing in a branch because the x will 6476 have gone into firstcu instead. */ 6477 6478 case OP_CHAR: 6479 case OP_CHARI: 6480 case OP_NOT: 6481 case OP_NOTI: 6482 op_type = chartypeoffset[op_previous - OP_CHAR]; 6483 6484 /* Deal with UTF characters that take up more than one code unit. */ 6485 6486 #ifdef MAYBE_UTF_MULTI 6487 if (utf && NOT_FIRSTCU(code[-1])) 6488 { 6489 PCRE2_UCHAR *lastchar = code - 1; 6490 BACKCHAR(lastchar); 6491 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */ 6492 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */ 6493 } 6494 else 6495 #endif /* MAYBE_UTF_MULTI */ 6496 6497 /* Handle the case of a single code unit - either with no UTF support, or 6498 with UTF disabled, or for a single-code-unit UTF character. 
*/ 6499 { 6500 mcbuffer[0] = code[-1]; 6501 mclength = 1; 6502 if (op_previous <= OP_CHARI && repeat_min > 1) 6503 { 6504 reqcu = mcbuffer[0]; 6505 reqcuflags = req_caseopt | cb->req_varyopt; 6506 } 6507 } 6508 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ 6509 6510 /* If previous was a character class or a back reference, we put the 6511 repeat stuff after it, but just skip the item if the repeat was {0,0}. */ 6512 6513 #ifdef SUPPORT_WIDE_CHARS 6514 case OP_XCLASS: 6515 #endif 6516 case OP_CLASS: 6517 case OP_NCLASS: 6518 case OP_REF: 6519 case OP_REFI: 6520 case OP_DNREF: 6521 case OP_DNREFI: 6522 6523 if (repeat_max == 0) 6524 { 6525 code = previous; 6526 goto END_REPEAT; 6527 } 6528 6529 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED) 6530 *code++ = OP_CRSTAR + repeat_type; 6531 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED) 6532 *code++ = OP_CRPLUS + repeat_type; 6533 else if (repeat_min == 0 && repeat_max == 1) 6534 *code++ = OP_CRQUERY + repeat_type; 6535 else 6536 { 6537 *code++ = OP_CRRANGE + repeat_type; 6538 PUT2INC(code, 0, repeat_min); 6539 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */ 6540 PUT2INC(code, 0, repeat_max); 6541 } 6542 break; 6543 6544 /* If previous is OP_FAIL, it was generated by an empty class [] 6545 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be 6546 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse 6547 time. We can just ignore this repeat. */ 6548 6549 case OP_FAIL: 6550 goto END_REPEAT; 6551 6552 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets 6553 because pcre2_match() could not handle backtracking into recursively 6554 called groups. Now that this backtracking is available, we no longer need 6555 to do this. However, we still need to replicate recursions as we do for 6556 groups so as to have independent backtracking points. 
We can replicate 6557 for the minimum number of repeats directly. For optional repeats we now 6558 wrap the recursion in OP_BRA brackets and make use of the bracket 6559 repetition. */ 6560 6561 case OP_RECURSE: 6562 6563 /* Generate unwrapped repeats for a non-zero minimum, except when the 6564 minimum is 1 and the maximum unlimited, because that can be handled with 6565 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the 6566 minimum, we just need to generate the appropriate additional copies. 6567 Otherwise we need to generate one more, to simulate the situation when 6568 the minimum is zero. */ 6569 6570 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED)) 6571 { 6572 int replicate = repeat_min; 6573 if (repeat_min == repeat_max) replicate--; 6574 6575 /* In the pre-compile phase, we don't actually do the replication. We 6576 just adjust the length as if we had. Do some paranoid checks for 6577 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit 6578 integer type when available, otherwise double. */ 6579 6580 if (lengthptr != NULL) 6581 { 6582 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE); 6583 if ((INT64_OR_DOUBLE)replicate* 6584 (INT64_OR_DOUBLE)(1 + LINK_SIZE) > 6585 (INT64_OR_DOUBLE)INT_MAX || 6586 OFLOW_MAX - *lengthptr < delta) 6587 { 6588 *errorcodeptr = ERR20; 6589 return 0; 6590 } 6591 *lengthptr += delta; 6592 } 6593 6594 else for (i = 0; i < replicate; i++) 6595 { 6596 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE)); 6597 previous = code; 6598 code += 1 + LINK_SIZE; 6599 } 6600 6601 /* If the number of repeats is fixed, we are done. Otherwise, adjust 6602 the counts and fall through. */ 6603 6604 if (repeat_min == repeat_max) break; 6605 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; 6606 repeat_min = 0; 6607 } 6608 6609 /* Wrap the recursion call in OP_BRA brackets. 
*/ 6610 6611 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE)); 6612 op_previous = *previous = OP_BRA; 6613 PUT(previous, 1, 2 + 2*LINK_SIZE); 6614 previous[2 + 2*LINK_SIZE] = OP_KET; 6615 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); 6616 code += 2 + 2 * LINK_SIZE; 6617 length_prevgroup = 3 + 3*LINK_SIZE; 6618 group_return = -1; /* Set "may match empty string" */ 6619 6620 /* Now treat as a repeated OP_BRA. */ 6621 /* Fall through */ 6622 6623 /* If previous was a bracket group, we may have to replicate it in 6624 certain cases. Note that at this point we can encounter only the "basic" 6625 bracket opcodes such as BRA and CBRA, as this is the place where they get 6626 converted into the more special varieties such as BRAPOS and SBRA. 6627 Originally, PCRE did not allow repetition of assertions, but now it does, 6628 for Perl compatibility. */ 6629 6630 case OP_ASSERT: 6631 case OP_ASSERT_NOT: 6632 case OP_ASSERTBACK: 6633 case OP_ASSERTBACK_NOT: 6634 case OP_ONCE: 6635 case OP_BRA: 6636 case OP_CBRA: 6637 case OP_COND: 6638 { 6639 int len = (int)(code - previous); 6640 PCRE2_UCHAR *bralink = NULL; 6641 PCRE2_UCHAR *brazeroptr = NULL; 6642 6643 /* Repeating a DEFINE group (or any group where the condition is always 6644 FALSE and there is only one branch) is pointless, but Perl allows the 6645 syntax, so we just ignore the repeat. */ 6646 6647 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && 6648 previous[GET(previous, 1)] != OP_ALT) 6649 goto END_REPEAT; 6650 6651 /* There is no sense in actually repeating assertions. The only 6652 potential use of repetition is in cases when the assertion is optional. 6653 Therefore, if the minimum is greater than zero, just ignore the repeat. 6654 If the maximum is not zero or one, set it to 1. 
*/ 6655 6656 if (op_previous < OP_ONCE) /* Assertion */ 6657 { 6658 if (repeat_min > 0) goto END_REPEAT; 6659 if (repeat_max > 1) repeat_max = 1; 6660 } 6661 6662 /* The case of a zero minimum is special because of the need to stick 6663 OP_BRAZERO in front of it, and because the group appears once in the 6664 data, whereas in other cases it appears the minimum number of times. For 6665 this reason, it is simplest to treat this case separately, as otherwise 6666 the code gets far too messy. There are several special subcases when the 6667 minimum is zero. */ 6668 6669 if (repeat_min == 0) 6670 { 6671 /* If the maximum is also zero, we used to just omit the group from 6672 the output altogether, like this: 6673 6674 ** if (repeat_max == 0) 6675 ** { 6676 ** code = previous; 6677 ** goto END_REPEAT; 6678 ** } 6679 6680 However, that fails when a group or a subgroup within it is 6681 referenced as a subroutine from elsewhere in the pattern, so now we 6682 stick in OP_SKIPZERO in front of it so that it is skipped on 6683 execution. As we don't have a list of which groups are referenced, we 6684 cannot do this selectively. 6685 6686 If the maximum is 1 or unlimited, we just have to stick in the 6687 BRAZERO and do no more at this point. */ 6688 6689 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED) 6690 { 6691 (void)memmove(previous + 1, previous, CU2BYTES(len)); 6692 code++; 6693 if (repeat_max == 0) 6694 { 6695 *previous++ = OP_SKIPZERO; 6696 goto END_REPEAT; 6697 } 6698 brazeroptr = previous; /* Save for possessive optimizing */ 6699 *previous++ = OP_BRAZERO + repeat_type; 6700 } 6701 6702 /* If the maximum is greater than 1 and limited, we have to replicate 6703 in a nested fashion, sticking OP_BRAZERO before each set of brackets. 6704 The first one has to be handled carefully because it's the original 6705 copy, which has to be moved up. The remainder can be handled by code 6706 that is common with the non-zero minimum case below. 
We have to 6707 adjust the value or repeat_max, since one less copy is required. */ 6708 6709 else 6710 { 6711 int linkoffset; 6712 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); 6713 code += 2 + LINK_SIZE; 6714 *previous++ = OP_BRAZERO + repeat_type; 6715 *previous++ = OP_BRA; 6716 6717 /* We chain together the bracket link offset fields that have to be 6718 filled in later when the ends of the brackets are reached. */ 6719 6720 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink); 6721 bralink = previous; 6722 PUTINC(previous, 0, linkoffset); 6723 } 6724 6725 if (repeat_max != REPEAT_UNLIMITED) repeat_max--; 6726 } 6727 6728 /* If the minimum is greater than zero, replicate the group as many 6729 times as necessary, and adjust the maximum to the number of subsequent 6730 copies that we need. */ 6731 6732 else 6733 { 6734 if (repeat_min > 1) 6735 { 6736 /* In the pre-compile phase, we don't actually do the replication. 6737 We just adjust the length as if we had. Do some paranoid checks for 6738 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit 6739 integer type when available, otherwise double. */ 6740 6741 if (lengthptr != NULL) 6742 { 6743 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup; 6744 if ((INT64_OR_DOUBLE)(repeat_min - 1)* 6745 (INT64_OR_DOUBLE)length_prevgroup > 6746 (INT64_OR_DOUBLE)INT_MAX || 6747 OFLOW_MAX - *lengthptr < delta) 6748 { 6749 *errorcodeptr = ERR20; 6750 return 0; 6751 } 6752 *lengthptr += delta; 6753 } 6754 6755 /* This is compiling for real. If there is a set first code unit 6756 for the group, and we have not yet set a "required code unit", set 6757 it. 
*/ 6758 6759 else 6760 { 6761 if (groupsetfirstcu && reqcuflags < 0) 6762 { 6763 reqcu = firstcu; 6764 reqcuflags = firstcuflags; 6765 } 6766 for (i = 1; (uint32_t)i < repeat_min; i++) 6767 { 6768 memcpy(code, previous, CU2BYTES(len)); 6769 code += len; 6770 } 6771 } 6772 } 6773 6774 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; 6775 } 6776 6777 /* This code is common to both the zero and non-zero minimum cases. If 6778 the maximum is limited, it replicates the group in a nested fashion, 6779 remembering the bracket starts on a stack. In the case of a zero 6780 minimum, the first one was set up above. In all cases the repeat_max 6781 now specifies the number of additional copies needed. Again, we must 6782 remember to replicate entries on the forward reference list. */ 6783 6784 if (repeat_max != REPEAT_UNLIMITED) 6785 { 6786 /* In the pre-compile phase, we don't actually do the replication. We 6787 just adjust the length as if we had. For each repetition we must add 6788 1 to the length for BRAZERO and for all but the last repetition we 6789 must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some 6790 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type 6791 is a 64-bit integer type when available, otherwise double. */ 6792 6793 if (lengthptr != NULL && repeat_max > 0) 6794 { 6795 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - 6796 2 - 2*LINK_SIZE; /* Last one doesn't nest */ 6797 if ((INT64_OR_DOUBLE)repeat_max * 6798 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) 6799 > (INT64_OR_DOUBLE)INT_MAX || 6800 OFLOW_MAX - *lengthptr < delta) 6801 { 6802 *errorcodeptr = ERR20; 6803 return 0; 6804 } 6805 *lengthptr += delta; 6806 } 6807 6808 /* This is compiling for real */ 6809 6810 else for (i = repeat_max - 1; i >= 0; i--) 6811 { 6812 *code++ = OP_BRAZERO + repeat_type; 6813 6814 /* All but the final copy start a new nesting, maintaining the 6815 chain of brackets outstanding. 
*/ 6816 6817 if (i != 0) 6818 { 6819 int linkoffset; 6820 *code++ = OP_BRA; 6821 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink); 6822 bralink = code; 6823 PUTINC(code, 0, linkoffset); 6824 } 6825 6826 memcpy(code, previous, CU2BYTES(len)); 6827 code += len; 6828 } 6829 6830 /* Now chain through the pending brackets, and fill in their length 6831 fields (which are holding the chain links pro tem). */ 6832 6833 while (bralink != NULL) 6834 { 6835 int oldlinkoffset; 6836 int linkoffset = (int)(code - bralink + 1); 6837 PCRE2_UCHAR *bra = code - linkoffset; 6838 oldlinkoffset = GET(bra, 1); 6839 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; 6840 *code++ = OP_KET; 6841 PUTINC(code, 0, linkoffset); 6842 PUT(bra, 1, linkoffset); 6843 } 6844 } 6845 6846 /* If the maximum is unlimited, set a repeater in the final copy. For 6847 ONCE brackets, that's all we need to do. However, possessively repeated 6848 ONCE brackets can be converted into non-capturing brackets, as the 6849 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to 6850 deal with possessive ONCEs specially. 6851 6852 Otherwise, when we are doing the actual compile phase, check to see 6853 whether this group is one that could match an empty string. If so, 6854 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so 6855 that runtime checking can be done. [This check is also applied to ONCE 6856 groups at runtime, but in a different way.] 6857 6858 Then, if the quantifier was possessive and the bracket is not a 6859 conditional, we convert the BRA code to the POS form, and the KET code to 6860 KETRPOS. (It turns out to be convenient at runtime to detect this kind of 6861 subpattern at both the start and at the end.) The use of special opcodes 6862 makes it possible to reduce greatly the stack usage in pcre2_match(). If 6863 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO. 
6864 6865 Then, if the minimum number of matches is 1 or 0, cancel the possessive 6866 flag so that the default action below, of wrapping everything inside 6867 atomic brackets, does not happen. When the minimum is greater than 1, 6868 there will be earlier copies of the group, and so we still have to wrap 6869 the whole thing. */ 6870 6871 else 6872 { 6873 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE; 6874 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1); 6875 6876 /* Convert possessive ONCE brackets to non-capturing */ 6877 6878 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA; 6879 6880 /* For non-possessive ONCE brackets, all we need to do is to 6881 set the KET. */ 6882 6883 if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type; 6884 6885 /* Handle non-ONCE brackets and possessive ONCEs (which have been 6886 converted to non-capturing above). */ 6887 6888 else 6889 { 6890 /* In the compile phase, adjust the opcode if the group can match 6891 an empty string. For a conditional group with only one branch, the 6892 value of group_return will not show "could be empty", so we must 6893 check that separately. */ 6894 6895 if (lengthptr == NULL) 6896 { 6897 if (group_return < 0) *bracode += OP_SBRA - OP_BRA; 6898 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) 6899 *bracode = OP_SCOND; 6900 } 6901 6902 /* Handle possessive quantifiers. */ 6903 6904 if (possessive_quantifier) 6905 { 6906 /* For COND brackets, we wrap the whole thing in a possessively 6907 repeated non-capturing bracket, because we have not invented POS 6908 versions of the COND opcodes. */ 6909 6910 if (*bracode == OP_COND || *bracode == OP_SCOND) 6911 { 6912 int nlen = (int)(code - bracode); 6913 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); 6914 code += 1 + LINK_SIZE; 6915 nlen += 1 + LINK_SIZE; 6916 *bracode = (*bracode == OP_COND)? 
OP_BRAPOS : OP_SBRAPOS; 6917 *code++ = OP_KETRPOS; 6918 PUTINC(code, 0, nlen); 6919 PUT(bracode, 1, nlen); 6920 } 6921 6922 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ 6923 6924 else 6925 { 6926 *bracode += 1; /* Switch to xxxPOS opcodes */ 6927 *ketcode = OP_KETRPOS; 6928 } 6929 6930 /* If the minimum is zero, mark it as possessive, then unset the 6931 possessive flag when the minimum is 0 or 1. */ 6932 6933 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; 6934 if (repeat_min < 2) possessive_quantifier = FALSE; 6935 } 6936 6937 /* Non-possessive quantifier */ 6938 6939 else *ketcode = OP_KETRMAX + repeat_type; 6940 } 6941 } 6942 } 6943 break; 6944 6945 /* If previous was a character type match (\d or similar), abolish it and 6946 create a suitable repeat item. The code is shared with single-character 6947 repeats by setting op_type to add a suitable offset into repeat_type. 6948 Note the the Unicode property types will be present only when 6949 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code 6950 here because it just makes it horribly messy. */ 6951 6952 default: 6953 if (op_previous >= OP_EODN) /* Not a character type - internal error */ 6954 { 6955 *errorcodeptr = ERR10; 6956 return 0; 6957 } 6958 else 6959 { 6960 int prop_type, prop_value; 6961 PCRE2_UCHAR *oldcode; 6962 6963 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ 6964 mclength = 0; /* Not a character */ 6965 6966 if (op_previous == OP_PROP || op_previous == OP_NOTPROP) 6967 { 6968 prop_type = previous[1]; 6969 prop_value = previous[2]; 6970 } 6971 else 6972 { 6973 /* Come here from just above with a character in mcbuffer/mclength. */ 6974 OUTPUT_SINGLE_REPEAT: 6975 prop_type = prop_value = -1; 6976 } 6977 6978 /* At this point, if prop_type == prop_value == -1 we either have a 6979 character in mcbuffer when mclength is greater than zero, or we have 6980 mclength zero, in which case there is a non-property character type in 6981 op_previous. 
If prop_type/value are not negative, we have a property 6982 character type in op_previous. */ 6983 6984 oldcode = code; /* Save where we were */ 6985 code = previous; /* Usually overwrite previous item */ 6986 6987 /* If the maximum is zero then the minimum must also be zero; Perl allows 6988 this case, so we do too - by simply omitting the item altogether. */ 6989 6990 if (repeat_max == 0) goto END_REPEAT; 6991 6992 /* Combine the op_type with the repeat_type */ 6993 6994 repeat_type += op_type; 6995 6996 /* A minimum of zero is handled either as the special case * or ?, or as 6997 an UPTO, with the maximum given. */ 6998 6999 if (repeat_min == 0) 7000 { 7001 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type; 7002 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; 7003 else 7004 { 7005 *code++ = OP_UPTO + repeat_type; 7006 PUT2INC(code, 0, repeat_max); 7007 } 7008 } 7009 7010 /* A repeat minimum of 1 is optimized into some special cases. If the 7011 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is 7012 left in place and, if the maximum is greater than 1, we use OP_UPTO with 7013 one less than the maximum. */ 7014 7015 else if (repeat_min == 1) 7016 { 7017 if (repeat_max == REPEAT_UNLIMITED) 7018 *code++ = OP_PLUS + repeat_type; 7019 else 7020 { 7021 code = oldcode; /* Leave previous item in place */ 7022 if (repeat_max == 1) goto END_REPEAT; 7023 *code++ = OP_UPTO + repeat_type; 7024 PUT2INC(code, 0, repeat_max - 1); 7025 } 7026 } 7027 7028 /* The case {n,n} is just an EXACT, while the general case {n,m} is 7029 handled as an EXACT followed by an UPTO or STAR or QUERY. */ 7030 7031 else 7032 { 7033 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ 7034 PUT2INC(code, 0, repeat_min); 7035 7036 /* Unless repeat_max equals repeat_min, fill in the data for EXACT, 7037 and then generate the second opcode. 
For a repeated Unicode property 7038 match, there are two extra values that define the required property, 7039 and mclength is set zero to indicate this. */ 7040 7041 if (repeat_max != repeat_min) 7042 { 7043 if (mclength > 0) 7044 { 7045 memcpy(code, mcbuffer, CU2BYTES(mclength)); 7046 code += mclength; 7047 } 7048 else 7049 { 7050 *code++ = op_previous; 7051 if (prop_type >= 0) 7052 { 7053 *code++ = prop_type; 7054 *code++ = prop_value; 7055 } 7056 } 7057 7058 /* Now set up the following opcode */ 7059 7060 if (repeat_max == REPEAT_UNLIMITED) 7061 *code++ = OP_STAR + repeat_type; 7062 else 7063 { 7064 repeat_max -= repeat_min; 7065 if (repeat_max == 1) 7066 { 7067 *code++ = OP_QUERY + repeat_type; 7068 } 7069 else 7070 { 7071 *code++ = OP_UPTO + repeat_type; 7072 PUT2INC(code, 0, repeat_max); 7073 } 7074 } 7075 } 7076 } 7077 7078 /* Fill in the character or character type for the final opcode. */ 7079 7080 if (mclength > 0) 7081 { 7082 memcpy(code, mcbuffer, CU2BYTES(mclength)); 7083 code += mclength; 7084 } 7085 else 7086 { 7087 *code++ = op_previous; 7088 if (prop_type >= 0) 7089 { 7090 *code++ = prop_type; 7091 *code++ = prop_value; 7092 } 7093 } 7094 } 7095 break; 7096 } /* End of switch on different op_previous values */ 7097 7098 7099 /* If the character following a repeat is '+', possessive_quantifier is 7100 TRUE. For some opcodes, there are special alternative opcodes for this 7101 case. For anything else, we wrap the entire repeated item inside OP_ONCE 7102 brackets. Logically, the '+' notation is just syntactic sugar, taken from 7103 Sun's Java package, but the special opcodes can optimize it. 7104 7105 Some (but not all) possessively repeated subpatterns have already been 7106 completely handled in the code just above. For them, possessive_quantifier 7107 is always FALSE at this stage. Note that the repeated item starts at 7108 tempcode, not at previous, which might be the first part of a string whose 7109 (former) last char we repeated. 
*/ 7110 7111 if (possessive_quantifier) 7112 { 7113 int len; 7114 7115 /* Possessifying an EXACT quantifier has no effect, so we can ignore it. 7116 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, 7117 {5,}, or {5,10}). We skip over an EXACT item; if the length of what 7118 remains is greater than zero, there's a further opcode that can be 7119 handled. If not, do nothing, leaving the EXACT alone. */ 7120 7121 switch(*tempcode) 7122 { 7123 case OP_TYPEEXACT: 7124 tempcode += PRIV(OP_lengths)[*tempcode] + 7125 ((tempcode[1 + IMM2_SIZE] == OP_PROP 7126 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 7127 break; 7128 7129 /* CHAR opcodes are used for exacts whose count is 1. */ 7130 7131 case OP_CHAR: 7132 case OP_CHARI: 7133 case OP_NOT: 7134 case OP_NOTI: 7135 case OP_EXACT: 7136 case OP_EXACTI: 7137 case OP_NOTEXACT: 7138 case OP_NOTEXACTI: 7139 tempcode += PRIV(OP_lengths)[*tempcode]; 7140 #ifdef SUPPORT_UNICODE 7141 if (utf && HAS_EXTRALEN(tempcode[-1])) 7142 tempcode += GET_EXTRALEN(tempcode[-1]); 7143 #endif 7144 break; 7145 7146 /* For the class opcodes, the repeat operator appears at the end; 7147 adjust tempcode to point to it. */ 7148 7149 case OP_CLASS: 7150 case OP_NCLASS: 7151 tempcode += 1 + 32/sizeof(PCRE2_UCHAR); 7152 break; 7153 7154 #ifdef SUPPORT_WIDE_CHARS 7155 case OP_XCLASS: 7156 tempcode += GET(tempcode, 1); 7157 break; 7158 #endif 7159 } 7160 7161 /* If tempcode is equal to code (which points to the end of the repeated 7162 item), it means we have skipped an EXACT item but there is no following 7163 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In 7164 all other cases, tempcode will be pointing to the repeat opcode, and will 7165 be less than code, so the value of len will be greater than 0. 
*/ 7166 7167 len = (int)(code - tempcode); 7168 if (len > 0) 7169 { 7170 unsigned int repcode = *tempcode; 7171 7172 /* There is a table for possessifying opcodes, all of which are less 7173 than OP_CALLOUT. A zero entry means there is no possessified version. 7174 */ 7175 7176 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) 7177 *tempcode = opcode_possessify[repcode]; 7178 7179 /* For opcode without a special possessified version, wrap the item in 7180 ONCE brackets. */ 7181 7182 else 7183 { 7184 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); 7185 code += 1 + LINK_SIZE; 7186 len += 1 + LINK_SIZE; 7187 tempcode[0] = OP_ONCE; 7188 *code++ = OP_KET; 7189 PUTINC(code, 0, len); 7190 PUT(tempcode, 1, len); 7191 } 7192 } 7193 } 7194 7195 /* We set the "follows varying string" flag for subsequently encountered 7196 reqcus if it isn't already set and we have just passed a varying length 7197 item. */ 7198 7199 END_REPEAT: 7200 cb->req_varyopt |= reqvary; 7201 break; 7202 7203 7204 /* ===================================================================*/ 7205 /* Handle a 32-bit data character with a value greater than META_END. */ 7206 7207 case META_BIGVALUE: 7208 pptr++; 7209 goto NORMAL_CHAR; 7210 7211 7212 /* ===============================================================*/ 7213 /* Handle a back reference by number, which is the meta argument. The 7214 pattern offsets for back references to group numbers less than 10 are held 7215 in a special vector, to avoid using more than two parsed pattern elements 7216 in 64-bit environments. We only need the offset to the first occurrence, 7217 because if that doesn't fail, subsequent ones will also be OK. 
*/ 7218 7219 case META_BACKREF: 7220 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg]; 7221 else GETPLUSOFFSET(offset, pptr); 7222 7223 if (meta_arg > cb->bracount) 7224 { 7225 cb->erroroffset = offset; 7226 *errorcodeptr = ERR15; /* Non-existent subpattern */ 7227 return 0; 7228 } 7229 7230 /* Come here from named backref handling when the reference is to a 7231 single group (that is, not to a duplicated name). The back reference 7232 data will have already been updated. We must disable firstcu if not 7233 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':' 7234 later. */ 7235 7236 HANDLE_SINGLE_REFERENCE: 7237 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE; 7238 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; 7239 PUT2INC(code, 0, meta_arg); 7240 7241 /* Update the map of back references, and keep the highest one. We 7242 could do this in parse_regex() for numerical back references, but not 7243 for named back references, because we don't know the numbers to which 7244 named back references refer. So we do it all in this function. */ 7245 7246 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1; 7247 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; 7248 7249 /* Check to see if this back reference is recursive, that it, it 7250 is inside the group that it references. A flag is set so that the 7251 group can be made atomic. */ 7252 7253 for (oc = cb->open_caps; oc != NULL; oc = oc->next) 7254 { 7255 if (oc->number == meta_arg) 7256 { 7257 oc->flag = TRUE; 7258 break; 7259 } 7260 } 7261 break; 7262 7263 7264 /* ===============================================================*/ 7265 /* Handle recursion by inserting the number of the called group (which is 7266 the meta argument) after OP_RECURSE. At the end of compiling the pattern is 7267 scanned and these numbers are replaced by offsets within the pattern. 
It is 7268 done like this to avoid problems with forward references and adjusting 7269 offsets when groups are duplicated and moved (as discovered in previous 7270 implementations). Note that a recursion does not have a set first character 7271 (relevant if it is repeated, because it will then be wrapped with ONCE 7272 brackets). */ 7273 7274 case META_RECURSE: 7275 GETPLUSOFFSET(offset, pptr); 7276 if (meta_arg > cb->bracount) 7277 { 7278 cb->erroroffset = offset; 7279 *errorcodeptr = ERR15; /* Non-existent subpattern */ 7280 return 0; 7281 } 7282 HANDLE_NUMERICAL_RECURSION: 7283 *code = OP_RECURSE; 7284 PUT(code, 1, meta_arg); 7285 code += 1 + LINK_SIZE; 7286 groupsetfirstcu = FALSE; 7287 cb->had_recurse = TRUE; 7288 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 7289 break; 7290 7291 7292 /* ===============================================================*/ 7293 /* Handle capturing parentheses; the number is the meta argument. */ 7294 7295 case META_CAPTURE: 7296 bravalue = OP_CBRA; 7297 skipunits = IMM2_SIZE; 7298 PUT2(code, 1+LINK_SIZE, meta_arg); 7299 cb->lastcapture = meta_arg; 7300 goto GROUP_PROCESS_NOTE_EMPTY; 7301 7302 7303 /* ===============================================================*/ 7304 /* Handle escape sequence items. For ones like \d, the ESC_values are 7305 arranged to be the same as the corresponding OP_values in the default case 7306 when PCRE2_UCP is not set (which is the only case in which they will appear 7307 here). 7308 7309 Note: \Q and \E are never seen here, as they were dealt with in 7310 parse_pattern(). Neither are numerical back references or recursions, which 7311 were turned into META_BACKREF or META_RECURSE items, respectively. \k and 7312 \g, when followed by names, are turned into META_BACKREF_BYNAME or 7313 META_RECURSE_BYNAME. 
*/ 7314 7315 case META_ESCAPE: 7316 7317 /* We can test for escape sequences that consume a character because their 7318 values lie between ESC_b and ESC_Z; this may have to change if any new ones 7319 are ever created. For these sequences, we disable the setting of a first 7320 character if it hasn't already been set. */ 7321 7322 if (meta_arg > ESC_b && meta_arg < ESC_Z) 7323 { 7324 matched_char = TRUE; 7325 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 7326 } 7327 7328 /* Set values to reset to if this is followed by a zero repeat. */ 7329 7330 zerofirstcu = firstcu; 7331 zerofirstcuflags = firstcuflags; 7332 zeroreqcu = reqcu; 7333 zeroreqcuflags = reqcuflags; 7334 7335 /* If Unicode is not supported, \P and \p are not allowed and are 7336 faulted at parse time, so will never appear here. */ 7337 7338 #ifdef SUPPORT_UNICODE 7339 if (meta_arg == ESC_P || meta_arg == ESC_p) 7340 { 7341 uint32_t ptype = *(++pptr) >> 16; 7342 uint32_t pdata = *pptr & 0xffff; 7343 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP; 7344 *code++ = ptype; 7345 *code++ = pdata; 7346 break; /* End META_ESCAPE */ 7347 } 7348 #endif 7349 7350 /* For the rest (including \X when Unicode is supported - if not it's 7351 faulted at parse time), the OP value is the escape value when PCRE2_UCP is 7352 not set; if it is set, these escapes do not show up here because they are 7353 converted into Unicode property tests in parse_regex(). Note that \b and \B 7354 do a one-character lookbehind, and \A also behaves as if it does. */ 7355 7356 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */ 7357 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) && 7358 cb->max_lookbehind == 0) 7359 cb->max_lookbehind = 1; 7360 7361 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY 7362 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */ 7363 7364 #if PCRE2_CODE_UNIT_WIDTH == 32 7365 *code++ = (meta_arg == ESC_C)? 
OP_ALLANY : meta_arg; 7366 #else 7367 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg; 7368 #endif 7369 break; /* End META_ESCAPE */ 7370 7371 7372 /* ===================================================================*/ 7373 /* Handle an unrecognized meta value. A parsed pattern value less than 7374 META_END is a literal. Otherwise we have a problem. */ 7375 7376 default: 7377 if (meta >= META_END) 7378 { 7379 #ifdef DEBUG_SHOW_PARSED 7380 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr); 7381 #endif 7382 *errorcodeptr = ERR89; /* Internal error - unrecognized. */ 7383 return 0; 7384 } 7385 7386 /* Handle a literal character. We come here by goto in the case of a 7387 32-bit, non-UTF character whose value is greater than META_END. */ 7388 7389 NORMAL_CHAR: 7390 meta = *pptr; /* Get the full 32 bits */ 7391 NORMAL_CHAR_SET: /* Character is already in meta */ 7392 matched_char = TRUE; 7393 7394 /* For caseless UTF mode, check whether this character has more than one 7395 other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ 7396 7397 #ifdef SUPPORT_UNICODE 7398 if (utf && (options & PCRE2_CASELESS) != 0) 7399 { 7400 uint32_t caseset = UCD_CASESET(meta); 7401 if (caseset != 0) 7402 { 7403 *code++ = OP_PROP; 7404 *code++ = PT_CLIST; 7405 *code++ = caseset; 7406 if (firstcuflags == REQ_UNSET) 7407 firstcuflags = zerofirstcuflags = REQ_NONE; 7408 break; /* End handling this meta item */ 7409 } 7410 } 7411 #endif 7412 7413 /* Caseful matches, or not one of the multicase characters. Get the 7414 character's code units into mcbuffer, with the length in mclength. When not 7415 in UTF mode, the length is always 1. */ 7416 7417 #ifdef SUPPORT_UNICODE 7418 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else 7419 #endif 7420 { 7421 mclength = 1; 7422 mcbuffer[0] = meta; 7423 } 7424 7425 /* Generate the appropriate code */ 7426 7427 *code++ = ((options & PCRE2_CASELESS) != 0)? 
OP_CHARI : OP_CHAR; 7428 memcpy(code, mcbuffer, CU2BYTES(mclength)); 7429 code += mclength; 7430 7431 /* Remember if \r or \n were seen */ 7432 7433 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) 7434 cb->external_flags |= PCRE2_HASCRORLF; 7435 7436 /* Set the first and required code units appropriately. If no previous 7437 first code unit, set it from this character, but revert to none on a zero 7438 repeat. Otherwise, leave the firstcu value alone, and don't change it on 7439 a zero repeat. */ 7440 7441 if (firstcuflags == REQ_UNSET) 7442 { 7443 zerofirstcuflags = REQ_NONE; 7444 zeroreqcu = reqcu; 7445 zeroreqcuflags = reqcuflags; 7446 7447 /* If the character is more than one code unit long, we can set firstcu 7448 only if it is not to be matched caselessly. */ 7449 7450 if (mclength == 1 || req_caseopt == 0) 7451 { 7452 firstcu = mcbuffer[0]; 7453 firstcuflags = req_caseopt; 7454 if (mclength != 1) 7455 { 7456 reqcu = code[-1]; 7457 reqcuflags = cb->req_varyopt; 7458 } 7459 } 7460 else firstcuflags = reqcuflags = REQ_NONE; 7461 } 7462 7463 /* firstcu was previously set; we can set reqcu only if the length is 7464 1 or the matching is caseful. */ 7465 7466 else 7467 { 7468 zerofirstcu = firstcu; 7469 zerofirstcuflags = firstcuflags; 7470 zeroreqcu = reqcu; 7471 zeroreqcuflags = reqcuflags; 7472 if (mclength == 1 || req_caseopt == 0) 7473 { 7474 reqcu = code[-1]; 7475 reqcuflags = req_caseopt | cb->req_varyopt; 7476 } 7477 } 7478 break; /* End default meta handling */ 7479 } /* End of big switch */ 7480 } /* End of big loop */ 7481 7482 /* Control never reaches here. */ 7483 } 7484 7485 7486 7487 /************************************************* 7488 * Compile regex: a sequence of alternatives * 7489 *************************************************/ 7490 7491 /* On entry, pptr is pointing past the bracket meta, but on return it points to 7492 the closing bracket or META_END. 
The code variable is pointing at the code unit 7493 into which the BRA operator has been stored. This function is used during the 7494 pre-compile phase when we are trying to find out the amount of memory needed, 7495 as well as during the real compile phase. The value of lengthptr distinguishes 7496 the two phases. 7497 7498 Arguments: 7499 options option bits, including any changes for this subpattern 7500 codeptr -> the address of the current code pointer 7501 pptrptr -> the address of the current parsed pattern pointer 7502 errorcodeptr -> pointer to error code variable 7503 skipunits skip this many code units at start (for brackets and OP_COND) 7504 firstcuptr place to put the first required code unit 7505 firstcuflagsptr place to put the first code unit flags, or a negative number 7506 reqcuptr place to put the last required code unit 7507 reqcuflagsptr place to put the last required code unit flags, or a negative number 7508 bcptr pointer to the chain of currently open branches 7509 cb points to the data block with tables pointers etc. 
7510 lengthptr NULL during the real compile phase 7511 points to length accumulator during pre-compile phase 7512 7513 Returns: 0 There has been an error 7514 +1 Success, this group must match at least one character 7515 -1 Success, this group may match an empty string 7516 */ 7517 7518 static int 7519 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, 7520 int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr, 7521 int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr, 7522 branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr) 7523 { 7524 PCRE2_UCHAR *code = *codeptr; 7525 PCRE2_UCHAR *last_branch = code; 7526 PCRE2_UCHAR *start_bracket = code; 7527 BOOL lookbehind; 7528 open_capitem capitem; 7529 int capnumber = 0; 7530 int okreturn = 1; 7531 uint32_t *pptr = *pptrptr; 7532 uint32_t firstcu, reqcu; 7533 uint32_t lookbehindlength; 7534 int32_t firstcuflags, reqcuflags; 7535 uint32_t branchfirstcu, branchreqcu; 7536 int32_t branchfirstcuflags, branchreqcuflags; 7537 PCRE2_SIZE length; 7538 branch_chain bc; 7539 7540 /* If set, call the external function that checks for stack availability. */ 7541 7542 if (cb->cx->stack_guard != NULL && 7543 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data)) 7544 { 7545 *errorcodeptr= ERR33; 7546 return 0; 7547 } 7548 7549 /* Miscellaneous initialization */ 7550 7551 bc.outer = bcptr; 7552 bc.current_branch = code; 7553 7554 firstcu = reqcu = 0; 7555 firstcuflags = reqcuflags = REQ_UNSET; 7556 7557 /* Accumulate the length for use in the pre-compile phase. Start with the 7558 length of the BRA and KET and any extra code units that are required at the 7559 beginning. We accumulate in a local variable to save frequent testing of 7560 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the 7561 start and end of each alternative, because compiled items are discarded during 7562 the pre-compile phase so that the workspace is not exceeded. 
*/ 7563 7564 length = 2 + 2*LINK_SIZE + skipunits; 7565 7566 /* Remember if this is a lookbehind assertion, and if it is, save its length 7567 and skip over the pattern offset. */ 7568 7569 lookbehind = *code == OP_ASSERTBACK || *code == OP_ASSERTBACK_NOT; 7570 if (lookbehind) 7571 { 7572 lookbehindlength = META_DATA(pptr[-1]); 7573 pptr += SIZEOFFSET; 7574 } 7575 else lookbehindlength = 0; 7576 7577 /* If this is a capturing subpattern, add to the chain of open capturing items 7578 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA 7579 need be tested here; changing this opcode to one of its variants, e.g. 7580 OP_SCBRAPOS, happens later, after the group has been compiled. */ 7581 7582 if (*code == OP_CBRA) 7583 { 7584 capnumber = GET2(code, 1 + LINK_SIZE); 7585 capitem.number = capnumber; 7586 capitem.next = cb->open_caps; 7587 capitem.flag = FALSE; 7588 capitem.assert_depth = cb->assert_depth; 7589 cb->open_caps = &capitem; 7590 } 7591 7592 /* Offset is set zero to mark that this bracket is still open */ 7593 7594 PUT(code, 1, 0); 7595 code += 1 + LINK_SIZE + skipunits; 7596 7597 /* Loop for each alternative branch */ 7598 7599 for (;;) 7600 { 7601 int branch_return; 7602 7603 /* Insert OP_REVERSE if this is as lookbehind assertion. */ 7604 7605 if (lookbehind && lookbehindlength > 0) 7606 { 7607 *code++ = OP_REVERSE; 7608 PUTINC(code, 0, lookbehindlength); 7609 length += 1 + LINK_SIZE; 7610 } 7611 7612 /* Now compile the branch; in the pre-compile phase its length gets added 7613 into the length. */ 7614 7615 if ((branch_return = 7616 compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu, 7617 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc, 7618 cb, (lengthptr == NULL)? NULL : &length)) == 0) 7619 return 0; 7620 7621 /* If a branch can match an empty string, so can the whole group. 
*/ 7622 7623 if (branch_return < 0) okreturn = -1; 7624 7625 /* In the real compile phase, there is some post-processing to be done. */ 7626 7627 if (lengthptr == NULL) 7628 { 7629 /* If this is the first branch, the firstcu and reqcu values for the 7630 branch become the values for the regex. */ 7631 7632 if (*last_branch != OP_ALT) 7633 { 7634 firstcu = branchfirstcu; 7635 firstcuflags = branchfirstcuflags; 7636 reqcu = branchreqcu; 7637 reqcuflags = branchreqcuflags; 7638 } 7639 7640 /* If this is not the first branch, the first char and reqcu have to 7641 match the values from all the previous branches, except that if the 7642 previous value for reqcu didn't have REQ_VARY set, it can still match, 7643 and we set REQ_VARY for the regex. */ 7644 7645 else 7646 { 7647 /* If we previously had a firstcu, but it doesn't match the new branch, 7648 we have to abandon the firstcu for the regex, but if there was 7649 previously no reqcu, it takes on the value of the old firstcu. */ 7650 7651 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu) 7652 { 7653 if (firstcuflags >= 0) 7654 { 7655 if (reqcuflags < 0) 7656 { 7657 reqcu = firstcu; 7658 reqcuflags = firstcuflags; 7659 } 7660 } 7661 firstcuflags = REQ_NONE; 7662 } 7663 7664 /* If we (now or from before) have no firstcu, a firstcu from the 7665 branch becomes a reqcu if there isn't a branch reqcu. */ 7666 7667 if (firstcuflags < 0 && branchfirstcuflags >= 0 && 7668 branchreqcuflags < 0) 7669 { 7670 branchreqcu = branchfirstcu; 7671 branchreqcuflags = branchfirstcuflags; 7672 } 7673 7674 /* Now ensure that the reqcus match */ 7675 7676 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) || 7677 reqcu != branchreqcu) 7678 reqcuflags = REQ_NONE; 7679 else 7680 { 7681 reqcu = branchreqcu; 7682 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */ 7683 } 7684 } 7685 } 7686 7687 /* Handle reaching the end of the expression, either ')' or end of pattern. 
7688 In the real compile phase, go back through the alternative branches and 7689 reverse the chain of offsets, with the field in the BRA item now becoming an 7690 offset to the first alternative. If there are no alternatives, it points to 7691 the end of the group. The length in the terminating ket is always the length 7692 of the whole bracketed item. Return leaving the pointer at the terminating 7693 char. */ 7694 7695 if (META_CODE(*pptr) != META_ALT) 7696 { 7697 if (lengthptr == NULL) 7698 { 7699 PCRE2_SIZE branch_length = code - last_branch; 7700 do 7701 { 7702 PCRE2_SIZE prev_length = GET(last_branch, 1); 7703 PUT(last_branch, 1, branch_length); 7704 branch_length = prev_length; 7705 last_branch -= branch_length; 7706 } 7707 while (branch_length > 0); 7708 } 7709 7710 /* Fill in the ket */ 7711 7712 *code = OP_KET; 7713 PUT(code, 1, (int)(code - start_bracket)); 7714 code += 1 + LINK_SIZE; 7715 7716 /* If it was a capturing subpattern, check to see if it contained any 7717 recursive back references. If so, we must wrap it in atomic brackets. In 7718 any event, remove the block from the chain. 
*/ 7719 7720 if (capnumber > 0) 7721 { 7722 if (cb->open_caps->flag) 7723 { 7724 (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket, 7725 CU2BYTES(code - start_bracket)); 7726 *start_bracket = OP_ONCE; 7727 code += 1 + LINK_SIZE; 7728 PUT(start_bracket, 1, (int)(code - start_bracket)); 7729 *code = OP_KET; 7730 PUT(code, 1, (int)(code - start_bracket)); 7731 code += 1 + LINK_SIZE; 7732 length += 2 + 2*LINK_SIZE; 7733 } 7734 cb->open_caps = cb->open_caps->next; 7735 } 7736 7737 /* Set values to pass back */ 7738 7739 *codeptr = code; 7740 *pptrptr = pptr; 7741 *firstcuptr = firstcu; 7742 *firstcuflagsptr = firstcuflags; 7743 *reqcuptr = reqcu; 7744 *reqcuflagsptr = reqcuflags; 7745 if (lengthptr != NULL) 7746 { 7747 if (OFLOW_MAX - *lengthptr < length) 7748 { 7749 *errorcodeptr = ERR20; 7750 return 0; 7751 } 7752 *lengthptr += length; 7753 } 7754 return okreturn; 7755 } 7756 7757 /* Another branch follows. In the pre-compile phase, we can move the code 7758 pointer back to where it was for the start of the first branch. (That is, 7759 pretend that each branch is the only one.) 7760 7761 In the real compile phase, insert an ALT node. Its length field points back 7762 to the previous branch while the bracket remains open. At the end the chain 7763 is reversed. It's done like this so that the start of the bracket has a 7764 zero offset until it is closed, making it possible to detect recursion. */ 7765 7766 if (lengthptr != NULL) 7767 { 7768 code = *codeptr + 1 + LINK_SIZE + skipunits; 7769 length += 1 + LINK_SIZE; 7770 } 7771 else 7772 { 7773 *code = OP_ALT; 7774 PUT(code, 1, (int)(code - last_branch)); 7775 bc.current_branch = last_branch = code; 7776 code += 1 + LINK_SIZE; 7777 } 7778 7779 /* Set the lookbehind length (if not in a lookbehind the value will be zero) 7780 and then advance past the vertical bar. 
*/ 7781 7782 lookbehindlength = META_DATA(*pptr); 7783 pptr++; 7784 } 7785 /* Control never reaches here */ 7786 } 7787 7788 7789 7790 /************************************************* 7791 * Check for anchored pattern * 7792 *************************************************/ 7793 7794 /* Try to find out if this is an anchored regular expression. Consider each 7795 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket 7796 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then 7797 it's anchored. However, if this is a multiline pattern, then only OP_SOD will 7798 be found, because ^ generates OP_CIRCM in that mode. 7799 7800 We can also consider a regex to be anchored if OP_SOM starts all its branches. 7801 This is the code for \G, which means "match at start of match position, taking 7802 into account the match offset". 7803 7804 A branch is also implicitly anchored if it starts with .* and DOTALL is set, 7805 because that will try the rest of the pattern at all possible matching points, 7806 so there is no point trying again.... er .... 7807 7808 .... except when the .* appears inside capturing parentheses, and there is a 7809 subsequent back reference to those parentheses. We haven't enough information 7810 to catch that case precisely. 7811 7812 At first, the best we could do was to detect when .* was in capturing brackets 7813 and the highest back reference was greater than or equal to that level. 7814 However, by keeping a bitmap of the first 31 back references, we can catch some 7815 of the more common cases more precisely. 7816 7817 ... A second exception is when the .* appears inside an atomic group, because 7818 this prevents the number of characters it matches from being adjusted. 
7819 7820 Arguments: 7821 code points to start of the compiled pattern 7822 bracket_map a bitmap of which brackets we are inside while testing; this 7823 handles up to substring 31; after that we just have to take 7824 the less precise approach 7825 cb points to the compile data block 7826 atomcount atomic group level 7827 inassert TRUE if in an assertion 7828 7829 Returns: TRUE or FALSE 7830 */ 7831 7832 static BOOL 7833 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, 7834 int atomcount, BOOL inassert) 7835 { 7836 do { 7837 PCRE2_SPTR scode = first_significant_code( 7838 code + PRIV(OP_lengths)[*code], FALSE); 7839 int op = *scode; 7840 7841 /* Non-capturing brackets */ 7842 7843 if (op == OP_BRA || op == OP_BRAPOS || 7844 op == OP_SBRA || op == OP_SBRAPOS) 7845 { 7846 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) 7847 return FALSE; 7848 } 7849 7850 /* Capturing brackets */ 7851 7852 else if (op == OP_CBRA || op == OP_CBRAPOS || 7853 op == OP_SCBRA || op == OP_SCBRAPOS) 7854 { 7855 int n = GET2(scode, 1+LINK_SIZE); 7856 int new_map = bracket_map | ((n < 32)? (1u << n) : 1); 7857 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE; 7858 } 7859 7860 /* Positive forward assertion */ 7861 7862 else if (op == OP_ASSERT) 7863 { 7864 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; 7865 } 7866 7867 /* Condition. If there is no second branch, it can't be anchored. 
*/ 7868 7869 else if (op == OP_COND || op == OP_SCOND) 7870 { 7871 if (scode[GET(scode,1)] != OP_ALT) return FALSE; 7872 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) 7873 return FALSE; 7874 } 7875 7876 /* Atomic groups */ 7877 7878 else if (op == OP_ONCE) 7879 { 7880 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert)) 7881 return FALSE; 7882 } 7883 7884 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and 7885 it isn't in brackets that are or may be referenced or inside an atomic 7886 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP, 7887 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/ 7888 with the subject "aab", which matches "b", i.e. not at the start of a line. 7889 There is also an option that disables auto-anchoring. */ 7890 7891 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || 7892 op == OP_TYPEPOSSTAR)) 7893 { 7894 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || 7895 atomcount > 0 || cb->had_pruneorskip || inassert || 7896 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) 7897 return FALSE; 7898 } 7899 7900 /* Check for explicit anchoring */ 7901 7902 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; 7903 7904 code += GET(code, 1); 7905 } 7906 while (*code == OP_ALT); /* Loop for each alternative */ 7907 return TRUE; 7908 } 7909 7910 7911 7912 /************************************************* 7913 * Check for starting with ^ or .* * 7914 *************************************************/ 7915 7916 /* This is called to find out if every branch starts with ^ or .* so that 7917 "first char" processing can be done to speed things up in multiline 7918 matching and for non-DOTALL patterns that start with .* (which must start at 7919 the beginning or after \n). 
As in the case of is_anchored() (see above), we 7920 have to take account of back references to capturing brackets that contain .* 7921 because in that case we can't make the assumption. Also, the appearance of .* 7922 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE 7923 or *SKIP does not count, because once again the assumption no longer holds. 7924 7925 Arguments: 7926 code points to start of the compiled pattern or a group 7927 bracket_map a bitmap of which brackets we are inside while testing; this 7928 handles up to substring 31; after that we just have to take 7929 the less precise approach 7930 cb points to the compile data 7931 atomcount atomic group level 7932 inassert TRUE if in an assertion 7933 7934 Returns: TRUE or FALSE 7935 */ 7936 7937 static BOOL 7938 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, 7939 int atomcount, BOOL inassert) 7940 { 7941 do { 7942 PCRE2_SPTR scode = first_significant_code( 7943 code + PRIV(OP_lengths)[*code], FALSE); 7944 int op = *scode; 7945 7946 /* If we are at the start of a conditional assertion group, *both* the 7947 conditional assertion *and* what follows the condition must satisfy the test 7948 for start of line. Other kinds of condition fail. Note that there may be an 7949 auto-callout at the start of a condition. 
*/ 7950 7951 if (op == OP_COND) 7952 { 7953 scode += 1 + LINK_SIZE; 7954 7955 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; 7956 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); 7957 7958 switch (*scode) 7959 { 7960 case OP_CREF: 7961 case OP_DNCREF: 7962 case OP_RREF: 7963 case OP_DNRREF: 7964 case OP_FAIL: 7965 case OP_FALSE: 7966 case OP_TRUE: 7967 return FALSE; 7968 7969 default: /* Assertion */ 7970 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; 7971 do scode += GET(scode, 1); while (*scode == OP_ALT); 7972 scode += 1 + LINK_SIZE; 7973 break; 7974 } 7975 scode = first_significant_code(scode, FALSE); 7976 op = *scode; 7977 } 7978 7979 /* Non-capturing brackets */ 7980 7981 if (op == OP_BRA || op == OP_BRAPOS || 7982 op == OP_SBRA || op == OP_SBRAPOS) 7983 { 7984 if (!is_startline(scode, bracket_map, cb, atomcount, inassert)) 7985 return FALSE; 7986 } 7987 7988 /* Capturing brackets */ 7989 7990 else if (op == OP_CBRA || op == OP_CBRAPOS || 7991 op == OP_SCBRA || op == OP_SCBRAPOS) 7992 { 7993 int n = GET2(scode, 1+LINK_SIZE); 7994 int new_map = bracket_map | ((n < 32)? (1u << n) : 1); 7995 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE; 7996 } 7997 7998 /* Positive forward assertions */ 7999 8000 else if (op == OP_ASSERT) 8001 { 8002 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) 8003 return FALSE; 8004 } 8005 8006 /* Atomic brackets */ 8007 8008 else if (op == OP_ONCE) 8009 { 8010 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert)) 8011 return FALSE; 8012 } 8013 8014 /* .* means "start at start or after \n" if it isn't in atomic brackets or 8015 brackets that may be referenced or an assertion, and as long as the pattern 8016 does not contain *PRUNE or *SKIP, because these break the feature. Consider, 8017 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", 8018 i.e. not at the start of a line. 
There is also an option that disables this 8019 optimization. */ 8020 8021 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) 8022 { 8023 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || 8024 atomcount > 0 || cb->had_pruneorskip || inassert || 8025 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) 8026 return FALSE; 8027 } 8028 8029 /* Check for explicit circumflex; anything else gives a FALSE result. Note 8030 in particular that this includes atomic brackets OP_ONCE because the number 8031 of characters matched by .* cannot be adjusted inside them. */ 8032 8033 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; 8034 8035 /* Move on to the next alternative */ 8036 8037 code += GET(code, 1); 8038 } 8039 while (*code == OP_ALT); /* Loop for each alternative */ 8040 return TRUE; 8041 } 8042 8043 8044 8045 /************************************************* 8046 * Scan compiled regex for recursion reference * 8047 *************************************************/ 8048 8049 /* This function scans through a compiled pattern until it finds an instance of 8050 OP_RECURSE. 8051 8052 Arguments: 8053 code points to start of expression 8054 utf TRUE in UTF mode 8055 8056 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found 8057 */ 8058 8059 static PCRE2_SPTR 8060 find_recurse(PCRE2_SPTR code, BOOL utf) 8061 { 8062 for (;;) 8063 { 8064 PCRE2_UCHAR c = *code; 8065 if (c == OP_END) return NULL; 8066 if (c == OP_RECURSE) return code; 8067 8068 /* XCLASS is used for classes that cannot be represented just by a bit map. 8069 This includes negated single high-valued characters. CALLOUT_STR is used for 8070 callouts with string arguments. In both cases the length in the table is 8071 zero; the actual length is stored in the compiled code. 
*/ 8072 8073 if (c == OP_XCLASS) code += GET(code, 1); 8074 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); 8075 8076 /* Otherwise, we can get the item's length from the table, except that for 8077 repeated character types, we have to test for \p and \P, which have an extra 8078 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, 8079 we must add in its length. */ 8080 8081 else 8082 { 8083 switch(c) 8084 { 8085 case OP_TYPESTAR: 8086 case OP_TYPEMINSTAR: 8087 case OP_TYPEPLUS: 8088 case OP_TYPEMINPLUS: 8089 case OP_TYPEQUERY: 8090 case OP_TYPEMINQUERY: 8091 case OP_TYPEPOSSTAR: 8092 case OP_TYPEPOSPLUS: 8093 case OP_TYPEPOSQUERY: 8094 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 8095 break; 8096 8097 case OP_TYPEPOSUPTO: 8098 case OP_TYPEUPTO: 8099 case OP_TYPEMINUPTO: 8100 case OP_TYPEEXACT: 8101 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 8102 code += 2; 8103 break; 8104 8105 case OP_MARK: 8106 case OP_COMMIT_ARG: 8107 case OP_PRUNE_ARG: 8108 case OP_SKIP_ARG: 8109 case OP_THEN_ARG: 8110 code += code[1]; 8111 break; 8112 } 8113 8114 /* Add in the fixed length from the table */ 8115 8116 code += PRIV(OP_lengths)[c]; 8117 8118 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may 8119 be followed by a multi-unit character. The length in the table is a 8120 minimum, so we have to arrange to skip the extra units. 
*/ 8121 8122 #ifdef MAYBE_UTF_MULTI 8123 if (utf) switch(c) 8124 { 8125 case OP_CHAR: 8126 case OP_CHARI: 8127 case OP_NOT: 8128 case OP_NOTI: 8129 case OP_EXACT: 8130 case OP_EXACTI: 8131 case OP_NOTEXACT: 8132 case OP_NOTEXACTI: 8133 case OP_UPTO: 8134 case OP_UPTOI: 8135 case OP_NOTUPTO: 8136 case OP_NOTUPTOI: 8137 case OP_MINUPTO: 8138 case OP_MINUPTOI: 8139 case OP_NOTMINUPTO: 8140 case OP_NOTMINUPTOI: 8141 case OP_POSUPTO: 8142 case OP_POSUPTOI: 8143 case OP_NOTPOSUPTO: 8144 case OP_NOTPOSUPTOI: 8145 case OP_STAR: 8146 case OP_STARI: 8147 case OP_NOTSTAR: 8148 case OP_NOTSTARI: 8149 case OP_MINSTAR: 8150 case OP_MINSTARI: 8151 case OP_NOTMINSTAR: 8152 case OP_NOTMINSTARI: 8153 case OP_POSSTAR: 8154 case OP_POSSTARI: 8155 case OP_NOTPOSSTAR: 8156 case OP_NOTPOSSTARI: 8157 case OP_PLUS: 8158 case OP_PLUSI: 8159 case OP_NOTPLUS: 8160 case OP_NOTPLUSI: 8161 case OP_MINPLUS: 8162 case OP_MINPLUSI: 8163 case OP_NOTMINPLUS: 8164 case OP_NOTMINPLUSI: 8165 case OP_POSPLUS: 8166 case OP_POSPLUSI: 8167 case OP_NOTPOSPLUS: 8168 case OP_NOTPOSPLUSI: 8169 case OP_QUERY: 8170 case OP_QUERYI: 8171 case OP_NOTQUERY: 8172 case OP_NOTQUERYI: 8173 case OP_MINQUERY: 8174 case OP_MINQUERYI: 8175 case OP_NOTMINQUERY: 8176 case OP_NOTMINQUERYI: 8177 case OP_POSQUERY: 8178 case OP_POSQUERYI: 8179 case OP_NOTPOSQUERY: 8180 case OP_NOTPOSQUERYI: 8181 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); 8182 break; 8183 } 8184 #else 8185 (void)(utf); /* Keep compiler happy by referencing function argument */ 8186 #endif /* MAYBE_UTF_MULTI */ 8187 } 8188 } 8189 } 8190 8191 8192 8193 /************************************************* 8194 * Check for asserted fixed first code unit * 8195 *************************************************/ 8196 8197 /* During compilation, the "first code unit" settings from forward assertions 8198 are discarded, because they can cause conflicts with actual literals that 8199 follow. 
However, if we end up without a first code unit setting for an
unanchored pattern, it is worth scanning the regex to see if there is an
initial asserted first code unit. If all branches start with the same asserted
code unit, or with a non-conditional bracket all of whose alternatives start
with the same asserted code unit (recurse ad lib), then we return that code
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
REQ_NONE in the flags.

"The same" means the same code unit with the same case sensitivity. If one
alternative asserts a caseful literal and another asserts that literal
caselessly, for example (?=a|(?i)a), no first code unit is reported: a caseful
setting would wrongly rule out a subject starting with the other case.

Arguments:
  code       points to start of compiled pattern
  flags      points to the first code unit flags
  inassert   non-zero if in an assertion

Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
*/

static uint32_t
find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
{
uint32_t c = 0;
int32_t cflags = REQ_NONE;   /* Negative until a first code unit is recorded */

*flags = REQ_NONE;
do {
   uint32_t d;
   int32_t dflags;           /* Same type as the flags parameter it is passed to */
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
   PCRE2_UCHAR op = *scode;

   switch(op)
     {
     default:
     return 0;

     /* Groups and assertions: recurse, counting the assertion nesting level.
     The result must agree, in both value and flags, with what has been found
     in earlier alternatives. */

     case OP_BRA:
     case OP_BRAPOS:
     case OP_CBRA:
     case OP_SCBRA:
     case OP_CBRAPOS:
     case OP_SCBRAPOS:
     case OP_ASSERT:
     case OP_ONCE:
     d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
     if (dflags < 0)
       return 0;
     if (cflags < 0) { c = d; cflags = dflags; }
       else if (c != d || cflags != dflags) return 0;
     break;

     case OP_EXACT:
     scode += IMM2_SIZE;
     /* Fall through */

     /* Caseful literals count only when inside an assertion. */

     case OP_CHAR:
     case OP_PLUS:
     case OP_MINPLUS:
     case OP_POSPLUS:
     if (inassert == 0) return 0;
     if (cflags < 0) { c = scode[1]; cflags = 0; }
       else if (c != scode[1] || cflags != 0) return 0;
     break;

     case OP_EXACTI:
     scode += IMM2_SIZE;
     /* Fall through */

     /* Caseless literals, likewise only when inside an assertion. */

     case OP_CHARI:
     case OP_PLUSI:
     case OP_MINPLUSI:
     case OP_POSPLUSI:
     if (inassert == 0) return 0;
     if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
       else if (c != scode[1] || cflags != REQ_CASELESS) return 0;
     break;
     }

   code += GET(code, 1);
   }
while (*code == OP_ALT);

*flags = cflags;
return c;
}



/*************************************************
*     Add an entry to the name/number table      *
*************************************************/

/* This function is called between compiling passes to add an entry to the
name/number table, maintaining alphabetical order. Checking for permitted
and forbidden duplicates has already been done.
8294 8295 Arguments: 8296 cb the compile data block 8297 name the name to add 8298 length the length of the name 8299 groupno the group number 8300 tablecount the count of names in the table so far 8301 8302 Returns: nothing 8303 */ 8304 8305 static void 8306 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length, 8307 unsigned int groupno, uint32_t tablecount) 8308 { 8309 uint32_t i; 8310 PCRE2_UCHAR *slot = cb->name_table; 8311 8312 for (i = 0; i < tablecount; i++) 8313 { 8314 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length)); 8315 if (crc == 0 && slot[IMM2_SIZE+length] != 0) 8316 crc = -1; /* Current name is a substring */ 8317 8318 /* Make space in the table and break the loop for an earlier name. For a 8319 duplicate or later name, carry on. We do this for duplicates so that in the 8320 simple case (when ?(| is not used) they are in order of their numbers. In all 8321 cases they are in the order in which they appear in the pattern. */ 8322 8323 if (crc < 0) 8324 { 8325 (void)memmove(slot + cb->name_entry_size, slot, 8326 CU2BYTES((tablecount - i) * cb->name_entry_size)); 8327 break; 8328 } 8329 8330 /* Continue the loop for a later or duplicate name */ 8331 8332 slot += cb->name_entry_size; 8333 } 8334 8335 PUT2(slot, 0, groupno); 8336 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length)); 8337 8338 /* Add a terminating zero and fill the rest of the slot with zeroes so that 8339 the memory is all initialized. Otherwise valgrind moans about uninitialized 8340 memory when saving serialized compiled patterns. */ 8341 8342 memset(slot + IMM2_SIZE + length, 0, 8343 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE)); 8344 } 8345 8346 8347 8348 /************************************************* 8349 * Skip in parsed pattern * 8350 *************************************************/ 8351 8352 /* This function is called to skip parts of the parsed pattern when finding the 8353 length of a lookbehind branch. 
It is called after (*ACCEPT) and (*FAIL) to find 8354 the end of the branch, it is called to skip over an internal lookaround, and it 8355 is also called to skip to the end of a class, during which it will never 8356 encounter nested groups (but there's no need to have special code for that). 8357 8358 When called to find the end of a branch or group, pptr must point to the first 8359 meta code inside the branch, not the branch-starting code. In other cases it 8360 can point to the item that causes the function to be called. 8361 8362 Arguments: 8363 pptr current pointer to skip from 8364 skiptype PSKIP_CLASS when skipping to end of class 8365 PSKIP_ALT when META_ALT ends the skip 8366 PSKIP_KET when only META_KET ends the skip 8367 8368 Returns: new value of pptr 8369 NULL if META_END is reached - should never occur 8370 or for an unknown meta value - likewise 8371 */ 8372 8373 static uint32_t * 8374 parsed_skip(uint32_t *pptr, uint32_t skiptype) 8375 { 8376 uint32_t nestlevel = 0; 8377 8378 for (;; pptr++) 8379 { 8380 uint32_t meta = META_CODE(*pptr); 8381 8382 switch(meta) 8383 { 8384 default: /* Just skip over most items */ 8385 if (meta < META_END) continue; /* Literal */ 8386 break; 8387 8388 /* This should never occur. */ 8389 8390 case META_END: 8391 return NULL; 8392 8393 /* The data for these items is variable in length. */ 8394 8395 case META_BACKREF: /* Offset is present only if group >= 10 */ 8396 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET; 8397 break; 8398 8399 case META_ESCAPE: /* A few escapes are followed by data items. */ 8400 switch (META_DATA(*pptr)) 8401 { 8402 case ESC_P: 8403 case ESC_p: 8404 pptr += 1; 8405 break; 8406 8407 case ESC_g: 8408 case ESC_k: 8409 pptr += 1 + SIZEOFFSET; 8410 break; 8411 } 8412 break; 8413 8414 case META_MARK: /* Add the length of the name. 
*/ 8415 case META_COMMIT_ARG: 8416 case META_PRUNE_ARG: 8417 case META_SKIP_ARG: 8418 case META_THEN_ARG: 8419 pptr += pptr[1]; 8420 break; 8421 8422 /* These are the "active" items in this loop. */ 8423 8424 case META_CLASS_END: 8425 if (skiptype == PSKIP_CLASS) return pptr; 8426 break; 8427 8428 case META_ATOMIC: 8429 case META_CAPTURE: 8430 case META_COND_ASSERT: 8431 case META_COND_DEFINE: 8432 case META_COND_NAME: 8433 case META_COND_NUMBER: 8434 case META_COND_RNAME: 8435 case META_COND_RNUMBER: 8436 case META_COND_VERSION: 8437 case META_LOOKAHEAD: 8438 case META_LOOKAHEADNOT: 8439 case META_LOOKBEHIND: 8440 case META_LOOKBEHINDNOT: 8441 case META_NOCAPTURE: 8442 nestlevel++; 8443 break; 8444 8445 case META_ALT: 8446 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr; 8447 break; 8448 8449 case META_KET: 8450 if (nestlevel == 0) return pptr; 8451 nestlevel--; 8452 break; 8453 } 8454 8455 /* The extra data item length for each meta is in a table. */ 8456 8457 meta = (meta >> 16) & 0x7fff; 8458 if (meta >= sizeof(meta_extra_lengths)) return NULL; 8459 pptr += meta_extra_lengths[meta]; 8460 } 8461 /* Control never reaches here */ 8462 return pptr; 8463 } 8464 8465 8466 8467 /************************************************* 8468 * Find length of a parsed group * 8469 *************************************************/ 8470 8471 /* This is called for nested groups within a branch of a lookbehind whose 8472 length is being computed. If all the branches in the nested group have the same 8473 length, that is OK. On entry, the pointer must be at the first element after 8474 the group initializing code. On exit it points to OP_KET. Caching is used to 8475 improve processing speed when the same capturing group occurs many times. 
8476 8477 Arguments: 8478 pptrptr pointer to pointer in the parsed pattern 8479 isinline FALSE if a reference or recursion; TRUE for inline group 8480 errcodeptr pointer to the errorcode 8481 lcptr pointer to the loop counter 8482 group number of captured group or -1 for a non-capturing group 8483 recurses chain of recurse_check to catch mutual recursion 8484 cb pointer to the compile data 8485 8486 Returns: the group length or a negative number 8487 */ 8488 8489 static int 8490 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr, 8491 int group, parsed_recurse_check *recurses, compile_block *cb) 8492 { 8493 int branchlength; 8494 int grouplength = -1; 8495 8496 /* The cache can be used only if there is no possibility of there being two 8497 groups with the same number. We do not need to set the end pointer for a group 8498 that is being processed as a back reference or recursion, but we must do so for 8499 an inline group. */ 8500 8501 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0) 8502 { 8503 uint32_t groupinfo = cb->groupinfo[group]; 8504 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1; 8505 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) 8506 { 8507 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET); 8508 return groupinfo & GI_FIXED_LENGTH_MASK; 8509 } 8510 } 8511 8512 /* Scan the group. In this case we find the end pointer of necessity. 
*/ 8513 8514 for(;;) 8515 { 8516 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); 8517 if (branchlength < 0) goto ISNOTFIXED; 8518 if (grouplength == -1) grouplength = branchlength; 8519 else if (grouplength != branchlength) goto ISNOTFIXED; 8520 if (**pptrptr == META_KET) break; 8521 *pptrptr += 1; /* Skip META_ALT */ 8522 } 8523 8524 if (group > 0) 8525 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength); 8526 return grouplength; 8527 8528 ISNOTFIXED: 8529 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH; 8530 return -1; 8531 } 8532 8533 8534 8535 /************************************************* 8536 * Find length of a parsed branch * 8537 *************************************************/ 8538 8539 /* Return a fixed length for a branch in a lookbehind, giving an error if the 8540 length is not fixed. If any lookbehinds are encountered on the way, they get 8541 their length set. On entry, *pptrptr points to the first element inside the 8542 branch. On exit it is set to point to the ALT or KET. 8543 8544 Arguments: 8545 pptrptr pointer to pointer in the parsed pattern 8546 errcodeptr pointer to error code 8547 lcptr pointer to loop counter 8548 recurses chain of recurse_check to catch mutual recursion 8549 cb pointer to compile block 8550 8551 Returns: the length, or a negative value on error 8552 */ 8553 8554 static int 8555 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr, 8556 parsed_recurse_check *recurses, compile_block *cb) 8557 { 8558 int branchlength = 0; 8559 int grouplength; 8560 uint32_t lastitemlength = 0; 8561 uint32_t *pptr = *pptrptr; 8562 PCRE2_SIZE offset; 8563 parsed_recurse_check this_recurse; 8564 8565 /* A large and/or complex regex can take too long to process. This can happen 8566 more often when (?| groups are present in the pattern because their length 8567 cannot be cached. 
*/ 8568 8569 if ((*lcptr)++ > 2000) 8570 { 8571 *errcodeptr = ERR35; /* Lookbehind is too complicated */ 8572 return -1; 8573 } 8574 8575 /* Scan the branch, accumulating the length. */ 8576 8577 for (;; pptr++) 8578 { 8579 parsed_recurse_check *r; 8580 uint32_t *gptr, *gptrend; 8581 uint32_t escape; 8582 uint32_t group = 0; 8583 uint32_t itemlength = 0; 8584 8585 if (*pptr < META_END) 8586 { 8587 itemlength = 1; 8588 } 8589 8590 else switch (META_CODE(*pptr)) 8591 { 8592 case META_KET: 8593 case META_ALT: 8594 goto EXIT; 8595 8596 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the 8597 actual termination. */ 8598 8599 case META_ACCEPT: 8600 case META_FAIL: 8601 pptr = parsed_skip(pptr, PSKIP_ALT); 8602 if (pptr == NULL) goto PARSED_SKIP_FAILED; 8603 goto EXIT; 8604 8605 case META_MARK: 8606 case META_COMMIT_ARG: 8607 case META_PRUNE_ARG: 8608 case META_SKIP_ARG: 8609 case META_THEN_ARG: 8610 pptr += pptr[1] + 1; 8611 break; 8612 8613 case META_CIRCUMFLEX: 8614 case META_COMMIT: 8615 case META_DOLLAR: 8616 case META_PRUNE: 8617 case META_SKIP: 8618 case META_THEN: 8619 break; 8620 8621 case META_OPTIONS: 8622 pptr += 1; 8623 break; 8624 8625 case META_BIGVALUE: 8626 itemlength = 1; 8627 pptr += 1; 8628 break; 8629 8630 case META_CLASS: 8631 case META_CLASS_NOT: 8632 itemlength = 1; 8633 pptr = parsed_skip(pptr, PSKIP_CLASS); 8634 if (pptr == NULL) goto PARSED_SKIP_FAILED; 8635 break; 8636 8637 case META_CLASS_EMPTY_NOT: 8638 case META_DOT: 8639 itemlength = 1; 8640 break; 8641 8642 case META_CALLOUT_NUMBER: 8643 pptr += 3; 8644 break; 8645 8646 case META_CALLOUT_STRING: 8647 pptr += 3 + SIZEOFFSET; 8648 break; 8649 8650 /* Only some escapes consume a character. Of those, \R and \X are never 8651 allowed because they might match more than character. \C is allowed only in 8652 32-bit and non-UTF 8/16-bit modes. 
*/ 8653 8654 case META_ESCAPE: 8655 escape = META_DATA(*pptr); 8656 if (escape == ESC_R || escape == ESC_X) return -1; 8657 if (escape > ESC_b && escape < ESC_Z) 8658 { 8659 #if PCRE2_CODE_UNIT_WIDTH != 32 8660 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C) 8661 { 8662 *errcodeptr = ERR36; 8663 return -1; 8664 } 8665 #endif 8666 itemlength = 1; 8667 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */ 8668 } 8669 break; 8670 8671 /* Lookaheads can be ignored, but we must start the skip inside the group 8672 so that it isn't treated as a group within the branch. */ 8673 8674 case META_LOOKAHEAD: 8675 case META_LOOKAHEADNOT: 8676 pptr = parsed_skip(pptr + 1, PSKIP_KET); 8677 if (pptr == NULL) goto PARSED_SKIP_FAILED; 8678 8679 /* Also ignore any qualifiers that follow a lookahead assertion. */ 8680 8681 switch (pptr[1]) 8682 { 8683 case META_ASTERISK: 8684 case META_ASTERISK_PLUS: 8685 case META_ASTERISK_QUERY: 8686 case META_PLUS: 8687 case META_PLUS_PLUS: 8688 case META_PLUS_QUERY: 8689 case META_QUERY: 8690 case META_QUERY_PLUS: 8691 case META_QUERY_QUERY: 8692 pptr++; 8693 break; 8694 8695 case META_MINMAX: 8696 case META_MINMAX_PLUS: 8697 case META_MINMAX_QUERY: 8698 pptr += 3; 8699 break; 8700 8701 default: 8702 break; 8703 } 8704 break; 8705 8706 /* Lookbehinds can be ignored, but must themselves be checked. */ 8707 8708 case META_LOOKBEHIND: 8709 case META_LOOKBEHINDNOT: 8710 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb)) 8711 return -1; 8712 break; 8713 8714 /* Back references and recursions are handled by very similar code. At this 8715 stage, the names generated in the parsing pass are available, but the main 8716 name table has not yet been created. So for the named varieties, scan the 8717 list of names in order to get the number of the first one in the pattern, 8718 and whether or not this name is duplicated. 
*/ 8719 8720 case META_BACKREF_BYNAME: 8721 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0) 8722 goto ISNOTFIXED; 8723 /* Fall through */ 8724 8725 case META_RECURSE_BYNAME: 8726 { 8727 int i; 8728 PCRE2_SPTR name; 8729 BOOL is_dupname = FALSE; 8730 named_group *ng = cb->named_groups; 8731 uint32_t meta_code = META_CODE(*pptr); 8732 uint32_t length = *(++pptr); 8733 8734 GETPLUSOFFSET(offset, pptr); 8735 name = cb->start_pattern + offset; 8736 for (i = 0; i < cb->names_found; i++, ng++) 8737 { 8738 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0) 8739 { 8740 group = ng->number; 8741 is_dupname = ng->isdup; 8742 break; 8743 } 8744 } 8745 8746 if (group == 0) 8747 { 8748 *errcodeptr = ERR15; /* Non-existent subpattern */ 8749 cb->erroroffset = offset; 8750 return -1; 8751 } 8752 8753 /* A numerical back reference can be fixed length if duplicate capturing 8754 groups are not being used. A non-duplicate named back reference can also 8755 be handled. */ 8756 8757 if (meta_code == META_RECURSE_BYNAME || 8758 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)) 8759 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */ 8760 } 8761 goto ISNOTFIXED; /* Duplicate name or number */ 8762 8763 /* The offset values for back references < 10 are in a separate vector 8764 because otherwise they would use more than two parsed pattern elements on 8765 64-bit systems. */ 8766 8767 case META_BACKREF: 8768 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 || 8769 (cb->external_flags & PCRE2_DUPCAPUSED) != 0) 8770 goto ISNOTFIXED; 8771 group = META_DATA(*pptr); 8772 if (group < 10) 8773 { 8774 offset = cb->small_ref_offset[group]; 8775 goto RECURSE_OR_BACKREF_LENGTH; 8776 } 8777 8778 /* Fall through */ 8779 /* For groups >= 10 - picking up group twice does no harm. */ 8780 8781 /* A true recursion implies not fixed length, but a subroutine call may 8782 be OK. Back reference "recursions" are also failed. 
*/ 8783 8784 case META_RECURSE: 8785 group = META_DATA(*pptr); 8786 GETPLUSOFFSET(offset, pptr); 8787 8788 RECURSE_OR_BACKREF_LENGTH: 8789 if (group > cb->bracount) 8790 { 8791 cb->erroroffset = offset; 8792 *errcodeptr = ERR15; /* Non-existent subpattern */ 8793 return -1; 8794 } 8795 if (group == 0) goto ISNOTFIXED; /* Local recursion */ 8796 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++) 8797 { 8798 if (META_CODE(*gptr) == META_BIGVALUE) gptr++; 8799 else if (*gptr == (META_CAPTURE | group)) break; 8800 } 8801 8802 /* We must start the search for the end of the group at the first meta code 8803 inside the group. Otherwise it will be treated as an enclosed group. */ 8804 8805 gptrend = parsed_skip(gptr + 1, PSKIP_KET); 8806 if (gptrend == NULL) goto PARSED_SKIP_FAILED; 8807 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */ 8808 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break; 8809 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */ 8810 this_recurse.prev = recurses; 8811 this_recurse.groupptr = gptr; 8812 8813 /* We do not need to know the position of the end of the group, that is, 8814 gptr is not used after the call to get_grouplength(). Setting the second 8815 argument FALSE stops it scanning for the end when the length can be found 8816 in the cache. */ 8817 8818 gptr++; 8819 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group, 8820 &this_recurse, cb); 8821 if (grouplength < 0) 8822 { 8823 if (*errcodeptr == 0) goto ISNOTFIXED; 8824 return -1; /* Error already set */ 8825 } 8826 itemlength = grouplength; 8827 break; 8828 8829 /* Check nested groups - advance past the initial data for each type and 8830 then seek a fixed length with get_grouplength(). 
*/ 8831 8832 case META_COND_NAME: 8833 case META_COND_NUMBER: 8834 case META_COND_RNAME: 8835 case META_COND_RNUMBER: 8836 case META_COND_DEFINE: 8837 pptr += 2 + SIZEOFFSET; 8838 goto CHECK_GROUP; 8839 8840 case META_COND_ASSERT: 8841 pptr += 1; 8842 goto CHECK_GROUP; 8843 8844 case META_COND_VERSION: 8845 pptr += 4; 8846 goto CHECK_GROUP; 8847 8848 case META_CAPTURE: 8849 group = META_DATA(*pptr); 8850 /* Fall through */ 8851 8852 case META_ATOMIC: 8853 case META_NOCAPTURE: 8854 pptr++; 8855 CHECK_GROUP: 8856 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group, 8857 recurses, cb); 8858 if (grouplength < 0) return -1; 8859 itemlength = grouplength; 8860 break; 8861 8862 /* Exact repetition is OK; variable repetition is not. A repetition of zero 8863 must subtract the length that has already been added. */ 8864 8865 case META_MINMAX: 8866 case META_MINMAX_PLUS: 8867 case META_MINMAX_QUERY: 8868 if (pptr[1] == pptr[2]) 8869 { 8870 if (pptr[1] == 0) branchlength -= lastitemlength; 8871 else itemlength = (pptr[1] - 1) * lastitemlength; 8872 pptr += 2; 8873 break; 8874 } 8875 /* Fall through */ 8876 8877 /* Any other item means this branch does not have a fixed length. */ 8878 8879 default: 8880 ISNOTFIXED: 8881 *errcodeptr = ERR25; /* Not fixed length */ 8882 return -1; 8883 } 8884 8885 /* Add the item length to the branchlength, and save it for use if the next 8886 thing is a quantifier. */ 8887 8888 branchlength += itemlength; 8889 lastitemlength = itemlength; 8890 8891 /* Ensure that the length does not overflow the limit. 
*/ 8892 8893 if (branchlength > LOOKBEHIND_MAX) 8894 { 8895 *errcodeptr = ERR87; 8896 return -1; 8897 } 8898 } 8899 8900 EXIT: 8901 *pptrptr = pptr; 8902 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength; 8903 return branchlength; 8904 8905 PARSED_SKIP_FAILED: 8906 *errcodeptr = ERR90; 8907 return -1; 8908 } 8909 8910 8911 8912 /************************************************* 8913 * Set lengths in a lookbehind * 8914 *************************************************/ 8915 8916 /* This function is called for each lookbehind, to set the lengths in its 8917 branches. An error occurs if any branch does not have a fixed length that is 8918 less than the maximum (65535). On exit, the pointer must be left on the final 8919 ket. 8920 8921 Arguments: 8922 pptrptr pointer to pointer in the parsed pattern 8923 errcodeptr pointer to error code 8924 lcptr pointer to loop counter 8925 recurses chain of recurse_check to catch mutual recursion 8926 cb pointer to compile block 8927 8928 Returns: TRUE if all is well 8929 FALSE otherwise, with error code and offset set 8930 */ 8931 8932 static BOOL 8933 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr, 8934 parsed_recurse_check *recurses, compile_block *cb) 8935 { 8936 PCRE2_SIZE offset; 8937 int branchlength; 8938 uint32_t *bptr = *pptrptr; 8939 8940 READPLUSOFFSET(offset, bptr); /* Offset for error messages */ 8941 *pptrptr += SIZEOFFSET; 8942 8943 do 8944 { 8945 *pptrptr += 1; 8946 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); 8947 if (branchlength < 0) 8948 { 8949 /* The errorcode and offset may already be set from a nested lookbehind. 
*/ 8950 if (*errcodeptr == 0) *errcodeptr = ERR25; 8951 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset; 8952 return FALSE; 8953 } 8954 *bptr |= branchlength; /* branchlength never more than 65535 */ 8955 bptr = *pptrptr; 8956 } 8957 while (*bptr == META_ALT); 8958 8959 return TRUE; 8960 } 8961 8962 8963 8964 /************************************************* 8965 * Check parsed pattern lookbehinds * 8966 *************************************************/ 8967 8968 /* This function is called at the end of parsing a pattern if any lookbehinds 8969 were encountered. It scans the parsed pattern for them, calling 8970 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and 8971 the error offset is marked unset. The enables the functions above not to 8972 override settings from deeper nestings. 8973 8974 Arguments cb points to the compile block 8975 Returns: 0 on success, or an errorcode (cb->erroroffset will be set) 8976 */ 8977 8978 static int 8979 check_lookbehinds(compile_block *cb) 8980 { 8981 uint32_t *pptr; 8982 int errorcode = 0; 8983 int loopcount = 0; 8984 8985 cb->erroroffset = PCRE2_UNSET; 8986 8987 for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++) 8988 { 8989 if (*pptr < META_END) continue; /* Literal */ 8990 8991 switch (META_CODE(*pptr)) 8992 { 8993 default: 8994 return ERR70; /* Unrecognized meta code */ 8995 8996 case META_ESCAPE: 8997 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p) 8998 pptr += 1; 8999 break; 9000 9001 case META_ACCEPT: 9002 case META_ALT: 9003 case META_ASTERISK: 9004 case META_ASTERISK_PLUS: 9005 case META_ASTERISK_QUERY: 9006 case META_ATOMIC: 9007 case META_BACKREF: 9008 case META_CAPTURE: 9009 case META_CIRCUMFLEX: 9010 case META_CLASS: 9011 case META_CLASS_EMPTY: 9012 case META_CLASS_EMPTY_NOT: 9013 case META_CLASS_END: 9014 case META_CLASS_NOT: 9015 case META_COMMIT: 9016 case META_COND_ASSERT: 9017 case META_DOLLAR: 9018 case META_DOT: 9019 case META_FAIL: 9020 case 
META_KET: 9021 case META_LOOKAHEAD: 9022 case META_LOOKAHEADNOT: 9023 case META_NOCAPTURE: 9024 case META_PLUS: 9025 case META_PLUS_PLUS: 9026 case META_PLUS_QUERY: 9027 case META_PRUNE: 9028 case META_QUERY: 9029 case META_QUERY_PLUS: 9030 case META_QUERY_QUERY: 9031 case META_RANGE_ESCAPED: 9032 case META_RANGE_LITERAL: 9033 case META_SKIP: 9034 case META_THEN: 9035 break; 9036 9037 case META_RECURSE: 9038 pptr += SIZEOFFSET; 9039 break; 9040 9041 case META_BACKREF_BYNAME: 9042 case META_COND_DEFINE: 9043 case META_COND_NAME: 9044 case META_COND_NUMBER: 9045 case META_COND_RNAME: 9046 case META_COND_RNUMBER: 9047 case META_RECURSE_BYNAME: 9048 pptr += 1 + SIZEOFFSET; 9049 break; 9050 9051 case META_CALLOUT_STRING: 9052 pptr += 3 + SIZEOFFSET; 9053 break; 9054 9055 case META_BIGVALUE: 9056 case META_OPTIONS: 9057 case META_POSIX: 9058 case META_POSIX_NEG: 9059 pptr += 1; 9060 break; 9061 9062 case META_MINMAX: 9063 case META_MINMAX_QUERY: 9064 case META_MINMAX_PLUS: 9065 pptr += 2; 9066 break; 9067 9068 case META_CALLOUT_NUMBER: 9069 case META_COND_VERSION: 9070 pptr += 3; 9071 break; 9072 9073 case META_MARK: 9074 case META_COMMIT_ARG: 9075 case META_PRUNE_ARG: 9076 case META_SKIP_ARG: 9077 case META_THEN_ARG: 9078 pptr += 1 + pptr[1]; 9079 break; 9080 9081 case META_LOOKBEHIND: 9082 case META_LOOKBEHINDNOT: 9083 if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, NULL, cb)) 9084 return errorcode; 9085 break; 9086 } 9087 } 9088 9089 return 0; 9090 } 9091 9092 9093 9094 /************************************************* 9095 * External function to compile a pattern * 9096 *************************************************/ 9097 9098 /* This function reads a regular expression in the form of a string and returns 9099 a pointer to a block of store holding a compiled version of the expression. 
9100 9101 Arguments: 9102 pattern the regular expression 9103 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED 9104 options option bits 9105 errorptr pointer to errorcode 9106 erroroffset pointer to error offset 9107 ccontext points to a compile context or is NULL 9108 9109 Returns: pointer to compiled data block, or NULL on error, 9110 with errorcode and erroroffset set 9111 */ 9112 9113 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION 9114 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, 9115 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) 9116 { 9117 BOOL utf; /* Set TRUE for UTF mode */ 9118 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ 9119 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ 9120 pcre2_real_code *re = NULL; /* What we will return */ 9121 compile_block cb; /* "Static" compile-time data */ 9122 const uint8_t *tables; /* Char tables base pointer */ 9123 9124 PCRE2_UCHAR *code; /* Current pointer in compiled code */ 9125 PCRE2_SPTR codestart; /* Start of compiled code */ 9126 PCRE2_SPTR ptr; /* Current pointer in pattern */ 9127 uint32_t *pptr; /* Current pointer in parsed pattern */ 9128 9129 PCRE2_SIZE length = 1; /* Allow for final END opcode */ 9130 PCRE2_SIZE usedlength; /* Actual length used */ 9131 PCRE2_SIZE re_blocksize; /* Size of memory block */ 9132 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */ 9133 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */ 9134 9135 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */ 9136 uint32_t firstcu, reqcu; /* Value of first/req code unit */ 9137 uint32_t setflags = 0; /* NL and BSR set flags */ 9138 9139 uint32_t skipatstart; /* When checking (*UTF) etc */ 9140 uint32_t limit_heap = UINT32_MAX; 9141 uint32_t limit_match = UINT32_MAX; /* Unset match limits */ 9142 uint32_t limit_depth = UINT32_MAX; 9143 9144 int newline = 0; /* Unset; can be set by the pattern */ 
9145 int bsr = 0; /* Unset; can be set by the pattern */ 9146 int errorcode = 0; /* Initialize to avoid compiler warn */ 9147 int regexrc; /* Return from compile */ 9148 9149 uint32_t i; /* Local loop counter */ 9150 9151 /* Comments at the head of this file explain about these variables. */ 9152 9153 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE]; 9154 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE]; 9155 named_group named_groups[NAMED_GROUP_LIST_SIZE]; 9156 9157 /* The workspace is used in different ways in the different compiling phases. 9158 It needs to be 16-bit aligned for the preliminary parsing scan. */ 9159 9160 uint32_t c16workspace[C16_WORK_SIZE]; 9161 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace; 9162 9163 9164 /* -------------- Check arguments and set up the pattern ----------------- */ 9165 9166 /* There must be error code and offset pointers. */ 9167 9168 if (errorptr == NULL || erroroffset == NULL) return NULL; 9169 *errorptr = ERR0; 9170 *erroroffset = 0; 9171 9172 /* There must be a pattern! */ 9173 9174 if (pattern == NULL) 9175 { 9176 *errorptr = ERR16; 9177 return NULL; 9178 } 9179 9180 /* A NULL compile context means "use a default context" */ 9181 9182 if (ccontext == NULL) 9183 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); 9184 9185 /* Check that all undefined public option bits are zero. */ 9186 9187 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 || 9188 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0) 9189 { 9190 *errorptr = ERR17; 9191 return NULL; 9192 } 9193 9194 if ((options & PCRE2_LITERAL) != 0 && 9195 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 || 9196 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0)) 9197 { 9198 *errorptr = ERR92; 9199 return NULL; 9200 } 9201 9202 /* A zero-terminated pattern is indicated by the special length value 9203 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. 
*/ 9204 9205 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED))) 9206 patlen = PRIV(strlen)(pattern); 9207 9208 if (patlen > ccontext->max_pattern_length) 9209 { 9210 *errorptr = ERR88; 9211 return NULL; 9212 } 9213 9214 /* From here on, all returns from this function should end up going via the 9215 EXIT label. */ 9216 9217 9218 /* ------------ Initialize the "static" compile data -------------- */ 9219 9220 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables); 9221 9222 cb.lcc = tables + lcc_offset; /* Individual */ 9223 cb.fcc = tables + fcc_offset; /* character */ 9224 cb.cbits = tables + cbits_offset; /* tables */ 9225 cb.ctypes = tables + ctypes_offset; 9226 9227 cb.assert_depth = 0; 9228 cb.bracount = 0; 9229 cb.cx = ccontext; 9230 cb.dupnames = FALSE; 9231 cb.end_pattern = pattern + patlen; 9232 cb.erroroffset = 0; 9233 cb.external_flags = 0; 9234 cb.external_options = options; 9235 cb.groupinfo = stack_groupinfo; 9236 cb.had_recurse = FALSE; 9237 cb.lastcapture = 0; 9238 cb.max_lookbehind = 0; 9239 cb.name_entry_size = 0; 9240 cb.name_table = NULL; 9241 cb.named_groups = named_groups; 9242 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; 9243 cb.names_found = 0; 9244 cb.open_caps = NULL; 9245 cb.parens_depth = 0; 9246 cb.parsed_pattern = stack_parsed_pattern; 9247 cb.req_varyopt = 0; 9248 cb.start_code = cworkspace; 9249 cb.start_pattern = pattern; 9250 cb.start_workspace = cworkspace; 9251 cb.workspace_size = COMPILE_WORK_SIZE; 9252 9253 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back 9254 references to help in deciding whether (.*) can be treated as anchored or not. 9255 */ 9256 9257 cb.top_backref = 0; 9258 cb.backref_map = 0; 9259 9260 /* Escape sequences \1 to \9 are always back references, but as they are only 9261 two characters long, only two elements can be used in the parsed_pattern 9262 vector. 
The first contains the reference, and we'd like to use the second to 9263 record the offset in the pattern, so that forward references to non-existent 9264 groups can be diagnosed later with an offset. However, on 64-bit systems, 9265 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first 9266 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other 9267 references have enough space for the offset to be put into the parsed pattern. 9268 */ 9269 9270 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET; 9271 9272 9273 /* --------------- Start looking at the pattern --------------- */ 9274 9275 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at 9276 the start of the pattern, and remember the offset to the actual regex. With 9277 valgrind support, make the terminator of a zero-terminated pattern 9278 inaccessible. This catches bugs that would otherwise only show up for 9279 non-zero-terminated patterns. */ 9280 9281 #ifdef SUPPORT_VALGRIND 9282 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1)); 9283 #endif 9284 9285 ptr = pattern; 9286 skipatstart = 0; 9287 9288 if ((options & PCRE2_LITERAL) == 0) 9289 { 9290 while (patlen - skipatstart >= 2 && 9291 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && 9292 ptr[skipatstart+1] == CHAR_ASTERISK) 9293 { 9294 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) 9295 { 9296 uint32_t c, pp; 9297 pso *p = pso_list + i; 9298 9299 if (patlen - skipatstart - 2 >= p->length && 9300 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name), 9301 p->length) == 0) 9302 { 9303 skipatstart += p->length + 2; 9304 switch(p->type) 9305 { 9306 case PSO_OPT: 9307 cb.external_options |= p->value; 9308 break; 9309 9310 case PSO_FLG: 9311 setflags |= p->value; 9312 break; 9313 9314 case PSO_NL: 9315 newline = p->value; 9316 setflags |= PCRE2_NL_SET; 9317 break; 9318 9319 case PSO_BSR: 9320 bsr = p->value; 9321 setflags |= PCRE2_BSR_SET; 9322 break; 
9323 9324 case PSO_LIMM: 9325 case PSO_LIMD: 9326 case PSO_LIMH: 9327 c = 0; 9328 pp = skipatstart; 9329 if (!IS_DIGIT(ptr[pp])) 9330 { 9331 errorcode = ERR60; 9332 ptr += pp; 9333 goto HAD_EARLY_ERROR; 9334 } 9335 while (IS_DIGIT(ptr[pp])) 9336 { 9337 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ 9338 c = c*10 + (ptr[pp++] - CHAR_0); 9339 } 9340 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) 9341 { 9342 errorcode = ERR60; 9343 ptr += pp; 9344 goto HAD_EARLY_ERROR; 9345 } 9346 if (p->type == PSO_LIMH) limit_heap = c; 9347 else if (p->type == PSO_LIMM) limit_match = c; 9348 else limit_depth = c; 9349 skipatstart += pp - skipatstart; 9350 break; 9351 } 9352 break; /* Out of the table scan loop */ 9353 } 9354 } 9355 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ 9356 } 9357 } 9358 9359 /* End of pattern-start options; advance to start of real regex. */ 9360 9361 ptr += skipatstart; 9362 9363 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */ 9364 9365 #ifndef SUPPORT_UNICODE 9366 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0) 9367 { 9368 errorcode = ERR32; 9369 goto HAD_EARLY_ERROR; 9370 } 9371 #endif 9372 9373 /* Check UTF. We have the original options in 'options', with that value as 9374 modified by (*UTF) etc in cb->external_options. The extra option 9375 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the 9376 surrogate code points cannot be represented in UTF-16. 
*/ 9377 9378 utf = (cb.external_options & PCRE2_UTF) != 0; 9379 if (utf) 9380 { 9381 if ((options & PCRE2_NEVER_UTF) != 0) 9382 { 9383 errorcode = ERR74; 9384 goto HAD_EARLY_ERROR; 9385 } 9386 if ((options & PCRE2_NO_UTF_CHECK) == 0 && 9387 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) 9388 goto HAD_ERROR; /* Offset was set by valid_utf() */ 9389 9390 #if PCRE2_CODE_UNIT_WIDTH == 16 9391 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0) 9392 { 9393 errorcode = ERR91; 9394 goto HAD_EARLY_ERROR; 9395 } 9396 #endif 9397 } 9398 9399 /* Check UCP lockout. */ 9400 9401 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == 9402 (PCRE2_UCP|PCRE2_NEVER_UCP)) 9403 { 9404 errorcode = ERR75; 9405 goto HAD_EARLY_ERROR; 9406 } 9407 9408 /* Process the BSR setting. */ 9409 9410 if (bsr == 0) bsr = ccontext->bsr_convention; 9411 9412 /* Process the newline setting. */ 9413 9414 if (newline == 0) newline = ccontext->newline_convention; 9415 cb.nltype = NLTYPE_FIXED; 9416 switch(newline) 9417 { 9418 case PCRE2_NEWLINE_CR: 9419 cb.nllen = 1; 9420 cb.nl[0] = CHAR_CR; 9421 break; 9422 9423 case PCRE2_NEWLINE_LF: 9424 cb.nllen = 1; 9425 cb.nl[0] = CHAR_NL; 9426 break; 9427 9428 case PCRE2_NEWLINE_NUL: 9429 cb.nllen = 1; 9430 cb.nl[0] = CHAR_NUL; 9431 break; 9432 9433 case PCRE2_NEWLINE_CRLF: 9434 cb.nllen = 2; 9435 cb.nl[0] = CHAR_CR; 9436 cb.nl[1] = CHAR_NL; 9437 break; 9438 9439 case PCRE2_NEWLINE_ANY: 9440 cb.nltype = NLTYPE_ANY; 9441 break; 9442 9443 case PCRE2_NEWLINE_ANYCRLF: 9444 cb.nltype = NLTYPE_ANYCRLF; 9445 break; 9446 9447 default: 9448 errorcode = ERR56; 9449 goto HAD_EARLY_ERROR; 9450 } 9451 9452 /* Pre-scan the pattern to do two things: (1) Discover the named groups and 9453 their numerical equivalents, so that this information is always available for 9454 the remaining processing. (2) At the same time, parse the pattern and put a 9455 processed version into the parsed_pattern vector. 
This has escapes interpreted 9456 and comments removed (amongst other things). 9457 9458 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned 9459 32-bit ints in the parsed pattern is bounded by the length of the pattern plus 9460 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is 9461 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal 9462 characters greater than META_END (0x80000000) have to be coded as two units. In 9463 this case, therefore, we scan the pattern to check for such values. */ 9464 9465 #if PCRE2_CODE_UNIT_WIDTH == 32 9466 if (!utf) 9467 { 9468 PCRE2_SPTR p; 9469 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++; 9470 } 9471 #endif 9472 9473 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT 9474 is set we have to assume a numerical callout (4 elements) for each character 9475 plus one at the end. This is overkill, but memory is plentiful these days. For 9476 many smaller patterns the vector on the stack (which was set up above) can be 9477 used. */ 9478 9479 parsed_size_needed = patlen - skipatstart + big32count; 9480 9481 if ((ccontext->extra_options & 9482 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0) 9483 parsed_size_needed += 4; 9484 9485 if ((options & PCRE2_AUTO_CALLOUT) != 0) 9486 parsed_size_needed = (parsed_size_needed + 1) * 5; 9487 9488 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE) 9489 { 9490 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc( 9491 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data); 9492 if (heap_parsed_pattern == NULL) 9493 { 9494 *errorptr = ERR21; 9495 goto EXIT; 9496 } 9497 cb.parsed_pattern = heap_parsed_pattern; 9498 } 9499 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1; 9500 9501 /* Do the parsing scan. 
*/ 9502 9503 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb); 9504 if (errorcode != 0) goto HAD_CB_ERROR; 9505 9506 /* Workspace is needed to remember information about numbered groups: whether a 9507 group can match an empty string and what its fixed length is. This is done to 9508 avoid the possibility of recursive references causing very long compile times 9509 when checking these features. Unnumbered groups do not have this exposure since 9510 they cannot be referenced. We use an indexed vector for this purpose. If there 9511 are sufficiently few groups, the default vector on the stack, as set up above, 9512 can be used. Otherwise we have to get/free a special vector. The vector must be 9513 initialized to zero. */ 9514 9515 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE) 9516 { 9517 cb.groupinfo = ccontext->memctl.malloc( 9518 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data); 9519 if (cb.groupinfo == NULL) 9520 { 9521 errorcode = ERR21; 9522 cb.erroroffset = 0; 9523 goto HAD_CB_ERROR; 9524 } 9525 } 9526 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t)); 9527 9528 /* If there were any lookbehinds, scan the parsed pattern to figure out their 9529 lengths. */ 9530 9531 if (has_lookbehind) 9532 { 9533 errorcode = check_lookbehinds(&cb); 9534 if (errorcode != 0) goto HAD_CB_ERROR; 9535 } 9536 9537 /* For debugging, there is a function that shows the parsed data vector. */ 9538 9539 #ifdef DEBUG_SHOW_PARSED 9540 fprintf(stderr, "+++ Pre-scan complete:\n"); 9541 show_parsed(&cb); 9542 #endif 9543 9544 /* For debugging capturing information this code can be enabled. 
*/ 9545 9546 #ifdef DEBUG_SHOW_CAPTURES 9547 { 9548 named_group *ng = cb.named_groups; 9549 fprintf(stderr, "+++Captures: %d\n", cb.bracount); 9550 for (i = 0; i < cb.names_found; i++, ng++) 9551 { 9552 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name); 9553 } 9554 } 9555 #endif 9556 9557 /* Pretend to compile the pattern while actually just accumulating the amount 9558 of memory required in the 'length' variable. This behaviour is triggered by 9559 passing a non-NULL final argument to compile_regex(). We pass a block of 9560 workspace (cworkspace) for it to compile parts of the pattern into; the 9561 compiled code is discarded when it is no longer needed, so hopefully this 9562 workspace will never overflow, though there is a test for its doing so. 9563 9564 On error, errorcode will be set non-zero, so we don't need to look at the 9565 result of the function. The initial options have been put into the cb block, 9566 but we still have to pass a separate options variable (the first argument) 9567 because the options may change as the pattern is processed. */ 9568 9569 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */ 9570 pptr = cb.parsed_pattern; 9571 code = cworkspace; 9572 *code = OP_BRA; 9573 9574 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu, 9575 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length); 9576 9577 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */ 9578 9579 /* This should be caught in compile_regex(), but just in case... */ 9580 9581 if (length > MAX_PATTERN_SIZE) 9582 { 9583 errorcode = ERR20; 9584 goto HAD_CB_ERROR; 9585 } 9586 9587 /* Compute the size of, and then get and initialize, the data block for storing 9588 the compiled pattern and names table. Integer overflow should no longer be 9589 possible because nowadays we limit the maximum value of cb.names_found and 9590 cb.name_entry_size. 
*/ 9591 9592 re_blocksize = sizeof(pcre2_real_code) + 9593 CU2BYTES(length + 9594 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size); 9595 re = (pcre2_real_code *) 9596 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data); 9597 if (re == NULL) 9598 { 9599 errorcode = ERR21; 9600 goto HAD_CB_ERROR; 9601 } 9602 9603 /* The compiler may put padding at the end of the pcre2_real_code structure in 9604 order to round it up to a multiple of 4 or 8 bytes. This means that when a 9605 compiled pattern is copied (for example, when serialized) undefined bytes are 9606 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly 9607 write to the last 8 bytes of the structure before setting the fields. */ 9608 9609 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8); 9610 re->memctl = ccontext->memctl; 9611 re->tables = tables; 9612 re->executable_jit = NULL; 9613 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t)); 9614 re->blocksize = re_blocksize; 9615 re->magic_number = MAGIC_NUMBER; 9616 re->compile_options = options; 9617 re->overall_options = cb.external_options; 9618 re->extra_options = ccontext->extra_options; 9619 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags; 9620 re->limit_heap = limit_heap; 9621 re->limit_match = limit_match; 9622 re->limit_depth = limit_depth; 9623 re->first_codeunit = 0; 9624 re->last_codeunit = 0; 9625 re->bsr_convention = bsr; 9626 re->newline_convention = newline; 9627 re->max_lookbehind = 0; 9628 re->minlength = 0; 9629 re->top_bracket = 0; 9630 re->top_backref = 0; 9631 re->name_entry_size = cb.name_entry_size; 9632 re->name_count = cb.names_found; 9633 9634 /* The basic block is immediately followed by the name table, and the compiled 9635 code follows after that. */ 9636 9637 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + 9638 re->name_entry_size * re->name_count; 9639 9640 /* Update the compile data block for the actual compile. 
The starting points of 9641 the name/number translation table and of the code are passed around in the 9642 compile data block. The start/end pattern and initial options are already set 9643 from the pre-compile phase, as is the name_entry_size field. */ 9644 9645 cb.parens_depth = 0; 9646 cb.assert_depth = 0; 9647 cb.lastcapture = 0; 9648 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); 9649 cb.start_code = codestart; 9650 cb.req_varyopt = 0; 9651 cb.had_accept = FALSE; 9652 cb.had_pruneorskip = FALSE; 9653 cb.open_caps = NULL; 9654 9655 /* If any named groups were found, create the name/number table from the list 9656 created in the pre-pass. */ 9657 9658 if (cb.names_found > 0) 9659 { 9660 named_group *ng = cb.named_groups; 9661 for (i = 0; i < cb.names_found; i++, ng++) 9662 add_name_to_table(&cb, ng->name, ng->length, ng->number, i); 9663 } 9664 9665 /* Set up a starting, non-extracting bracket, then compile the expression. On 9666 error, errorcode will be set non-zero, so we don't need to look at the result 9667 of the function here. */ 9668 9669 pptr = cb.parsed_pattern; 9670 code = (PCRE2_UCHAR *)codestart; 9671 *code = OP_BRA; 9672 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0, 9673 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL); 9674 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY; 9675 re->top_bracket = cb.bracount; 9676 re->top_backref = cb.top_backref; 9677 re->max_lookbehind = cb.max_lookbehind; 9678 9679 if (cb.had_accept) 9680 { 9681 reqcu = 0; /* Must disable after (*ACCEPT) */ 9682 reqcuflags = REQ_NONE; 9683 } 9684 9685 /* Fill in the final opcode and check for disastrous overflow. If no overflow, 9686 but the estimated length exceeds the really used length, adjust the value of 9687 re->blocksize, and if valgrind support is configured, mark the extra allocated 9688 memory as unaddressable, so that any out-of-bound reads can be detected. 
*/ 9689 9690 *code++ = OP_END; 9691 usedlength = code - codestart; 9692 if (usedlength > length) errorcode = ERR23; else 9693 { 9694 re->blocksize -= CU2BYTES(length - usedlength); 9695 #ifdef SUPPORT_VALGRIND 9696 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength)); 9697 #endif 9698 } 9699 9700 /* Scan the pattern for recursion/subroutine calls and convert the group 9701 numbers into offsets. Maintain a small cache so that repeated groups containing 9702 recursions are efficiently handled. */ 9703 9704 #define RSCAN_CACHE_SIZE 8 9705 9706 if (errorcode == 0 && cb.had_recurse) 9707 { 9708 PCRE2_UCHAR *rcode; 9709 PCRE2_SPTR rgroup; 9710 unsigned int ccount = 0; 9711 int start = RSCAN_CACHE_SIZE; 9712 recurse_cache rc[RSCAN_CACHE_SIZE]; 9713 9714 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf); 9715 rcode != NULL; 9716 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf)) 9717 { 9718 int p, groupnumber; 9719 9720 groupnumber = (int)GET(rcode, 1); 9721 if (groupnumber == 0) rgroup = codestart; else 9722 { 9723 PCRE2_SPTR search_from = codestart; 9724 rgroup = NULL; 9725 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) 9726 { 9727 if (groupnumber == rc[p].groupnumber) 9728 { 9729 rgroup = rc[p].group; 9730 break; 9731 } 9732 9733 /* Group n+1 must always start to the right of group n, so we can save 9734 search time below when the new group number is greater than any of the 9735 previously found groups. 
*/ 9736 9737 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group; 9738 } 9739 9740 if (rgroup == NULL) 9741 { 9742 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber); 9743 if (rgroup == NULL) 9744 { 9745 errorcode = ERR53; 9746 break; 9747 } 9748 if (--start < 0) start = RSCAN_CACHE_SIZE - 1; 9749 rc[start].groupnumber = groupnumber; 9750 rc[start].group = rgroup; 9751 if (ccount < RSCAN_CACHE_SIZE) ccount++; 9752 } 9753 } 9754 9755 PUT(rcode, 1, rgroup - codestart); 9756 } 9757 } 9758 9759 /* In rare debugging situations we sometimes need to look at the compiled code 9760 at this stage. */ 9761 9762 #ifdef DEBUG_CALL_PRINTINT 9763 pcre2_printint(re, stderr, TRUE); 9764 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength); 9765 #endif 9766 9767 /* Unless disabled, check whether any single character iterators can be 9768 auto-possessified. The function overwrites the appropriate opcode values, so 9769 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is 9770 used in this code because at least one compiler gives a warning about loss of 9771 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the 9772 function call. */ 9773 9774 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) 9775 { 9776 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; 9777 if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; 9778 } 9779 9780 /* Failed to compile, or error while post-processing. */ 9781 9782 if (errorcode != 0) goto HAD_CB_ERROR; 9783 9784 /* Successful compile. If the anchored option was not passed, set it if 9785 we can determine that the pattern is anchored by virtue of ^ characters or \A 9786 or anything else, such as starting with non-atomic .* when DOTALL is set and 9787 there are no occurrences of *PRUNE or *SKIP (though there is an option to 9788 disable this case). 
*/ 9789 9790 if ((re->overall_options & PCRE2_ANCHORED) == 0 && 9791 is_anchored(codestart, 0, &cb, 0, FALSE)) 9792 re->overall_options |= PCRE2_ANCHORED; 9793 9794 /* Set up the first code unit or startline flag, the required code unit, and 9795 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE 9796 is set, as the data it would create will not be used. Note that a first code 9797 unit (but not the startline flag) is useful for anchored patterns because it 9798 can still give a quick "no match" and also avoid searching for a last code 9799 unit. */ 9800 9801 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) 9802 { 9803 /* If we do not have a first code unit, see if there is one that is asserted 9804 (these are not saved during the compile because they can cause conflicts with 9805 actual literals that follow). */ 9806 9807 if (firstcuflags < 0) 9808 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0); 9809 9810 /* Save the data for a first code unit. */ 9811 9812 if (firstcuflags >= 0) 9813 { 9814 re->first_codeunit = firstcu; 9815 re->flags |= PCRE2_FIRSTSET; 9816 9817 /* Handle caseless first code units. */ 9818 9819 if ((firstcuflags & REQ_CASELESS) != 0) 9820 { 9821 if (firstcu < 128 || (!utf && firstcu < 255)) 9822 { 9823 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; 9824 } 9825 9826 /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In 9827 8-bit UTF mode, codepoints in the range 128-255 are introductory code 9828 points and cannot have another case. In 16-bit and 32-bit modes, we can 9829 check wide characters when UTF (and therefore UCP) is supported. 
*/ 9830 9831 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 9832 else if (firstcu <= MAX_UTF_CODE_POINT && 9833 UCD_OTHERCASE(firstcu) != firstcu) 9834 re->flags |= PCRE2_FIRSTCASELESS; 9835 #endif 9836 } 9837 } 9838 9839 /* When there is no first code unit, for non-anchored patterns, see if we can 9840 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all 9841 branches start with ^ and also when all branches start with non-atomic .* for 9842 non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option 9843 that disables this case.) */ 9844 9845 else if ((re->overall_options & PCRE2_ANCHORED) == 0 && 9846 is_startline(codestart, 0, &cb, 0, FALSE)) 9847 re->flags |= PCRE2_STARTLINE; 9848 9849 /* Handle the "required code unit", if one is set. In the case of an anchored 9850 pattern, do this only if it follows a variable length item in the pattern. */ 9851 9852 if (reqcuflags >= 0 && 9853 ((re->overall_options & PCRE2_ANCHORED) == 0 || 9854 (reqcuflags & REQ_VARY) != 0)) 9855 { 9856 re->last_codeunit = reqcu; 9857 re->flags |= PCRE2_LASTSET; 9858 9859 /* Handle caseless required code units as for first code units (above). */ 9860 9861 if ((reqcuflags & REQ_CASELESS) != 0) 9862 { 9863 if (reqcu < 128 || (!utf && reqcu < 255)) 9864 { 9865 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; 9866 } 9867 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 9868 else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) 9869 re->flags |= PCRE2_LASTCASELESS; 9870 #endif 9871 } 9872 } 9873 9874 /* Finally, study the compiled pattern to set up information such as a bitmap 9875 of starting code units and a minimum matching length. */ 9876 9877 if (PRIV(study)(re) != 0) 9878 { 9879 errorcode = ERR31; 9880 goto HAD_CB_ERROR; 9881 } 9882 } /* End of start-of-match optimizations. */ 9883 9884 /* Control ends up here in all cases. 
When running under valgrind, make a 9885 pattern's terminating zero defined again. If memory was obtained for the parsed 9886 version of the pattern, free it before returning. Also free the list of named 9887 groups if a larger one had to be obtained, and likewise the group information 9888 vector. */ 9889 9890 EXIT: 9891 #ifdef SUPPORT_VALGRIND 9892 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1)); 9893 #endif 9894 if (cb.parsed_pattern != stack_parsed_pattern) 9895 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data); 9896 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) 9897 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); 9898 if (cb.groupinfo != stack_groupinfo) 9899 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data); 9900 return re; /* Will be NULL after an error */ 9901 9902 /* Errors discovered in parse_regex() set the offset value in the compile 9903 block. Errors discovered before it is called must compute it from the ptr 9904 value. After parse_regex() is called, the offset in the compile block is set to 9905 the end of the pattern, but certain errors in compile_regex() may reset it if 9906 an offset is available in the parsed pattern. */ 9907 9908 HAD_CB_ERROR: 9909 ptr = pattern + cb.erroroffset; 9910 9911 HAD_EARLY_ERROR: 9912 *erroroffset = ptr - pattern; 9913 9914 HAD_ERROR: 9915 *errorptr = errorcode; 9916 pcre2_code_free(re); 9917 re = NULL; 9918 goto EXIT; 9919 } 9920 9921 /* End of pcre2_compile.c */ 9922