1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 42 #ifdef HAVE_CONFIG_H 43 #include "config.h" 44 #endif 45 46 #define NLBLOCK cb /* Block containing newline information */ 47 #define PSSTART start_pattern /* Field containing processed string start */ 48 #define PSEND end_pattern /* Field containing processed string end */ 49 50 #include "pcre2_internal.h" 51 52 /* In rare error cases debugging might require calling pcre2_printint(). */ 53 54 #if 0 55 #ifdef EBCDIC 56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255) 57 #else 58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127) 59 #endif 60 #include "pcre2_printint.c" 61 #define CALL_PRINTINT 62 #endif 63 64 /* There are a few things that vary with different code unit sizes. Handle them 65 by defining macros in order to minimize #if usage. */ 66 67 #if PCRE2_CODE_UNIT_WIDTH == 8 68 #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5 69 #define XDIGIT(c) xdigitab[c] 70 71 #else /* Either 16-bit or 32-bit */ 72 #define XDIGIT(c) (MAX_255(c)? 
xdigitab[c] : 0xff) 73 74 #if PCRE2_CODE_UNIT_WIDTH == 16 75 #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6 76 77 #else /* 32-bit */ 78 #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6 79 #endif 80 #endif 81 82 /* Function definitions to allow mutual recursion */ 83 84 static unsigned int 85 add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *, 86 const uint32_t *, unsigned int); 87 88 static BOOL 89 compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL, 90 uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *, 91 branch_chain *, compile_block *, size_t *); 92 93 94 95 /************************************************* 96 * Code parameters and static tables * 97 *************************************************/ 98 99 /* This value specifies the size of stack workspace, which is used in different 100 ways in the different pattern scans. The group-identifying pre-scan uses it to 101 handle nesting, and needs it to be 16-bit aligned. 102 103 During the first compiling phase, when determining how much memory is required, 104 the regex is partly compiled into this space, but the compiled parts are 105 discarded as soon as they can be, so that hopefully there will never be an 106 overrun. The code does, however, check for an overrun, which can occur for 107 pathological patterns. The size of the workspace depends on LINK_SIZE because 108 the length of compiled items varies with this. 109 110 In the real compile phase, the workspace is used for remembering data about 111 numbered groups, provided there are not too many of them (if there are, extra 112 memory is acquired). For this phase the memory must be 32-bit aligned. Having 113 defined the size in code units, we set up C32_WORK_SIZE as the number of 114 elements in the 32-bit vector. 
*/ 115 116 #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Size in code units */ 117 118 #define C32_WORK_SIZE \ 119 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint32_t)) 120 121 /* The overrun tests check for a slightly smaller size so that they detect the 122 overrun before it actually does run off the end of the data block. */ 123 124 #define WORK_SIZE_SAFETY_MARGIN (100) 125 126 /* This value determines the size of the initial vector that is used for 127 remembering named groups during the pre-compile. It is allocated on the stack, 128 but if it is too small, it is expanded, in a similar way to the workspace. The 129 value is the number of slots in the list. */ 130 131 #define NAMED_GROUP_LIST_SIZE 20 132 133 /* The original PCRE required patterns to be zero-terminated, and it simplifies 134 the compiling code if it is guaranteed that there is a zero code unit at the 135 end of the pattern, because this means that tests for coding sequences such as 136 (*SKIP) or even just (?<= can check a sequence of code units without having to 137 keep checking for the end of the pattern. The new PCRE2 API allows zero code 138 units within patterns if a positive length is given, but in order to keep most 139 of the compiling code as it was, we copy such patterns and add a zero on the 140 end. This value determines the size of space on the stack that is used if the 141 pattern fits; if not, heap memory is used. */ 142 143 #define COPIED_PATTERN_SIZE 1024 144 145 /* Maximum length value to check against when making sure that the variable 146 that holds the compiled pattern length does not overflow. We make it a bit less 147 than INT_MAX to allow for adding in group terminating bytes, so that we don't 148 have to check them every time. */ 149 150 #define OFLOW_MAX (INT_MAX - 20) 151 152 /* Macro for setting individual bits in class bitmaps. It took some 153 experimenting to figure out how to stop gcc 5.3.0 from warning with 154 -Wconversion. 
This version gets a warning: 155 156 #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7)) 157 158 Let's hope the apparently less efficient version isn't actually so bad if the 159 compiler is clever with identical subexpressions. */ 160 161 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1 << ((b)&7))) 162 163 /* Private flags added to firstcu and reqcu. */ 164 165 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */ 166 #define REQ_VARY (1 << 1) /* reqcu followed non-literal item */ 167 /* Negative values for the firstcu and reqcu flags */ 168 #define REQ_UNSET (-2) /* Not yet found anything */ 169 #define REQ_NONE (-1) /* Found not fixed char */ 170 171 /* These flags are used in the groupinfo vector. */ 172 173 #define GI_SET_COULD_BE_EMPTY 0x80000000u 174 #define GI_COULD_BE_EMPTY 0x40000000u 175 #define GI_NOT_FIXED_LENGTH 0x20000000u 176 #define GI_SET_FIXED_LENGTH 0x10000000u 177 #define GI_FIXED_LENGTH_MASK 0x0000ffffu 178 179 /* This bit (which is greater than any UTF value) is used to indicate that a 180 variable contains a number of code units instead of an actual code point. */ 181 182 #define UTF_LENGTH 0x10000000l 183 184 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC 185 and is fast (a good compiler can turn it into a subtraction and unsigned 186 comparison). */ 187 188 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) 189 190 /* Table to identify hex digits. The tables in chartables are dependent on the 191 locale, and may mark arbitrary characters as digits. We want to recognize only 192 0-9, a-z, and A-Z as hex digits, which is why we have a private table here. It 193 costs 256 bytes, but it is a lot faster than doing character value tests (at 194 least in some simple cases I timed), and in some applications one wants PCRE to 195 compile efficiently as well as match efficiently. The value in the table is 196 the binary hex digit value, or 0xff for non-hex digits. 
*/ 197 198 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in 199 UTF-8 mode. */ 200 201 #ifndef EBCDIC 202 static const uint8_t xdigitab[] = 203 { 204 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */ 205 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ 206 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */ 207 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ 208 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */ 209 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */ 210 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */ 211 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */ 212 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */ 213 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */ 214 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */ 215 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */ 216 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */ 217 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */ 218 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */ 219 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */ 220 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */ 221 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */ 222 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */ 223 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */ 224 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */ 225 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */ 226 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */ 227 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ 228 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */ 229 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 2ff-207 */ 230 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */ 231 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */ 232 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */ 233 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */ 234 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */ 235 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */ 236 237 #else 238 239 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ 240 241 static const uint8_t xdigitab[] = 242 { 243 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */ 244 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */ 245 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */ 246 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */ 247 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */ 248 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */ 249 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */ 250 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */ 251 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */ 252 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */ 253 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */ 254 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */ 255 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */ 256 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? 
*/ 257 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */ 258 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */ 259 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */ 260 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */ 261 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */ 262 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */ 263 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */ 264 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */ 265 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */ 266 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */ 267 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */ 268 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */ 269 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */ 270 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */ 271 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */ 272 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */ 273 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */ 274 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */ 275 #endif /* EBCDIC */ 276 277 278 /* Table for handling alphanumeric escaped characters. Positive returns are 279 simple data values; negative values are for special things like \d and so on. 280 Zero means further processing is needed (for things like \x), or the escape is 281 invalid. */ 282 283 /* This is the "normal" table for ASCII systems or for EBCDIC systems running 284 in UTF-8 mode. It runs from '0' to 'z'. 
*/ 285 286 #ifndef EBCDIC 287 #define ESCAPES_FIRST CHAR_0 288 #define ESCAPES_LAST CHAR_z 289 #define UPPER_CASE(c) (c-32) 290 291 static const short int escapes[] = { 292 0, 0, 293 0, 0, 294 0, 0, 295 0, 0, 296 0, 0, 297 CHAR_COLON, CHAR_SEMICOLON, 298 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, 299 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, 300 CHAR_COMMERCIAL_AT, -ESC_A, 301 -ESC_B, -ESC_C, 302 -ESC_D, -ESC_E, 303 0, -ESC_G, 304 -ESC_H, 0, 305 0, -ESC_K, 306 0, 0, 307 -ESC_N, 0, 308 -ESC_P, -ESC_Q, 309 -ESC_R, -ESC_S, 310 0, 0, 311 -ESC_V, -ESC_W, 312 -ESC_X, 0, 313 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, 314 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, 315 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, 316 CHAR_GRAVE_ACCENT, ESC_a, 317 -ESC_b, 0, 318 -ESC_d, ESC_e, 319 ESC_f, 0, 320 -ESC_h, 0, 321 0, -ESC_k, 322 0, 0, 323 ESC_n, 0, 324 -ESC_p, 0, 325 ESC_r, -ESC_s, 326 ESC_tee, 0, 327 -ESC_v, -ESC_w, 328 0, 0, 329 -ESC_z 330 }; 331 332 #else 333 334 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. 335 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code 336 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a 337 because it is defined as 'a', which of course picks up the ASCII value. 
*/ 338 339 #if 'a' == 0x81 /* Check for a real EBCDIC environment */ 340 #define ESCAPES_FIRST CHAR_a 341 #define ESCAPES_LAST CHAR_9 342 #define UPPER_CASE(c) (c+64) 343 #else /* Testing in an ASCII environment */ 344 #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */ 345 #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */ 346 #define UPPER_CASE(c) (c-32) 347 #endif 348 349 static const short int escapes[] = { 350 /* 80 */ ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, 351 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, 352 /* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p, 353 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, 354 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, 355 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, 356 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 357 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', 358 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, 359 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, 360 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P, 361 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, 362 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, 363 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, 364 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, 365 /* F8 */ 0, 0 366 }; 367 368 /* We also need a table of characters that may follow \c in an EBCDIC 369 environment for characters 0-31. */ 370 371 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; 372 373 #endif /* EBCDIC */ 374 375 376 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is 377 searched linearly. Put all the names into a single string, in order to reduce 378 the number of relocations when a shared library is dynamically linked. The 379 string is built from string macros so that it works in UTF-8 mode on EBCDIC 380 platforms. 
*/ 381 382 typedef struct verbitem { 383 int len; /* Length of verb name */ 384 int op; /* Op when no arg, or -1 if arg mandatory */ 385 int op_arg; /* Op when arg present, or -1 if not allowed */ 386 } verbitem; 387 388 static const char verbnames[] = 389 "\0" /* Empty name is a shorthand for MARK */ 390 STRING_MARK0 391 STRING_ACCEPT0 392 STRING_COMMIT0 393 STRING_F0 394 STRING_FAIL0 395 STRING_PRUNE0 396 STRING_SKIP0 397 STRING_THEN; 398 399 static const verbitem verbs[] = { 400 { 0, -1, OP_MARK }, 401 { 4, -1, OP_MARK }, 402 { 6, OP_ACCEPT, -1 }, 403 { 6, OP_COMMIT, -1 }, 404 { 1, OP_FAIL, -1 }, 405 { 4, OP_FAIL, -1 }, 406 { 5, OP_PRUNE, OP_PRUNE_ARG }, 407 { 4, OP_SKIP, OP_SKIP_ARG }, 408 { 4, OP_THEN, OP_THEN_ARG } 409 }; 410 411 static const int verbcount = sizeof(verbs)/sizeof(verbitem); 412 413 414 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in 415 another regex library. */ 416 417 static const PCRE2_UCHAR sub_start_of_word[] = { 418 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, 419 CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' }; 420 421 static const PCRE2_UCHAR sub_end_of_word[] = { 422 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, 423 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, 424 CHAR_RIGHT_PARENTHESIS, '\0' }; 425 426 427 /* Tables of names of POSIX character classes and their lengths. The names are 428 now all in a single string, to reduce the number of relocations when a shared 429 library is dynamically loaded. The list of lengths is terminated by a zero 430 length entry. The first three must be alpha, lower, upper, as this is assumed 431 for handling case independence. The indices for graph, print, and punct are 432 needed, so identify them. 
*/ 433 434 static const char posix_names[] = 435 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 436 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 437 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 438 STRING_word0 STRING_xdigit; 439 440 static const uint8_t posix_name_lengths[] = { 441 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; 442 443 #define PC_GRAPH 8 444 #define PC_PRINT 9 445 #define PC_PUNCT 10 446 447 448 /* Table of class bit maps for each POSIX class. Each class is formed from a 449 base map, with an optional addition or removal of another map. Then, for some 450 classes, there is some additional tweaking: for [:blank:] the vertical space 451 characters are removed, and for [:alpha:] and [:alnum:] the underscore 452 character is removed. The triples in the table consist of the base map offset, 453 second map offset or -1 if no second map, and a non-negative value for map 454 addition or a negative value for map subtraction (if there are two maps). The 455 absolute value of the third field has these meanings: 0 => no tweaking, 1 => 456 remove vertical space characters, 2 => remove underscore. */ 457 458 static const int posix_class_maps[] = { 459 cbit_word, cbit_digit, -2, /* alpha */ 460 cbit_lower, -1, 0, /* lower */ 461 cbit_upper, -1, 0, /* upper */ 462 cbit_word, -1, 2, /* alnum - word without underscore */ 463 cbit_print, cbit_cntrl, 0, /* ascii */ 464 cbit_space, -1, 1, /* blank - a GNU extension */ 465 cbit_cntrl, -1, 0, /* cntrl */ 466 cbit_digit, -1, 0, /* digit */ 467 cbit_graph, -1, 0, /* graph */ 468 cbit_print, -1, 0, /* print */ 469 cbit_punct, -1, 0, /* punct */ 470 cbit_space, -1, 0, /* space */ 471 cbit_word, -1, 0, /* word - a Perl extension */ 472 cbit_xdigit,-1, 0 /* xdigit */ 473 }; 474 475 /* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by 476 Unicode property escapes. 
*/ 477 478 #ifdef SUPPORT_UNICODE 479 static const PCRE2_UCHAR string_PNd[] = { 480 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 481 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 482 static const PCRE2_UCHAR string_pNd[] = { 483 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 484 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 485 static const PCRE2_UCHAR string_PXsp[] = { 486 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 487 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 488 static const PCRE2_UCHAR string_pXsp[] = { 489 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 490 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 491 static const PCRE2_UCHAR string_PXwd[] = { 492 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 493 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 494 static const PCRE2_UCHAR string_pXwd[] = { 495 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 496 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 497 498 static PCRE2_SPTR substitutes[] = { 499 string_PNd, /* \D */ 500 string_pNd, /* \d */ 501 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */ 502 string_pXsp, /* \s */ /* space and POSIX space are the same. */ 503 string_PXwd, /* \W */ 504 string_pXwd /* \w */ 505 }; 506 507 /* The POSIX class substitutes must be in the order of the POSIX class names, 508 defined above, and there are both positive and negative cases. NULL means no 509 general substitute of a Unicode property escape (\p or \P). However, for some 510 POSIX classes (e.g. graph, print, punct) a special property code is compiled 511 directly. 
*/ 512 513 static const PCRE2_UCHAR string_pCc[] = { 514 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 515 CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 516 static const PCRE2_UCHAR string_pL[] = { 517 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 518 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 519 static const PCRE2_UCHAR string_pLl[] = { 520 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 521 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 522 static const PCRE2_UCHAR string_pLu[] = { 523 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 524 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 525 static const PCRE2_UCHAR string_pXan[] = { 526 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 527 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 528 static const PCRE2_UCHAR string_h[] = { 529 CHAR_BACKSLASH, CHAR_h, '\0' }; 530 static const PCRE2_UCHAR string_pXps[] = { 531 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, 532 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 533 static const PCRE2_UCHAR string_PCc[] = { 534 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 535 CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 536 static const PCRE2_UCHAR string_PL[] = { 537 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 538 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 539 static const PCRE2_UCHAR string_PLl[] = { 540 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 541 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 542 static const PCRE2_UCHAR string_PLu[] = { 543 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 544 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 545 static const PCRE2_UCHAR string_PXan[] = { 546 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 547 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; 548 static const PCRE2_UCHAR string_H[] = { 549 CHAR_BACKSLASH, CHAR_H, '\0' }; 550 static const PCRE2_UCHAR string_PXps[] = { 551 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, 552 CHAR_X, CHAR_p, CHAR_s, 
CHAR_RIGHT_CURLY_BRACKET, '\0' }; 553 554 static PCRE2_SPTR posix_substitutes[] = { 555 string_pL, /* alpha */ 556 string_pLl, /* lower */ 557 string_pLu, /* upper */ 558 string_pXan, /* alnum */ 559 NULL, /* ascii */ 560 string_h, /* blank */ 561 string_pCc, /* cntrl */ 562 string_pNd, /* digit */ 563 NULL, /* graph */ 564 NULL, /* print */ 565 NULL, /* punct */ 566 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */ 567 string_pXwd, /* word */ /* Perl and POSIX space are the same */ 568 NULL, /* xdigit */ 569 /* Negated cases */ 570 string_PL, /* ^alpha */ 571 string_PLl, /* ^lower */ 572 string_PLu, /* ^upper */ 573 string_PXan, /* ^alnum */ 574 NULL, /* ^ascii */ 575 string_H, /* ^blank */ 576 string_PCc, /* ^cntrl */ 577 string_PNd, /* ^digit */ 578 NULL, /* ^graph */ 579 NULL, /* ^print */ 580 NULL, /* ^punct */ 581 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */ 582 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */ 583 NULL /* ^xdigit */ 584 }; 585 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *)) 586 #endif /* SUPPORT_UNICODE */ 587 588 /* Masks for checking option settings. */ 589 590 #define PUBLIC_COMPILE_OPTIONS \ 591 (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ 592 PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \ 593 PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \ 594 PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ 595 PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \ 596 PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ 597 PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ 598 PCRE2_UTF) 599 600 /* Compile time error code numbers. They are given names so that they can more 601 easily be tracked. 
When a new number is added, the tables called eint1 and 602 eint2 in pcre2posix.c may need to be updated, and a new error text must be 603 added to compile_error_texts in pcre2_error.c. */ 604 605 enum { ERR0 = COMPILE_ERROR_BASE, 606 ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, 607 ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, 608 ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, 609 ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, 610 ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, 611 ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, 612 ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, 613 ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, 614 ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88 }; 615 616 /* Error codes that correspond to negative error codes returned by 617 find_fixedlength(). */ 618 619 static int fixed_length_errors[] = 620 { 621 ERR0, /* Not an error */ 622 ERR0, /* Not an error; -1 is used for "process later" */ 623 ERR25, /* Lookbehind is not fixed length */ 624 ERR36, /* \C in lookbehind is not allowed */ 625 ERR87, /* Lookbehind is too long */ 626 ERR86, /* Pattern too complicated */ 627 ERR70 /* Internal error: unknown opcode encountered */ 628 }; 629 630 /* This is a table of start-of-pattern options such as (*UTF) and settings such 631 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward 632 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is 633 generic and always supported. 
*/ 634 635 enum { PSO_OPT, /* Value is an option bit */ 636 PSO_FLG, /* Value is a flag bit */ 637 PSO_NL, /* Value is a newline type */ 638 PSO_BSR, /* Value is a \R type */ 639 PSO_LIMM, /* Read integer value for match limit */ 640 PSO_LIMR }; /* Read integer value for recursion limit */ 641 642 typedef struct pso { 643 const uint8_t *name; 644 uint16_t length; 645 uint16_t type; 646 uint32_t value; 647 } pso; 648 649 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */ 650 651 static pso pso_list[] = { 652 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, 653 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, 654 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, 655 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, 656 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, 657 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, 658 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, 659 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, 660 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, 661 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, 662 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMR, 0 }, 663 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR }, 664 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF }, 665 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF }, 666 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY }, 667 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF }, 668 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF }, 669 { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE } 670 }; 671 672 /* This table is used when converting repeating opcodes into possessified 673 versions as a result of an explicit possessive quantifier such as 
++. A zero 674 value means there is no possessified version - in those cases the item in 675 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT 676 because all relevant opcodes are less than that. */ 677 678 static const uint8_t opcode_possessify[] = { 679 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ 680 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ 681 682 0, /* NOTI */ 683 OP_POSSTAR, 0, /* STAR, MINSTAR */ 684 OP_POSPLUS, 0, /* PLUS, MINPLUS */ 685 OP_POSQUERY, 0, /* QUERY, MINQUERY */ 686 OP_POSUPTO, 0, /* UPTO, MINUPTO */ 687 0, /* EXACT */ 688 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ 689 690 OP_POSSTARI, 0, /* STARI, MINSTARI */ 691 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ 692 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ 693 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ 694 0, /* EXACTI */ 695 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ 696 697 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ 698 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ 699 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ 700 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ 701 0, /* NOTEXACT */ 702 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ 703 704 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ 705 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ 706 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ 707 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ 708 0, /* NOTEXACTI */ 709 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ 710 711 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ 712 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */ 713 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ 714 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */ 715 0, /* TYPEEXACT */ 716 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ 717 718 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ 719 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ 720 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ 721 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ 722 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */ 723 
724 0, 0, 0, /* CLASS, NCLASS, XCLASS */ 725 0, 0, /* REF, REFI */ 726 0, 0, /* DNREF, DNREFI */ 727 0, 0 /* RECURSE, CALLOUT */ 728 }; 729 730 731 732 /************************************************* 733 * Copy compiled code * 734 *************************************************/ 735 736 /* Compiled JIT code cannot be copied, so the new compiled block has no 737 associated JIT data. */ 738 739 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION 740 pcre2_code_copy(const pcre2_code *code) 741 { 742 PCRE2_SIZE* ref_count; 743 pcre2_code *newcode; 744 745 if (code == NULL) return NULL; 746 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); 747 if (newcode == NULL) return NULL; 748 memcpy(newcode, code, code->blocksize); 749 newcode->executable_jit = NULL; 750 751 /* If the code is one that has been deserialized, increment the reference count 752 in the decoded tables. */ 753 754 if ((code->flags & PCRE2_DEREF_TABLES) != 0) 755 { 756 ref_count = (PCRE2_SIZE *)(code->tables + tables_length); 757 (*ref_count)++; 758 } 759 760 return newcode; 761 } 762 763 764 765 /************************************************* 766 * Free compiled code * 767 *************************************************/ 768 769 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 770 pcre2_code_free(pcre2_code *code) 771 { 772 PCRE2_SIZE* ref_count; 773 774 if (code != NULL) 775 { 776 if (code->executable_jit != NULL) 777 PRIV(jit_free)(code->executable_jit, &code->memctl); 778 779 if ((code->flags & PCRE2_DEREF_TABLES) != 0) 780 { 781 /* Decoded tables belong to the codes after deserialization, and they must 782 be freed when there are no more reference to them. The *ref_count should 783 always be > 0. 
*/ 784 785 ref_count = (PCRE2_SIZE *)(code->tables + tables_length); 786 if (*ref_count > 0) 787 { 788 (*ref_count)--; 789 if (*ref_count == 0) 790 code->memctl.free((void *)code->tables, code->memctl.memory_data); 791 } 792 } 793 794 code->memctl.free(code, code->memctl.memory_data); 795 } 796 } 797 798 799 800 /************************************************* 801 * Insert an automatic callout point * 802 *************************************************/ 803 804 /* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert 805 callout points before each pattern item. 806 807 Arguments: 808 code current code pointer 809 ptr current pattern pointer 810 cb general compile-time data 811 812 Returns: new code pointer 813 */ 814 815 static PCRE2_UCHAR * 816 auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb) 817 { 818 code[0] = OP_CALLOUT; 819 PUT(code, 1, ptr - cb->start_pattern); /* Pattern offset */ 820 PUT(code, 1 + LINK_SIZE, 0); /* Default length */ 821 code[1 + 2*LINK_SIZE] = 255; 822 return code + PRIV(OP_lengths)[OP_CALLOUT]; 823 } 824 825 826 827 /************************************************* 828 * Complete a callout item * 829 *************************************************/ 830 831 /* A callout item contains the length of the next item in the pattern, which 832 we can't fill in till after we have reached the relevant point. This is used 833 for both automatic and manual callouts. 
834 835 Arguments: 836 previous_callout points to previous callout item 837 ptr current pattern pointer 838 cb general compile-time data 839 840 Returns: nothing 841 */ 842 843 static void 844 complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr, 845 compile_block *cb) 846 { 847 size_t length = (size_t)(ptr - cb->start_pattern - GET(previous_callout, 1)); 848 PUT(previous_callout, 1 + LINK_SIZE, length); 849 } 850 851 852 853 /************************************************* 854 * Find the fixed length of a branch * 855 *************************************************/ 856 857 /* Scan a branch and compute the fixed length of subject that will match it, if 858 the length is fixed. This is needed for dealing with lookbehind assertions. In 859 UTF mode, the result is in code units rather than bytes. The branch is 860 temporarily terminated with OP_END when this function is called. 861 862 This function is called when a lookbehind assertion is encountered, so that if 863 it fails, the error message can point to the correct place in the pattern. 864 However, we cannot do this when the assertion contains subroutine calls, 865 because they can be forward references. We solve this by remembering this case 866 and doing the check at the end; a flag specifies which mode we are running in. 867 868 Lookbehind lengths are held in 16-bit fields and the maximum value is defined 869 as LOOKBEHIND_MAX. 
870 871 Arguments: 872 code points to the start of the pattern (the bracket) 873 utf TRUE in UTF mode 874 atend TRUE if called when the pattern is complete 875 cb the "compile data" structure 876 recurses chain of recurse_check to catch mutual recursion 877 countptr pointer to counter, to catch over-complexity 878 879 Returns: if non-negative, the fixed length, 880 or -1 if an OP_RECURSE item was encountered and atend is FALSE 881 or -2 if there is no fixed length, 882 or -3 if \C was encountered (in UTF mode only) 883 or -4 if length is too long 884 or -5 if regex is too complicated 885 or -6 if an unknown opcode was encountered (internal error) 886 */ 887 888 #define FFL_LATER (-1) 889 #define FFL_NOTFIXED (-2) 890 #define FFL_BACKSLASHC (-3) 891 #define FFL_TOOLONG (-4) 892 #define FFL_TOOCOMPLICATED (-5) 893 #define FFL_UNKNOWNOP (-6) 894 895 static int 896 find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb, 897 recurse_check *recurses, int *countptr) 898 { 899 uint32_t length = 0xffffffffu; /* Unset */ 900 uint32_t group = 0; 901 uint32_t groupinfo = 0; 902 recurse_check this_recurse; 903 register uint32_t branchlength = 0; 904 register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE; 905 906 /* If this is a capturing group, we may have the answer cached, but we can only 907 use this information if there are no (?| groups in the pattern, because 908 otherwise group numbers are not unique. */ 909 910 if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA || 911 *code == OP_SCBRAPOS) 912 { 913 group = GET2(cc, 0); 914 cc += IMM2_SIZE; 915 groupinfo = cb->groupinfo[group]; 916 if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0) 917 { 918 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED; 919 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) 920 return groupinfo & GI_FIXED_LENGTH_MASK; 921 } 922 } 923 924 /* A large and/or complex regex can take too long to process. 
This can happen 925 more often when (?| groups are present in the pattern. */ 926 927 if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED; 928 929 /* Scan along the opcodes for this branch. If we get to the end of the 930 branch, check the length against that of the other branches. */ 931 932 for (;;) 933 { 934 int d; 935 PCRE2_UCHAR *ce, *cs; 936 register PCRE2_UCHAR op = *cc; 937 938 if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG; 939 940 switch (op) 941 { 942 /* We only need to continue for OP_CBRA (normal capturing bracket) and 943 OP_BRA (normal non-capturing bracket) because the other variants of these 944 opcodes are all concerned with unlimited repeated groups, which of course 945 are not of fixed length. */ 946 947 case OP_CBRA: 948 case OP_BRA: 949 case OP_ONCE: 950 case OP_ONCE_NC: 951 case OP_COND: 952 d = find_fixedlength(cc, utf, atend, cb, recurses, countptr); 953 if (d < 0) return d; 954 branchlength += (uint32_t)d; 955 do cc += GET(cc, 1); while (*cc == OP_ALT); 956 cc += 1 + LINK_SIZE; 957 break; 958 959 /* Reached end of a branch; if it's a ket it is the end of a nested call. 960 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively 961 an ALT. If it is END it's the end of the outer call. All can be handled by 962 the same code. Note that we must not include the OP_KETRxxx opcodes here, 963 because they all imply an unlimited repeat. */ 964 965 case OP_ALT: 966 case OP_KET: 967 case OP_END: 968 case OP_ACCEPT: 969 case OP_ASSERT_ACCEPT: 970 if (length == 0xffffffffu) length = branchlength; 971 else if (length != branchlength) goto ISNOTFIXED; 972 if (*cc != OP_ALT) 973 { 974 if (group > 0) 975 { 976 groupinfo |= (uint32_t)(GI_SET_FIXED_LENGTH | length); 977 cb->groupinfo[group] = groupinfo; 978 } 979 return (int)length; 980 } 981 cc += 1 + LINK_SIZE; 982 branchlength = 0; 983 break; 984 985 /* A true recursion implies not fixed length, but a subroutine call may 986 be OK. 
If the subroutine is a forward reference, we can't deal with 987 it until the end of the pattern, so return FFL_LATER. */ 988 989 case OP_RECURSE: 990 if (!atend) return FFL_LATER; 991 cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */ 992 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ 993 if (cc > cs && cc < ce) goto ISNOTFIXED; /* Recursion */ 994 else /* Check for mutual recursion */ 995 { 996 recurse_check *r = recurses; 997 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 998 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */ 999 } 1000 this_recurse.prev = recurses; 1001 this_recurse.group = cs; 1002 d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr); 1003 if (d < 0) return d; 1004 branchlength += (uint32_t)d; 1005 cc += 1 + LINK_SIZE; 1006 break; 1007 1008 /* Skip over assertive subpatterns. Note that we must increment cc by 1009 1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive 1010 situation this assertion may be the one that is ultimately being checked 1011 for having a fixed length, in which case its terminating OP_KET will have 1012 been temporarily replaced by OP_END. 
*/ 1013 1014 case OP_ASSERT: 1015 case OP_ASSERT_NOT: 1016 case OP_ASSERTBACK: 1017 case OP_ASSERTBACK_NOT: 1018 do cc += GET(cc, 1); while (*cc == OP_ALT); 1019 cc += 1 + LINK_SIZE; 1020 break; 1021 1022 /* Skip over things that don't match chars */ 1023 1024 case OP_MARK: 1025 case OP_PRUNE_ARG: 1026 case OP_SKIP_ARG: 1027 case OP_THEN_ARG: 1028 cc += cc[1] + PRIV(OP_lengths)[*cc]; 1029 break; 1030 1031 case OP_CALLOUT: 1032 case OP_CIRC: 1033 case OP_CIRCM: 1034 case OP_CLOSE: 1035 case OP_COMMIT: 1036 case OP_CREF: 1037 case OP_FALSE: 1038 case OP_TRUE: 1039 case OP_DNCREF: 1040 case OP_DNRREF: 1041 case OP_DOLL: 1042 case OP_DOLLM: 1043 case OP_EOD: 1044 case OP_EODN: 1045 case OP_FAIL: 1046 case OP_NOT_WORD_BOUNDARY: 1047 case OP_PRUNE: 1048 case OP_REVERSE: 1049 case OP_RREF: 1050 case OP_SET_SOM: 1051 case OP_SKIP: 1052 case OP_SOD: 1053 case OP_SOM: 1054 case OP_THEN: 1055 case OP_WORD_BOUNDARY: 1056 cc += PRIV(OP_lengths)[*cc]; 1057 break; 1058 1059 case OP_CALLOUT_STR: 1060 cc += GET(cc, 1 + 2*LINK_SIZE); 1061 break; 1062 1063 /* Handle literal characters */ 1064 1065 case OP_CHAR: 1066 case OP_CHARI: 1067 case OP_NOT: 1068 case OP_NOTI: 1069 branchlength++; 1070 cc += 2; 1071 #ifdef SUPPORT_UNICODE 1072 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 1073 #endif 1074 break; 1075 1076 /* Handle exact repetitions. The count is already in characters, but we 1077 need to skip over a multibyte character in UTF8 mode. 
*/ 1078 1079 case OP_EXACT: 1080 case OP_EXACTI: 1081 case OP_NOTEXACT: 1082 case OP_NOTEXACTI: 1083 branchlength += GET2(cc,1); 1084 cc += 2 + IMM2_SIZE; 1085 #ifdef SUPPORT_UNICODE 1086 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 1087 #endif 1088 break; 1089 1090 case OP_TYPEEXACT: 1091 branchlength += GET2(cc,1); 1092 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) 1093 cc += 2; 1094 cc += 1 + IMM2_SIZE + 1; 1095 break; 1096 1097 /* Handle single-char matchers */ 1098 1099 case OP_PROP: 1100 case OP_NOTPROP: 1101 cc += 2; 1102 /* Fall through */ 1103 1104 case OP_HSPACE: 1105 case OP_VSPACE: 1106 case OP_NOT_HSPACE: 1107 case OP_NOT_VSPACE: 1108 case OP_NOT_DIGIT: 1109 case OP_DIGIT: 1110 case OP_NOT_WHITESPACE: 1111 case OP_WHITESPACE: 1112 case OP_NOT_WORDCHAR: 1113 case OP_WORDCHAR: 1114 case OP_ANY: 1115 case OP_ALLANY: 1116 branchlength++; 1117 cc++; 1118 break; 1119 1120 /* The single-byte matcher isn't allowed. This only happens in UTF-8 or 1121 UTF-16 mode; otherwise \C is coded as OP_ALLANY. */ 1122 1123 case OP_ANYBYTE: 1124 return FFL_BACKSLASHC; 1125 1126 /* Check a class for variable quantification */ 1127 1128 case OP_CLASS: 1129 case OP_NCLASS: 1130 #ifdef SUPPORT_WIDE_CHARS 1131 case OP_XCLASS: 1132 /* The original code caused an unsigned overflow in 64 bit systems, 1133 so now we use a conditional statement. 
*/ 1134 if (op == OP_XCLASS) 1135 cc += GET(cc, 1); 1136 else 1137 cc += PRIV(OP_lengths)[OP_CLASS]; 1138 #else 1139 cc += PRIV(OP_lengths)[OP_CLASS]; 1140 #endif 1141 1142 switch (*cc) 1143 { 1144 case OP_CRSTAR: 1145 case OP_CRMINSTAR: 1146 case OP_CRPLUS: 1147 case OP_CRMINPLUS: 1148 case OP_CRQUERY: 1149 case OP_CRMINQUERY: 1150 case OP_CRPOSSTAR: 1151 case OP_CRPOSPLUS: 1152 case OP_CRPOSQUERY: 1153 goto ISNOTFIXED; 1154 1155 case OP_CRRANGE: 1156 case OP_CRMINRANGE: 1157 case OP_CRPOSRANGE: 1158 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED; 1159 branchlength += GET2(cc,1); 1160 cc += 1 + 2 * IMM2_SIZE; 1161 break; 1162 1163 default: 1164 branchlength++; 1165 } 1166 break; 1167 1168 /* Anything else is variable length */ 1169 1170 case OP_ANYNL: 1171 case OP_BRAMINZERO: 1172 case OP_BRAPOS: 1173 case OP_BRAPOSZERO: 1174 case OP_BRAZERO: 1175 case OP_CBRAPOS: 1176 case OP_EXTUNI: 1177 case OP_KETRMAX: 1178 case OP_KETRMIN: 1179 case OP_KETRPOS: 1180 case OP_MINPLUS: 1181 case OP_MINPLUSI: 1182 case OP_MINQUERY: 1183 case OP_MINQUERYI: 1184 case OP_MINSTAR: 1185 case OP_MINSTARI: 1186 case OP_MINUPTO: 1187 case OP_MINUPTOI: 1188 case OP_NOTMINPLUS: 1189 case OP_NOTMINPLUSI: 1190 case OP_NOTMINQUERY: 1191 case OP_NOTMINQUERYI: 1192 case OP_NOTMINSTAR: 1193 case OP_NOTMINSTARI: 1194 case OP_NOTMINUPTO: 1195 case OP_NOTMINUPTOI: 1196 case OP_NOTPLUS: 1197 case OP_NOTPLUSI: 1198 case OP_NOTPOSPLUS: 1199 case OP_NOTPOSPLUSI: 1200 case OP_NOTPOSQUERY: 1201 case OP_NOTPOSQUERYI: 1202 case OP_NOTPOSSTAR: 1203 case OP_NOTPOSSTARI: 1204 case OP_NOTPOSUPTO: 1205 case OP_NOTPOSUPTOI: 1206 case OP_NOTQUERY: 1207 case OP_NOTQUERYI: 1208 case OP_NOTSTAR: 1209 case OP_NOTSTARI: 1210 case OP_NOTUPTO: 1211 case OP_NOTUPTOI: 1212 case OP_PLUS: 1213 case OP_PLUSI: 1214 case OP_POSPLUS: 1215 case OP_POSPLUSI: 1216 case OP_POSQUERY: 1217 case OP_POSQUERYI: 1218 case OP_POSSTAR: 1219 case OP_POSSTARI: 1220 case OP_POSUPTO: 1221 case OP_POSUPTOI: 1222 case OP_QUERY: 1223 
case OP_QUERYI: 1224 case OP_REF: 1225 case OP_REFI: 1226 case OP_DNREF: 1227 case OP_DNREFI: 1228 case OP_SBRA: 1229 case OP_SBRAPOS: 1230 case OP_SCBRA: 1231 case OP_SCBRAPOS: 1232 case OP_SCOND: 1233 case OP_SKIPZERO: 1234 case OP_STAR: 1235 case OP_STARI: 1236 case OP_TYPEMINPLUS: 1237 case OP_TYPEMINQUERY: 1238 case OP_TYPEMINSTAR: 1239 case OP_TYPEMINUPTO: 1240 case OP_TYPEPLUS: 1241 case OP_TYPEPOSPLUS: 1242 case OP_TYPEPOSQUERY: 1243 case OP_TYPEPOSSTAR: 1244 case OP_TYPEPOSUPTO: 1245 case OP_TYPEQUERY: 1246 case OP_TYPESTAR: 1247 case OP_TYPEUPTO: 1248 case OP_UPTO: 1249 case OP_UPTOI: 1250 goto ISNOTFIXED; 1251 1252 /* Catch unrecognized opcodes so that when new ones are added they 1253 are not forgotten, as has happened in the past. */ 1254 1255 default: 1256 return FFL_UNKNOWNOP; 1257 } 1258 } 1259 /* Control never gets here except by goto. */ 1260 1261 ISNOTFIXED: 1262 if (group > 0) 1263 { 1264 groupinfo |= GI_NOT_FIXED_LENGTH; 1265 cb->groupinfo[group] = groupinfo; 1266 } 1267 return FFL_NOTFIXED; 1268 } 1269 1270 1271 1272 /************************************************* 1273 * Find first significant op code * 1274 *************************************************/ 1275 1276 /* This is called by several functions that scan a compiled expression looking 1277 for a fixed first character, or an anchoring op code etc. It skips over things 1278 that do not influence this. For some calls, it makes sense to skip negative 1279 forward and all backward assertions, and also the \b assertion; for others it 1280 does not. 
1281 1282 Arguments: 1283 code pointer to the start of the group 1284 skipassert TRUE if certain assertions are to be skipped 1285 1286 Returns: pointer to the first significant opcode 1287 */ 1288 1289 static const PCRE2_UCHAR* 1290 first_significant_code(PCRE2_SPTR code, BOOL skipassert) 1291 { 1292 for (;;) 1293 { 1294 switch ((int)*code) 1295 { 1296 case OP_ASSERT_NOT: 1297 case OP_ASSERTBACK: 1298 case OP_ASSERTBACK_NOT: 1299 if (!skipassert) return code; 1300 do code += GET(code, 1); while (*code == OP_ALT); 1301 code += PRIV(OP_lengths)[*code]; 1302 break; 1303 1304 case OP_WORD_BOUNDARY: 1305 case OP_NOT_WORD_BOUNDARY: 1306 if (!skipassert) return code; 1307 /* Fall through */ 1308 1309 case OP_CALLOUT: 1310 case OP_CREF: 1311 case OP_DNCREF: 1312 case OP_RREF: 1313 case OP_DNRREF: 1314 case OP_FALSE: 1315 case OP_TRUE: 1316 code += PRIV(OP_lengths)[*code]; 1317 break; 1318 1319 case OP_CALLOUT_STR: 1320 code += GET(code, 1 + 2*LINK_SIZE); 1321 break; 1322 1323 default: 1324 return code; 1325 } 1326 } 1327 /* Control never reaches here */ 1328 } 1329 1330 1331 1332 /************************************************* 1333 * Scan compiled branch for non-emptiness * 1334 *************************************************/ 1335 1336 /* This function scans through a branch of a compiled pattern to see whether it 1337 can match the empty string. It is called at the end of compiling to check the 1338 entire pattern, and from compile_branch() when checking for an unlimited repeat 1339 of a group that can match nothing. In the latter case it is called only when 1340 doing the real compile, not during the pre-compile that measures the size of 1341 the compiled pattern. 1342 1343 Note that first_significant_code() skips over backward and negative forward 1344 assertions when its final argument is TRUE. If we hit an unclosed bracket, we 1345 return "empty" - this means we've struck an inner bracket whose current branch 1346 will already have been scanned. 
1347 1348 Arguments: 1349 code points to start of search 1350 endcode points to where to stop 1351 utf TRUE if in UTF mode 1352 cb compile data 1353 atend TRUE if being called to check an entire pattern 1354 recurses chain of recurse_check to catch mutual recursion 1355 countptr pointer to count to catch over-complicated pattern 1356 1357 Returns: 0 if what is matched cannot be empty 1358 1 if what is matched could be empty 1359 -1 if the pattern is too complicated 1360 */ 1361 1362 #define CBE_NOTEMPTY 0 1363 #define CBE_EMPTY 1 1364 #define CBE_TOOCOMPLICATED (-1) 1365 1366 1367 static int 1368 could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf, 1369 compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr) 1370 { 1371 uint32_t group = 0; 1372 uint32_t groupinfo = 0; 1373 register PCRE2_UCHAR c; 1374 recurse_check this_recurse; 1375 1376 /* If what we are checking has already been set as "could be empty", we know 1377 the answer. */ 1378 1379 if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY; 1380 1381 /* If this is a capturing group, we may have the answer cached, but we can only 1382 use this information if there are no (?| groups in the pattern, because 1383 otherwise group numbers are not unique. */ 1384 1385 if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 && 1386 (*code == OP_CBRA || *code == OP_CBRAPOS)) 1387 { 1388 group = GET2(code, 1 + LINK_SIZE); 1389 groupinfo = cb->groupinfo[group]; 1390 if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0) 1391 return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY; 1392 } 1393 1394 /* A large and/or complex regex can take too long to process. We have to assume 1395 it can match an empty string. This can happen more often when (?| groups are 1396 present in the pattern and the caching is disabled. Setting the cap at 1100 1397 allows the test for more than 1023 capturing patterns to work. 
*/ 1398 1399 if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED; 1400 1401 /* Scan the opcodes for this branch. */ 1402 1403 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); 1404 code < endcode; 1405 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE)) 1406 { 1407 PCRE2_SPTR ccode; 1408 1409 c = *code; 1410 1411 /* Skip over forward assertions; the other assertions are skipped by 1412 first_significant_code() with a TRUE final argument. */ 1413 1414 if (c == OP_ASSERT) 1415 { 1416 do code += GET(code, 1); while (*code == OP_ALT); 1417 c = *code; 1418 continue; 1419 } 1420 1421 /* For a recursion/subroutine call we can scan the recursion when this 1422 function is called at the end, to check a complete pattern. Before then, 1423 recursions just have the group number as their argument and in any case may 1424 be forward references. In that situation, we return CBE_EMPTY, just in case. 1425 It means that unlimited repeats of groups that contain recursions are always 1426 treated as "could be empty" - which just adds a bit more processing time 1427 because of the runtime check. */ 1428 1429 if (c == OP_RECURSE) 1430 { 1431 PCRE2_SPTR scode, endgroup; 1432 BOOL empty_branch; 1433 1434 if (!atend) goto ISTRUE; 1435 scode = cb->start_code + GET(code, 1); 1436 endgroup = scode; 1437 1438 /* We need to detect whether this is a recursive call, as otherwise there 1439 will be an infinite loop. If it is a recursion, just skip over it. Simple 1440 recursions are easily detected. For mutual recursions we keep a chain on 1441 the stack. 
*/ 1442 1443 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); 1444 if (code >= scode && code <= endgroup) continue; /* Simple recursion */ 1445 else 1446 { 1447 recurse_check *r = recurses; 1448 for (r = recurses; r != NULL; r = r->prev) 1449 if (r->group == scode) break; 1450 if (r != NULL) continue; /* Mutual recursion */ 1451 } 1452 1453 /* Scan the referenced group, remembering it on the stack chain to detect 1454 mutual recursions. */ 1455 1456 empty_branch = FALSE; 1457 this_recurse.prev = recurses; 1458 this_recurse.group = scode; 1459 1460 do 1461 { 1462 int rc = could_be_empty_branch(scode, endcode, utf, cb, atend, 1463 &this_recurse, countptr); 1464 if (rc < 0) return rc; 1465 if (rc > 0) 1466 { 1467 empty_branch = TRUE; 1468 break; 1469 } 1470 scode += GET(scode, 1); 1471 } 1472 while (*scode == OP_ALT); 1473 1474 if (!empty_branch) goto ISFALSE; /* All branches are non-empty */ 1475 continue; 1476 } 1477 1478 /* Groups with zero repeats can of course be empty; skip them. */ 1479 1480 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO || 1481 c == OP_BRAPOSZERO) 1482 { 1483 code += PRIV(OP_lengths)[c]; 1484 do code += GET(code, 1); while (*code == OP_ALT); 1485 c = *code; 1486 continue; 1487 } 1488 1489 /* A nested group that is already marked as "could be empty" can just be 1490 skipped. */ 1491 1492 if (c == OP_SBRA || c == OP_SBRAPOS || 1493 c == OP_SCBRA || c == OP_SCBRAPOS) 1494 { 1495 do code += GET(code, 1); while (*code == OP_ALT); 1496 c = *code; 1497 continue; 1498 } 1499 1500 /* For other groups, scan the branches. 
*/ 1501 1502 if (c == OP_BRA || c == OP_BRAPOS || 1503 c == OP_CBRA || c == OP_CBRAPOS || 1504 c == OP_ONCE || c == OP_ONCE_NC || 1505 c == OP_COND || c == OP_SCOND) 1506 { 1507 BOOL empty_branch; 1508 if (GET(code, 1) == 0) goto ISTRUE; /* Hit unclosed bracket */ 1509 1510 /* If a conditional group has only one branch, there is a second, implied, 1511 empty branch, so just skip over the conditional, because it could be empty. 1512 Otherwise, scan the individual branches of the group. */ 1513 1514 if (c == OP_COND && code[GET(code, 1)] != OP_ALT) 1515 code += GET(code, 1); 1516 else 1517 { 1518 empty_branch = FALSE; 1519 do 1520 { 1521 if (!empty_branch) 1522 { 1523 int rc = could_be_empty_branch(code, endcode, utf, cb, atend, 1524 recurses, countptr); 1525 if (rc < 0) return rc; 1526 if (rc > 0) empty_branch = TRUE; 1527 } 1528 code += GET(code, 1); 1529 } 1530 while (*code == OP_ALT); 1531 if (!empty_branch) goto ISFALSE; /* All branches are non-empty */ 1532 } 1533 1534 c = *code; 1535 continue; 1536 } 1537 1538 /* Handle the other opcodes */ 1539 1540 switch (c) 1541 { 1542 /* Check for quantifiers after a class. XCLASS is used for classes that 1543 cannot be represented just by a bit map. This includes negated single 1544 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the 1545 actual length is stored in the compiled code, so we must update "code" 1546 here. 
*/ 1547 1548 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 1549 case OP_XCLASS: 1550 ccode = code += GET(code, 1); 1551 goto CHECK_CLASS_REPEAT; 1552 #endif 1553 1554 case OP_CLASS: 1555 case OP_NCLASS: 1556 ccode = code + PRIV(OP_lengths)[OP_CLASS]; 1557 1558 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 1559 CHECK_CLASS_REPEAT: 1560 #endif 1561 1562 switch (*ccode) 1563 { 1564 case OP_CRSTAR: /* These could be empty; continue */ 1565 case OP_CRMINSTAR: 1566 case OP_CRQUERY: 1567 case OP_CRMINQUERY: 1568 case OP_CRPOSSTAR: 1569 case OP_CRPOSQUERY: 1570 break; 1571 1572 default: /* Non-repeat => class must match */ 1573 case OP_CRPLUS: /* These repeats aren't empty */ 1574 case OP_CRMINPLUS: 1575 case OP_CRPOSPLUS: 1576 goto ISFALSE; 1577 1578 case OP_CRRANGE: 1579 case OP_CRMINRANGE: 1580 case OP_CRPOSRANGE: 1581 if (GET2(ccode, 1) > 0) goto ISFALSE; /* Minimum > 0 */ 1582 break; 1583 } 1584 break; 1585 1586 /* Opcodes that must match a character */ 1587 1588 case OP_ANY: 1589 case OP_ALLANY: 1590 case OP_ANYBYTE: 1591 1592 case OP_PROP: 1593 case OP_NOTPROP: 1594 case OP_ANYNL: 1595 1596 case OP_NOT_HSPACE: 1597 case OP_HSPACE: 1598 case OP_NOT_VSPACE: 1599 case OP_VSPACE: 1600 case OP_EXTUNI: 1601 1602 case OP_NOT_DIGIT: 1603 case OP_DIGIT: 1604 case OP_NOT_WHITESPACE: 1605 case OP_WHITESPACE: 1606 case OP_NOT_WORDCHAR: 1607 case OP_WORDCHAR: 1608 1609 case OP_CHAR: 1610 case OP_CHARI: 1611 case OP_NOT: 1612 case OP_NOTI: 1613 1614 case OP_PLUS: 1615 case OP_PLUSI: 1616 case OP_MINPLUS: 1617 case OP_MINPLUSI: 1618 1619 case OP_NOTPLUS: 1620 case OP_NOTPLUSI: 1621 case OP_NOTMINPLUS: 1622 case OP_NOTMINPLUSI: 1623 1624 case OP_POSPLUS: 1625 case OP_POSPLUSI: 1626 case OP_NOTPOSPLUS: 1627 case OP_NOTPOSPLUSI: 1628 1629 case OP_EXACT: 1630 case OP_EXACTI: 1631 case OP_NOTEXACT: 1632 case OP_NOTEXACTI: 1633 1634 case OP_TYPEPLUS: 1635 case OP_TYPEMINPLUS: 1636 case OP_TYPEPOSPLUS: 1637 case OP_TYPEEXACT: 1638 goto ISFALSE; 1639 1640 /* These 
are going to continue, as they may be empty, but we have to 1641 fudge the length for the \p and \P cases. */ 1642 1643 case OP_TYPESTAR: 1644 case OP_TYPEMINSTAR: 1645 case OP_TYPEPOSSTAR: 1646 case OP_TYPEQUERY: 1647 case OP_TYPEMINQUERY: 1648 case OP_TYPEPOSQUERY: 1649 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 1650 break; 1651 1652 /* Same for these */ 1653 1654 case OP_TYPEUPTO: 1655 case OP_TYPEMINUPTO: 1656 case OP_TYPEPOSUPTO: 1657 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 1658 code += 2; 1659 break; 1660 1661 /* End of branch */ 1662 1663 case OP_KET: 1664 case OP_KETRMAX: 1665 case OP_KETRMIN: 1666 case OP_KETRPOS: 1667 case OP_ALT: 1668 goto ISTRUE; 1669 1670 /* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, 1671 POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative 1672 versions may be followed by a multibyte character. */ 1673 1674 #ifdef MAYBE_UTF_MULTI 1675 case OP_STAR: 1676 case OP_STARI: 1677 case OP_NOTSTAR: 1678 case OP_NOTSTARI: 1679 1680 case OP_MINSTAR: 1681 case OP_MINSTARI: 1682 case OP_NOTMINSTAR: 1683 case OP_NOTMINSTARI: 1684 1685 case OP_POSSTAR: 1686 case OP_POSSTARI: 1687 case OP_NOTPOSSTAR: 1688 case OP_NOTPOSSTARI: 1689 1690 case OP_QUERY: 1691 case OP_QUERYI: 1692 case OP_NOTQUERY: 1693 case OP_NOTQUERYI: 1694 1695 case OP_MINQUERY: 1696 case OP_MINQUERYI: 1697 case OP_NOTMINQUERY: 1698 case OP_NOTMINQUERYI: 1699 1700 case OP_POSQUERY: 1701 case OP_POSQUERYI: 1702 case OP_NOTPOSQUERY: 1703 case OP_NOTPOSQUERYI: 1704 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); 1705 break; 1706 1707 case OP_UPTO: 1708 case OP_UPTOI: 1709 case OP_NOTUPTO: 1710 case OP_NOTUPTOI: 1711 1712 case OP_MINUPTO: 1713 case OP_MINUPTOI: 1714 case OP_NOTMINUPTO: 1715 case OP_NOTMINUPTOI: 1716 1717 case OP_POSUPTO: 1718 case OP_POSUPTOI: 1719 case OP_NOTPOSUPTO: 1720 case OP_NOTPOSUPTOI: 1721 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += 
GET_EXTRALEN(code[1 + IMM2_SIZE]); 1722 break; 1723 #endif /* MAYBE_UTF_MULTI */ 1724 1725 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument 1726 string. */ 1727 1728 case OP_MARK: 1729 case OP_PRUNE_ARG: 1730 case OP_SKIP_ARG: 1731 case OP_THEN_ARG: 1732 code += code[1]; 1733 break; 1734 1735 /* None of the remaining opcodes are required to match a character. */ 1736 1737 default: 1738 break; 1739 } 1740 } 1741 1742 ISTRUE: 1743 groupinfo |= GI_COULD_BE_EMPTY; 1744 1745 ISFALSE: 1746 if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY; 1747 1748 return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY; 1749 } 1750 1751 1752 1753 /************************************************* 1754 * Check for counted repeat * 1755 *************************************************/ 1756 1757 /* This function is called when a '{' is encountered in a place where it might 1758 start a quantifier. It looks ahead to see if it really is a quantifier, that 1759 is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits. 1760 1761 Argument: pointer to the first char after '{' 1762 Returns: TRUE or FALSE 1763 */ 1764 1765 static BOOL 1766 is_counted_repeat(PCRE2_SPTR p) 1767 { 1768 if (!IS_DIGIT(*p)) return FALSE; 1769 p++; 1770 while (IS_DIGIT(*p)) p++; 1771 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; 1772 1773 if (*p++ != CHAR_COMMA) return FALSE; 1774 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; 1775 1776 if (!IS_DIGIT(*p)) return FALSE; 1777 p++; 1778 while (IS_DIGIT(*p)) p++; 1779 1780 return (*p == CHAR_RIGHT_CURLY_BRACKET); 1781 } 1782 1783 1784 1785 /************************************************* 1786 * Handle escapes * 1787 *************************************************/ 1788 1789 /* This function is called when a \ has been encountered. It either returns a 1790 positive value for a simple escape such as \d, or 0 for a data character, which 1791 is placed in chptr. 
A backreference to group n is returned as negative n. On 1792 entry, ptr is pointing at the \. On exit, it points the final code unit of the 1793 escape sequence. 1794 1795 This function is also called from pcre2_substitute() to handle escape sequences 1796 in replacement strings. In this case, the cb argument is NULL, and only 1797 sequences that define a data character are recognised. The isclass argument is 1798 not relevant, but the options argument is the final value of the compiled 1799 pattern's options. 1800 1801 There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is 1802 processed, it is replaced by a nested alternative sequence. If this contains a 1803 backslash (which is usually does), ptrend does not point to its end - it still 1804 points to the end of the whole pattern. However, we can detect this case 1805 because cb->nestptr[0] will be non-NULL. The nested sequences are all zero- 1806 terminated and there are only ever two levels of nesting. 1807 1808 Arguments: 1809 ptrptr points to the input position pointer 1810 ptrend points to the end of the input 1811 chptr points to a returned data character 1812 errorcodeptr points to the errorcode variable (containing zero) 1813 options the current options bits 1814 isclass TRUE if inside a character class 1815 cb compile data block 1816 1817 Returns: zero => a data character 1818 positive => a special escape sequence 1819 negative => a back reference 1820 on error, errorcodeptr is set non-zero 1821 */ 1822 1823 int 1824 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, 1825 int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb) 1826 { 1827 BOOL utf = (options & PCRE2_UTF) != 0; 1828 PCRE2_SPTR ptr = *ptrptr + 1; 1829 register uint32_t c, cc; 1830 int escape = 0; 1831 int i; 1832 1833 /* Find the end of a nested insert. 
*/ 1834 1835 if (cb != NULL && cb->nestptr[0] != NULL) 1836 ptrend = ptr + PRIV(strlen)(ptr); 1837 1838 /* If backslash is at the end of the string, it's an error. */ 1839 1840 if (ptr >= ptrend) 1841 { 1842 *errorcodeptr = ERR1; 1843 return 0; 1844 } 1845 1846 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ 1847 ptr--; /* Set pointer back to the last code unit */ 1848 1849 /* Non-alphanumerics are literals, so we just leave the value in c. An initial 1850 value test saves a memory lookup for code points outside the alphanumeric 1851 range. Otherwise, do a table lookup. A non-zero result is something that can be 1852 returned immediately. Otherwise further processing is required. */ 1853 1854 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ 1855 1856 else if ((i = escapes[c - ESCAPES_FIRST]) != 0) 1857 { 1858 if (i > 0) c = (uint32_t)i; else /* Positive is a data character */ 1859 { 1860 escape = -i; /* Else return a special escape */ 1861 if (escape == ESC_P || escape == ESC_p || escape == ESC_X) 1862 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ 1863 } 1864 } 1865 1866 /* Escapes that need further processing, including those that are unknown. 1867 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u 1868 when BSUX is set). */ 1869 1870 else 1871 { 1872 PCRE2_SPTR oldptr; 1873 BOOL braced, negated, overflow; 1874 unsigned int s; 1875 1876 /* Filter calls from pcre2_substitute(). */ 1877 1878 if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x && 1879 (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0)) 1880 { 1881 *errorcodeptr = ERR3; 1882 return 0; 1883 } 1884 1885 switch (c) 1886 { 1887 /* A number of Perl escapes are not handled by PCRE. We give an explicit 1888 error. */ 1889 1890 case CHAR_l: 1891 case CHAR_L: 1892 *errorcodeptr = ERR37; 1893 break; 1894 1895 /* \u is unrecognized when PCRE2_ALT_BSUX is not set. 
When it is treated 1896 specially, \u must be followed by four hex digits. Otherwise it is a 1897 lowercase u letter. */ 1898 1899 case CHAR_u: 1900 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else 1901 { 1902 uint32_t xc; 1903 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ 1904 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ 1905 cc = (cc << 4) | xc; 1906 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ 1907 cc = (cc << 4) | xc; 1908 if ((xc = XDIGIT(ptr[4])) == 0xff) break; /* Not a hex digit */ 1909 c = (cc << 4) | xc; 1910 ptr += 4; 1911 if (utf) 1912 { 1913 if (c > 0x10ffffU) *errorcodeptr = ERR77; 1914 else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; 1915 } 1916 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; 1917 } 1918 break; 1919 1920 case CHAR_U: 1921 /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an 1922 upper case letter. */ 1923 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; 1924 break; 1925 1926 /* In a character class, \g is just a literal "g". Outside a character 1927 class, \g must be followed by one of a number of specific things: 1928 1929 (1) A number, either plain or braced. If positive, it is an absolute 1930 backreference. If negative, it is a relative backreference. This is a Perl 1931 5.10 feature. 1932 1933 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This 1934 is part of Perl's movement towards a unified syntax for back references. As 1935 this is synonymous with \k{name}, we fudge it up by pretending it really 1936 was \k. 1937 1938 (3) For Oniguruma compatibility we also support \g followed by a name or a 1939 number either in angle brackets or in single quotes. However, these are 1940 (possibly recursive) subroutine calls, _not_ backreferences. Just return 1941 the ESC_g code (cf \k). 
*/ 1942 1943 case CHAR_g: 1944 if (isclass) break; 1945 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) 1946 { 1947 escape = ESC_g; 1948 break; 1949 } 1950 1951 /* Handle the Perl-compatible cases */ 1952 1953 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) 1954 { 1955 PCRE2_SPTR p; 1956 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++) 1957 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break; 1958 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET) 1959 { 1960 escape = ESC_k; 1961 break; 1962 } 1963 braced = TRUE; 1964 ptr++; 1965 } 1966 else braced = FALSE; 1967 1968 if (ptr[1] == CHAR_MINUS) 1969 { 1970 negated = TRUE; 1971 ptr++; 1972 } 1973 else negated = FALSE; 1974 1975 /* The integer range is limited by the machine's int representation. */ 1976 s = 0; 1977 overflow = FALSE; 1978 while (IS_DIGIT(ptr[1])) 1979 { 1980 if (s > INT_MAX / 10 - 1) /* Integer overflow */ 1981 { 1982 overflow = TRUE; 1983 break; 1984 } 1985 s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0); 1986 } 1987 if (overflow) /* Integer overflow */ 1988 { 1989 while (IS_DIGIT(ptr[1])) ptr++; 1990 *errorcodeptr = ERR61; 1991 break; 1992 } 1993 1994 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) 1995 { 1996 *errorcodeptr = ERR57; 1997 break; 1998 } 1999 2000 if (s == 0) 2001 { 2002 *errorcodeptr = ERR58; 2003 break; 2004 } 2005 2006 if (negated) 2007 { 2008 if (s > cb->bracount) 2009 { 2010 *errorcodeptr = ERR15; 2011 break; 2012 } 2013 s = cb->bracount - (s - 1); 2014 } 2015 2016 escape = -(int)s; 2017 break; 2018 2019 /* The handling of escape sequences consisting of a string of digits 2020 starting with one that is not zero is not straightforward. Perl has changed 2021 over the years. Nowadays \g{} for backreferences and \o{} for octal are 2022 recommended to avoid the ambiguities in the old syntax. 2023 2024 Outside a character class, the digits are read as a decimal number. 
If the 2025 number is less than 10, or if there are that many previous extracting left 2026 brackets, it is a back reference. Otherwise, up to three octal digits are 2027 read to form an escaped character code. Thus \123 is likely to be octal 123 2028 (cf \0123, which is octal 012 followed by the literal 3). 2029 2030 Inside a character class, \ followed by a digit is always either a literal 2031 8 or 9 or an octal number. */ 2032 2033 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: 2034 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 2035 2036 if (!isclass) 2037 { 2038 oldptr = ptr; 2039 /* The integer range is limited by the machine's int representation. */ 2040 s = c - CHAR_0; 2041 overflow = FALSE; 2042 while (IS_DIGIT(ptr[1])) 2043 { 2044 if (s > INT_MAX / 10 - 1) /* Integer overflow */ 2045 { 2046 overflow = TRUE; 2047 break; 2048 } 2049 s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0); 2050 } 2051 if (overflow) /* Integer overflow */ 2052 { 2053 while (IS_DIGIT(ptr[1])) ptr++; 2054 *errorcodeptr = ERR61; 2055 break; 2056 } 2057 2058 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x 2059 are octal escapes if there are not that many previous captures. */ 2060 2061 if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount) 2062 { 2063 escape = -(int)s; /* Indicates a back reference */ 2064 break; 2065 } 2066 ptr = oldptr; /* Put the pointer back and fall through */ 2067 } 2068 2069 /* Handle a digit following \ when the number is not a back reference, or 2070 we are within a character class. If the first digit is 8 or 9, Perl used to 2071 generate a binary zero byte and then treat the digit as a following 2072 literal. At least by Perl 5.18 this changed so as not to insert the binary 2073 zero. */ 2074 2075 if ((c = *ptr) >= CHAR_8) break; 2076 2077 /* Fall through with a digit less than 8 */ 2078 2079 /* \0 always starts an octal number, but we may drop through to here with a 2080 larger first octal digit. 
The original code used just to take the least 2081 significant 8 bits of octal numbers (I think this is what early Perls used 2082 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, 2083 but no more than 3 octal digits. */ 2084 2085 case CHAR_0: 2086 c -= CHAR_0; 2087 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) 2088 c = c * 8 + *(++ptr) - CHAR_0; 2089 #if PCRE2_CODE_UNIT_WIDTH == 8 2090 if (!utf && c > 0xff) *errorcodeptr = ERR51; 2091 #endif 2092 break; 2093 2094 /* \o is a relatively new Perl feature, supporting a more general way of 2095 specifying character codes in octal. The only supported form is \o{ddd}. */ 2096 2097 case CHAR_o: 2098 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else 2099 if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else 2100 { 2101 ptr += 2; 2102 c = 0; 2103 overflow = FALSE; 2104 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) 2105 { 2106 cc = *ptr++; 2107 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ 2108 #if PCRE2_CODE_UNIT_WIDTH == 32 2109 if (c >= 0x20000000l) { overflow = TRUE; break; } 2110 #endif 2111 c = (c << 3) + (cc - CHAR_0); 2112 #if PCRE2_CODE_UNIT_WIDTH == 8 2113 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } 2114 #elif PCRE2_CODE_UNIT_WIDTH == 16 2115 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } 2116 #elif PCRE2_CODE_UNIT_WIDTH == 32 2117 if (utf && c > 0x10ffffU) { overflow = TRUE; break; } 2118 #endif 2119 } 2120 if (overflow) 2121 { 2122 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; 2123 *errorcodeptr = ERR34; 2124 } 2125 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) 2126 { 2127 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; 2128 } 2129 else *errorcodeptr = ERR64; 2130 } 2131 break; 2132 2133 /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by 2134 two hexadecimal digits. Otherwise it is a lowercase x letter. 
*/ 2135 2136 case CHAR_x: 2137 if ((options & PCRE2_ALT_BSUX) != 0) 2138 { 2139 uint32_t xc; 2140 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ 2141 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ 2142 c = (cc << 4) | xc; 2143 ptr += 2; 2144 } /* End PCRE2_ALT_BSUX handling */ 2145 2146 /* Handle \x in Perl's style. \x{ddd} is a character number which can be 2147 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex 2148 digits. If not, { used to be treated as a data character. However, Perl 2149 seems to read hex digits up to the first non-such, and ignore the rest, so 2150 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE 2151 now gives an error. */ 2152 2153 else 2154 { 2155 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) 2156 { 2157 ptr += 2; 2158 if (*ptr == CHAR_RIGHT_CURLY_BRACKET) 2159 { 2160 *errorcodeptr = ERR78; 2161 break; 2162 } 2163 c = 0; 2164 overflow = FALSE; 2165 2166 while ((cc = XDIGIT(*ptr)) != 0xff) 2167 { 2168 ptr++; 2169 if (c == 0 && cc == 0) continue; /* Leading zeroes */ 2170 #if PCRE2_CODE_UNIT_WIDTH == 32 2171 if (c >= 0x10000000l) { overflow = TRUE; break; } 2172 #endif 2173 c = (c << 4) | cc; 2174 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) 2175 { 2176 overflow = TRUE; 2177 break; 2178 } 2179 } 2180 2181 if (overflow) 2182 { 2183 while (XDIGIT(*ptr) != 0xff) ptr++; 2184 *errorcodeptr = ERR34; 2185 } 2186 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) 2187 { 2188 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; 2189 } 2190 2191 /* If the sequence of hex digits does not end with '}', give an error. 2192 We used just to recognize this construct and fall through to the normal 2193 \x handling, but nowadays Perl gives an error, which seems much more 2194 sensible, so we do too. 
*/ 2195 2196 else *errorcodeptr = ERR67; 2197 } /* End of \x{} processing */ 2198 2199 /* Read a single-byte hex-defined char (up to two hex digits after \x) */ 2200 2201 else 2202 { 2203 c = 0; 2204 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ 2205 ptr++; 2206 c = cc; 2207 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ 2208 ptr++; 2209 c = (c << 4) | cc; 2210 } /* End of \xdd handling */ 2211 } /* End of Perl-style \x handling */ 2212 break; 2213 2214 /* The handling of \c is different in ASCII and EBCDIC environments. In an 2215 ASCII (or Unicode) environment, an error is given if the character 2216 following \c is not a printable ASCII character. Otherwise, the following 2217 character is upper-cased if it is a letter, and after that the 0x40 bit is 2218 flipped. The result is the value of the escape. 2219 2220 In an EBCDIC environment the handling of \c is compatible with the 2221 specification in the perlebcdic document. The following character must be 2222 a letter or one of small number of special characters. These provide a 2223 means of defining the character values 0-31. 2224 2225 For testing the EBCDIC handling of \c in an ASCII environment, recognize 2226 the EBCDIC value of 'c' explicitly. */ 2227 2228 #if defined EBCDIC && 'a' != 0x81 2229 case 0x83: 2230 #else 2231 case CHAR_c: 2232 #endif 2233 2234 c = *(++ptr); 2235 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); 2236 if (c == CHAR_NULL && ptr >= ptrend) 2237 { 2238 *errorcodeptr = ERR2; 2239 break; 2240 } 2241 2242 /* Handle \c in an ASCII/Unicode environment. */ 2243 2244 #ifndef EBCDIC /* ASCII/UTF-8 coding */ 2245 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ 2246 { 2247 *errorcodeptr = ERR68; 2248 break; 2249 } 2250 c ^= 0x40; 2251 2252 /* Handle \c in an EBCDIC environment. The special case \c? is converted to 2253 255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC 2254 encoding. 
(This is the way Perl indicates that it handles \c?.) The other 2255 valid sequences correspond to a list of specific characters. */ 2256 2257 #else 2258 if (c == CHAR_QUESTION_MARK) 2259 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; 2260 else 2261 { 2262 for (i = 0; i < 32; i++) 2263 { 2264 if (c == ebcdic_escape_c[i]) break; 2265 } 2266 if (i < 32) c = i; else *errorcodeptr = ERR68; 2267 } 2268 #endif /* EBCDIC */ 2269 2270 break; 2271 2272 /* Any other alphanumeric following \ is an error. Perl gives an error only 2273 if in warning mode, but PCRE doesn't have a warning mode. */ 2274 2275 default: 2276 *errorcodeptr = ERR3; 2277 break; 2278 } 2279 } 2280 2281 /* Perl supports \N{name} for character names, as well as plain \N for "not 2282 newline". PCRE does not support \N{name}. However, it does support 2283 quantification such as \N{2,3}. */ 2284 2285 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET && 2286 !is_counted_repeat(ptr+2)) 2287 *errorcodeptr = ERR37; 2288 2289 /* If PCRE2_UCP is set, we change the values for \d etc. */ 2290 2291 if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w) 2292 escape += (ESC_DU - ESC_D); 2293 2294 /* Set the pointer to the final character before returning. */ 2295 2296 *ptrptr = ptr; 2297 *chptr = c; 2298 return escape; 2299 } 2300 2301 2302 2303 #ifdef SUPPORT_UNICODE 2304 /************************************************* 2305 * Handle \P and \p * 2306 *************************************************/ 2307 2308 /* This function is called after \P or \p has been encountered, provided that 2309 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the 2310 contents of ptrptr are pointing at the P or p. On exit, it is left pointing at 2311 the final code unit of the escape sequence. 
2312 2313 Arguments: 2314 ptrptr the pattern position pointer 2315 negptr a boolean that is set TRUE for negation else FALSE 2316 ptypeptr an unsigned int that is set to the type value 2317 pdataptr an unsigned int that is set to the detailed property value 2318 errorcodeptr the error code variable 2319 cb the compile data 2320 2321 Returns: TRUE if the type value was found, or FALSE for an invalid type 2322 */ 2323 2324 static BOOL 2325 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr, 2326 unsigned int *pdataptr, int *errorcodeptr, compile_block *cb) 2327 { 2328 register PCRE2_UCHAR c; 2329 size_t i, bot, top; 2330 PCRE2_SPTR ptr = *ptrptr; 2331 PCRE2_UCHAR name[32]; 2332 2333 *negptr = FALSE; 2334 c = *(++ptr); 2335 2336 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for 2337 negation. */ 2338 2339 if (c == CHAR_LEFT_CURLY_BRACKET) 2340 { 2341 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) 2342 { 2343 *negptr = TRUE; 2344 ptr++; 2345 } 2346 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) 2347 { 2348 c = *(++ptr); 2349 if (c == CHAR_NULL) goto ERROR_RETURN; 2350 if (c == CHAR_RIGHT_CURLY_BRACKET) break; 2351 name[i] = c; 2352 } 2353 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; 2354 name[i] = 0; 2355 } 2356 2357 /* Otherwise there is just one following character, which must be an ASCII 2358 letter. */ 2359 2360 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) 2361 { 2362 name[0] = c; 2363 name[1] = 0; 2364 } 2365 else goto ERROR_RETURN; 2366 2367 *ptrptr = ptr; 2368 2369 /* Search for a recognized property name using binary chop. 
*/ 2370 2371 bot = 0; 2372 top = PRIV(utt_size); 2373 2374 while (bot < top) 2375 { 2376 int r; 2377 i = (bot + top) >> 1; 2378 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); 2379 if (r == 0) 2380 { 2381 *ptypeptr = PRIV(utt)[i].type; 2382 *pdataptr = PRIV(utt)[i].value; 2383 return TRUE; 2384 } 2385 if (r > 0) bot = i + 1; else top = i; 2386 } 2387 *errorcodeptr = ERR47; /* Unrecognized name */ 2388 return FALSE; 2389 2390 ERROR_RETURN: /* Malformed \P or \p */ 2391 *errorcodeptr = ERR46; 2392 *ptrptr = ptr; 2393 return FALSE; 2394 } 2395 #endif 2396 2397 2398 2399 /************************************************* 2400 * Read repeat counts * 2401 *************************************************/ 2402 2403 /* Read an item of the form {n,m} and return the values. This is called only 2404 after is_counted_repeat() has confirmed that a repeat-count quantifier exists, 2405 so the syntax is guaranteed to be correct, but we need to check the values. 2406 2407 Arguments: 2408 p pointer to first char after '{' 2409 minp pointer to int for min 2410 maxp pointer to int for max 2411 returned as -1 if no max 2412 errorcodeptr points to error code variable 2413 2414 Returns: pointer to '}' on success; 2415 current ptr on error, with errorcodeptr set non-zero 2416 */ 2417 2418 static PCRE2_SPTR 2419 read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr) 2420 { 2421 int min = 0; 2422 int max = -1; 2423 2424 while (IS_DIGIT(*p)) 2425 { 2426 min = min * 10 + (int)(*p++ - CHAR_0); 2427 if (min > 65535) 2428 { 2429 *errorcodeptr = ERR5; 2430 return p; 2431 } 2432 } 2433 2434 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else 2435 { 2436 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) 2437 { 2438 max = 0; 2439 while(IS_DIGIT(*p)) 2440 { 2441 max = max * 10 + (int)(*p++ - CHAR_0); 2442 if (max > 65535) 2443 { 2444 *errorcodeptr = ERR5; 2445 return p; 2446 } 2447 } 2448 if (max < min) 2449 { 2450 *errorcodeptr = ERR4; 2451 return p; 2452 } 2453 } 
2454 } 2455 2456 *minp = min; 2457 *maxp = max; 2458 return p; 2459 } 2460 2461 2462 2463 /************************************************* 2464 * Scan compiled regex for recursion reference * 2465 *************************************************/ 2466 2467 /* This function scans through a compiled pattern until it finds an instance of 2468 OP_RECURSE. 2469 2470 Arguments: 2471 code points to start of expression 2472 utf TRUE in UTF mode 2473 2474 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found 2475 */ 2476 2477 static PCRE2_SPTR 2478 find_recurse(PCRE2_SPTR code, BOOL utf) 2479 { 2480 for (;;) 2481 { 2482 register PCRE2_UCHAR c = *code; 2483 if (c == OP_END) return NULL; 2484 if (c == OP_RECURSE) return code; 2485 2486 /* XCLASS is used for classes that cannot be represented just by a bit map. 2487 This includes negated single high-valued characters. CALLOUT_STR is used for 2488 callouts with string arguments. In both cases the length in the table is 2489 zero; the actual length is stored in the compiled code. */ 2490 2491 if (c == OP_XCLASS) code += GET(code, 1); 2492 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); 2493 2494 /* Otherwise, we can get the item's length from the table, except that for 2495 repeated character types, we have to test for \p and \P, which have an extra 2496 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we 2497 must add in its length. 
*/ 2498 2499 else 2500 { 2501 switch(c) 2502 { 2503 case OP_TYPESTAR: 2504 case OP_TYPEMINSTAR: 2505 case OP_TYPEPLUS: 2506 case OP_TYPEMINPLUS: 2507 case OP_TYPEQUERY: 2508 case OP_TYPEMINQUERY: 2509 case OP_TYPEPOSSTAR: 2510 case OP_TYPEPOSPLUS: 2511 case OP_TYPEPOSQUERY: 2512 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 2513 break; 2514 2515 case OP_TYPEPOSUPTO: 2516 case OP_TYPEUPTO: 2517 case OP_TYPEMINUPTO: 2518 case OP_TYPEEXACT: 2519 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 2520 code += 2; 2521 break; 2522 2523 case OP_MARK: 2524 case OP_PRUNE_ARG: 2525 case OP_SKIP_ARG: 2526 case OP_THEN_ARG: 2527 code += code[1]; 2528 break; 2529 } 2530 2531 /* Add in the fixed length from the table */ 2532 2533 code += PRIV(OP_lengths)[c]; 2534 2535 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may 2536 be followed by a multi-unit character. The length in the table is a 2537 minimum, so we have to arrange to skip the extra units. 
*/ 2538 2539 #ifdef MAYBE_UTF_MULTI 2540 if (utf) switch(c) 2541 { 2542 case OP_CHAR: 2543 case OP_CHARI: 2544 case OP_NOT: 2545 case OP_NOTI: 2546 case OP_EXACT: 2547 case OP_EXACTI: 2548 case OP_NOTEXACT: 2549 case OP_NOTEXACTI: 2550 case OP_UPTO: 2551 case OP_UPTOI: 2552 case OP_NOTUPTO: 2553 case OP_NOTUPTOI: 2554 case OP_MINUPTO: 2555 case OP_MINUPTOI: 2556 case OP_NOTMINUPTO: 2557 case OP_NOTMINUPTOI: 2558 case OP_POSUPTO: 2559 case OP_POSUPTOI: 2560 case OP_NOTPOSUPTO: 2561 case OP_NOTPOSUPTOI: 2562 case OP_STAR: 2563 case OP_STARI: 2564 case OP_NOTSTAR: 2565 case OP_NOTSTARI: 2566 case OP_MINSTAR: 2567 case OP_MINSTARI: 2568 case OP_NOTMINSTAR: 2569 case OP_NOTMINSTARI: 2570 case OP_POSSTAR: 2571 case OP_POSSTARI: 2572 case OP_NOTPOSSTAR: 2573 case OP_NOTPOSSTARI: 2574 case OP_PLUS: 2575 case OP_PLUSI: 2576 case OP_NOTPLUS: 2577 case OP_NOTPLUSI: 2578 case OP_MINPLUS: 2579 case OP_MINPLUSI: 2580 case OP_NOTMINPLUS: 2581 case OP_NOTMINPLUSI: 2582 case OP_POSPLUS: 2583 case OP_POSPLUSI: 2584 case OP_NOTPOSPLUS: 2585 case OP_NOTPOSPLUSI: 2586 case OP_QUERY: 2587 case OP_QUERYI: 2588 case OP_NOTQUERY: 2589 case OP_NOTQUERYI: 2590 case OP_MINQUERY: 2591 case OP_MINQUERYI: 2592 case OP_NOTMINQUERY: 2593 case OP_NOTMINQUERYI: 2594 case OP_POSQUERY: 2595 case OP_POSQUERYI: 2596 case OP_NOTPOSQUERY: 2597 case OP_NOTPOSQUERYI: 2598 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); 2599 break; 2600 } 2601 #else 2602 (void)(utf); /* Keep compiler happy by referencing function argument */ 2603 #endif /* MAYBE_UTF_MULTI */ 2604 } 2605 } 2606 } 2607 2608 2609 2610 /************************************************* 2611 * Check for POSIX class syntax * 2612 *************************************************/ 2613 2614 /* This function is called when the sequence "[:" or "[." or "[=" is 2615 encountered in a character class. It checks whether this is followed by a 2616 sequence of characters terminated by a matching ":]" or ".]" or "=]". 
If we 2617 reach an unescaped ']' without the special preceding character, return FALSE. 2618 2619 Originally, this function only recognized a sequence of letters between the 2620 terminators, but it seems that Perl recognizes any sequence of characters, 2621 though of course unknown POSIX names are subsequently rejected. Perl gives an 2622 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE 2623 didn't consider this to be a POSIX class. Likewise for [:1234:]. 2624 2625 The problem in trying to be exactly like Perl is in the handling of escapes. We 2626 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX 2627 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code 2628 below handles the special cases \\ and \], but does not try to do any other 2629 escape processing. This makes it different from Perl for cases such as 2630 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does 2631 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes 2632 when Perl does, I think. 2633 2634 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. 2635 It seems that the appearance of a nested POSIX class supersedes an apparent 2636 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or 2637 a digit. This is handled by returning FALSE if the start of a new group with 2638 the same terminator is encountered, since the next closing sequence must close 2639 the nested group, not the outer one. 2640 2641 In Perl, unescaped square brackets may also appear as part of class names. For 2642 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for 2643 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not 2644 seem right at all. PCRE does not allow closing square brackets in POSIX class 2645 names. 
2646 2647 Arguments: 2648 ptr pointer to the initial [ 2649 endptr where to return a pointer to the terminating ':', '.', or '=' 2650 2651 Returns: TRUE or FALSE 2652 */ 2653 2654 static BOOL 2655 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr) 2656 { 2657 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ 2658 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ 2659 2660 for (++ptr; *ptr != CHAR_NULL; ptr++) 2661 { 2662 if (*ptr == CHAR_BACKSLASH && 2663 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH)) 2664 ptr++; 2665 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || 2666 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; 2667 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) 2668 { 2669 *endptr = ptr; 2670 return TRUE; 2671 } 2672 } 2673 2674 return FALSE; 2675 } 2676 2677 2678 2679 /************************************************* 2680 * Check POSIX class name * 2681 *************************************************/ 2682 2683 /* This function is called to check the name given in a POSIX-style class entry 2684 such as [:alnum:]. 2685 2686 Arguments: 2687 ptr points to the first letter 2688 len the length of the name 2689 2690 Returns: a value representing the name, or -1 if unknown 2691 */ 2692 2693 static int 2694 check_posix_name(PCRE2_SPTR ptr, int len) 2695 { 2696 const char *pn = posix_names; 2697 register int yield = 0; 2698 while (posix_name_lengths[yield] != 0) 2699 { 2700 if (len == posix_name_lengths[yield] && 2701 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; 2702 pn += posix_name_lengths[yield] + 1; 2703 yield++; 2704 } 2705 return -1; 2706 } 2707 2708 2709 2710 #ifdef SUPPORT_UNICODE 2711 /************************************************* 2712 * Get othercase range * 2713 *************************************************/ 2714 2715 /* This function is passed the start and end of a class range in UCT mode. 
It
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.

The caller is expected to call this repeatedly, passing the same cptr, until
-1 is returned. When -1 is returned, none of the output locations has been
written. When a positive value is returned, only *ocptr and *cptr are written.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has zero
or more than one other cases. The range is extended only while the other cases
are consecutive: "next" is the other case that the following character must
have for the run to continue. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */



/*************************************************
*        Add a character or range to a class     *
*************************************************/

/* This function packages up the logic of adding a character or range of
characters to a class. The character values in the arguments will be within the
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
mutually recursive with the function immediately below.

Characters below 256 are recorded by setting bits in classbits. Characters
above 255 are recorded, when SUPPORT_WIDE_CHARS is defined, by appending
XCL_RANGE or XCL_SINGLE items at *uchardptr.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options word
  cb            compile data
  start         start of range character
  end           end of range character

Returns:        the number of < 256 characters added
                  (this counts bit-setting operations, so a bit that is set
                  more than once is counted each time)
                the pointer to extra data is updated
*/

static unsigned int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
  compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;
uint32_t classbits_end = (end <= 0xff ? end : 0xff);  /* Last bit map entry */
unsigned int n8 = 0;

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */

if ((options & PCRE2_CASELESS) != 0)
  {
#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    int rc;
    uint32_t oc, od;

    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
    c = start;

    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
      {
      /* Handle a single character that has more than one other case. The
      character itself (in oc) is passed as the one to leave out of the list. */

      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
        PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= start && od <= end) continue;

      /* Extend the original range if there is overlap, noting that if oc < c, we
      can't have od > end because a subrange is always shorter than the basic
      range. Otherwise, use a recursive call to add the additional range. */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        /* Keep the bit map limit in step with the new end, capped at 0xff */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF mode. This loop is the body of the "else" above when
  SUPPORT_UNICODE is defined, and is unconditional otherwise. It sets the bit
  for the entry in the cb->fcc table for each character below 256 in the
  range. */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cb->fcc[c]);
    n8++;
    }
  }

/* Now handle the original range. Adjust the final value according to the bit
length - this means that the same lists of (e.g.) horizontal spaces can be used
in all cases. */

if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
  end = MAX_NON_UTF_CHAR;

/* Use the bitmap for characters < 256. Otherwise use extra data. */

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255, because classbits_end is
  never greater than 0xff. */
  SETBIT(classbits, c);
  n8++;
  }

#ifdef SUPPORT_WIDE_CHARS
/* What remains, if anything, is the part of the range above 255. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  PCRE2_UCHAR *uchardata = *uchardptr;

#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. In the 8-bit library
  the empty block below serves as the body of the "else" above. */

#if PCRE2_CODE_UNIT_WIDTH == 8
    {}
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif
  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#else
  (void)uchardptr;          /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */

return n8;    /* Number of 8-bit characters */
}



/*************************************************
*      Add a list of characters to a class       *
*************************************************/

/* This function is used for adding a list of case-equivalent characters to a
class, and also for adding a list of horizontal or vertical whitespace. If the
list is in order (which it should be), ranges of characters are detected and
handled appropriately. This function is mutually recursive with the function
above.
2936 2937 Arguments: 2938 classbits the bit map for characters < 256 2939 uchardptr points to the pointer for extra data 2940 options the options word 2941 cb contains pointers to tables etc. 2942 p points to row of 32-bit values, terminated by NOTACHAR 2943 except character to omit; this is used when adding lists of 2944 case-equivalent characters to avoid including the one we 2945 already know about 2946 2947 Returns: the number of < 256 characters added 2948 the pointer to extra data is updated 2949 */ 2950 2951 static unsigned int 2952 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, 2953 compile_block *cb, const uint32_t *p, unsigned int except) 2954 { 2955 unsigned int n8 = 0; 2956 while (p[0] < NOTACHAR) 2957 { 2958 unsigned int n = 0; 2959 if (p[0] != except) 2960 { 2961 while(p[n+1] == p[0] + n + 1) n++; 2962 n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]); 2963 } 2964 p += n + 1; 2965 } 2966 return n8; 2967 } 2968 2969 2970 2971 /************************************************* 2972 * Add characters not in a list to a class * 2973 *************************************************/ 2974 2975 /* This function is used for adding the complement of a list of horizontal or 2976 vertical whitespace to a class. The list must be in order. 2977 2978 Arguments: 2979 classbits the bit map for characters < 256 2980 uchardptr points to the pointer for extra data 2981 options the options word 2982 cb contains pointers to tables etc. 
2983 p points to row of 32-bit values, terminated by NOTACHAR 2984 2985 Returns: the number of < 256 characters added 2986 the pointer to extra data is updated 2987 */ 2988 2989 static unsigned int 2990 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, 2991 uint32_t options, compile_block *cb, const uint32_t *p) 2992 { 2993 BOOL utf = (options & PCRE2_UTF) != 0; 2994 unsigned int n8 = 0; 2995 if (p[0] > 0) 2996 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); 2997 while (p[0] < NOTACHAR) 2998 { 2999 while (p[1] == p[0] + 1) p++; 3000 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, 3001 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); 3002 p++; 3003 } 3004 return n8; 3005 } 3006 3007 3008 3009 /************************************************* 3010 * Process (*VERB) name for escapes * 3011 *************************************************/ 3012 3013 /* This function is called when the PCRE2_ALT_VERBNAMES option is set, to 3014 process the characters in a verb's name argument. It is called twice, once with 3015 codeptr == NULL, to find out the length of the processed name, and again to put 3016 the name into memory. 3017 3018 Arguments: 3019 ptrptr pointer to the input pointer 3020 codeptr pointer to the compiled code pointer 3021 errorcodeptr pointer to the error code 3022 options the options bits 3023 utf TRUE if processing UTF 3024 cb compile data block 3025 3026 Returns: length of the processed name, or < 0 on error 3027 */ 3028 3029 static int 3030 process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr, 3031 uint32_t options, BOOL utf, compile_block *cb) 3032 { 3033 int32_t arglen = 0; 3034 BOOL inescq = FALSE; 3035 PCRE2_SPTR ptr = *ptrptr; 3036 PCRE2_UCHAR *code = (codeptr == NULL)? 
NULL : *codeptr; 3037 3038 for (; ptr < cb->end_pattern; ptr++) 3039 { 3040 uint32_t x = *ptr; 3041 3042 /* Skip over literals */ 3043 3044 if (inescq) 3045 { 3046 if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E) 3047 { 3048 inescq = FALSE; 3049 ptr++;; 3050 continue; 3051 } 3052 } 3053 3054 else /* Not a literal character */ 3055 { 3056 if (x == CHAR_RIGHT_PARENTHESIS) break; 3057 3058 /* Skip over comments and whitespace in extended mode. */ 3059 3060 if ((options & PCRE2_EXTENDED) != 0) 3061 { 3062 PCRE2_SPTR wscptr = ptr; 3063 while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr); 3064 if (x == CHAR_NUMBER_SIGN) 3065 { 3066 ptr++; 3067 while (*ptr != CHAR_NULL || ptr < cb->end_pattern) 3068 { 3069 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ 3070 { /* IS_NEWLINE sets cb->nllen. */ 3071 ptr += cb->nllen; 3072 break; 3073 } 3074 ptr++; 3075 #ifdef SUPPORT_UNICODE 3076 if (utf) FORWARDCHAR(ptr); 3077 #endif 3078 } 3079 } 3080 3081 /* If we have skipped any characters, restart the loop. */ 3082 3083 if (ptr > wscptr) 3084 { 3085 ptr--; 3086 continue; 3087 } 3088 } 3089 3090 /* Process escapes */ 3091 3092 if (x == '\\') 3093 { 3094 int rc; 3095 *errorcodeptr = 0; 3096 rc = PRIV(check_escape)(&ptr, cb->end_pattern, &x, errorcodeptr, options, 3097 FALSE, cb); 3098 *ptrptr = ptr; /* For possible error */ 3099 if (*errorcodeptr != 0) return -1; 3100 if (rc != 0) 3101 { 3102 if (rc == ESC_Q) 3103 { 3104 inescq = TRUE; 3105 continue; 3106 } 3107 if (rc == ESC_E) continue; 3108 *errorcodeptr = ERR40; 3109 return -1; 3110 } 3111 } 3112 } 3113 3114 /* We have the next character in the name. 
*/ 3115 3116 #ifdef SUPPORT_UNICODE 3117 if (utf) 3118 { 3119 if (code == NULL) /* Just want the length */ 3120 { 3121 #if PCRE2_CODE_UNIT_WIDTH == 8 3122 int i; 3123 for (i = 0; i < PRIV(utf8_table1_size); i++) 3124 if ((int)x <= PRIV(utf8_table1)[i]) break; 3125 arglen += i; 3126 #elif PCRE2_CODE_UNIT_WIDTH == 16 3127 if (x > 0xffff) arglen++; 3128 #endif 3129 } 3130 else 3131 { 3132 PCRE2_UCHAR cbuff[8]; 3133 x = PRIV(ord2utf)(x, cbuff); 3134 memcpy(code, cbuff, CU2BYTES(x)); 3135 code += x; 3136 } 3137 } 3138 else 3139 #endif /* SUPPORT_UNICODE */ 3140 3141 /* Not UTF */ 3142 { 3143 if (code != NULL) *code++ = (PCRE2_UCHAR)x; 3144 } 3145 3146 arglen++; 3147 3148 if ((unsigned int)arglen > MAX_MARK) 3149 { 3150 *errorcodeptr = ERR76; 3151 *ptrptr = ptr; 3152 return -1; 3153 } 3154 } 3155 3156 /* Update the pointers before returning. */ 3157 3158 *ptrptr = ptr; 3159 if (codeptr != NULL) *codeptr = code; 3160 return arglen; 3161 } 3162 3163 3164 3165 /************************************************* 3166 * Macro for the next two functions * 3167 *************************************************/ 3168 3169 /* Both scan_for_captures() and compile_branch() use this macro to generate a 3170 fragment of code that reads the characters of a name and sets its length 3171 (checking for not being too long). Count the characters dynamically, to avoid 3172 the possibility of integer overflow. The same macro is used for reading *VERB 3173 names. 
*/

/* The macro relies on names in the expanding function: the variables ptr,
namelen and cb, and a label FAILED. Its arguments are the ctype bit(s) that a
name character must have, the error number to give when the name is longer
than MAX_NAME_SIZE, and the lvalue that receives that error number. The
expansion is an assignment followed by a while statement and is not wrapped in
do { } while (0), so it must not be used as the body of an unbraced if or
else. */

#define READ_NAME(ctype, errno, errset)                      \
  namelen = 0;                                               \
  while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype) != 0)   \
    {                                                        \
    ptr++;                                                   \
    namelen++;                                               \
    if (namelen > MAX_NAME_SIZE)                             \
      {                                                      \
      errset = errno;                                        \
      goto FAILED;                                           \
      }                                                      \
    }



/*************************************************
*      Scan regex to identify named groups       *
*************************************************/

/* This function is called first of all, to scan for named capturing groups so
that information about them is fully available to both the compiling scans.
It skips over everything except parenthesized items.

As it goes, it updates the compile block: cb->bracount is advanced for each
capturing group (and copied to cb->final_bracount on success), each new group
name is appended to cb->named_groups (growing the list when it is full),
cb->names_found and cb->name_entry_size are kept up to date, cb->dupnames is
set when duplicate names are seen, and PCRE2_DUPCAPUSED / PCRE2_JCHANGED are
set in cb->external_flags for (?| and (?J respectively. The start of
cb->start_workspace is used as a stack of nest_save blocks.

Arguments:
  ptrptr     points to pointer to the start of the pattern
  options    compiling dynamic options
  cb         pointer to the compile data block

Returns:     zero on success or a non-zero error code, with pointer updated
             (*ptrptr is written only on the error path; on success it is
             left as it was)
*/

/* One entry on the nest stack. An entry is created for each (?| group and for
each option setting; it records the options in force beforehand so that they
can be restored at the matching closing parenthesis. */

typedef struct nest_save {
  uint16_t  nest_depth;    /* Parenthesis depth the entry belongs to */
  uint16_t  reset_group;   /* For (?|: capture count at the start of the group */
  uint16_t  max_group;     /* For (?|: highest capture count in any branch */
  uint16_t  flags;         /* NSF_xxx bits */
} nest_save;

#define NSF_RESET    0x0001u   /* Entry is for a (?| group */
#define NSF_EXTENDED 0x0002u   /* PCRE2_EXTENDED was set when entry was made */
#define NSF_DUPNAMES 0x0004u   /* PCRE2_DUPNAMES was set when entry was made */

static int scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options,
  compile_block *cb)
{
uint32_t c;
uint32_t delimiter;
uint32_t set, unset, *optset;
uint32_t skiptoket = 0;
uint16_t nest_depth = 0;
int errorcode = 0;
int escape;
int namelen;
int i;
BOOL inescq = FALSE;
BOOL isdupname;
BOOL utf = (options & PCRE2_UTF) != 0;
BOOL negate_class;
PCRE2_SPTR name;
PCRE2_SPTR start;
PCRE2_SPTR ptr = *ptrptr;
named_group *ng;
nest_save *top_nest = NULL;
nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);

/* The size of the nest_save structure might not be a factor of the size of the
workspace. Therefore we must round down end_nests so as to correctly avoid
creating a nest_save that spans the end of the workspace. */

end_nests = (nest_save *)((char *)end_nests -
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));

/* Now scan the pattern */

for (; ptr < cb->end_pattern; ptr++)
  {
  c = *ptr;

  /* Parenthesized groups set skiptoket when all following characters up to the
  next closing parenthesis must be ignored. The parenthesis itself must be
  processed (to end the nested parenthesized item). */

  if (skiptoket != 0)
    {
    if (c != CHAR_RIGHT_PARENTHESIS) continue;
    skiptoket = 0;
    }

  /* Skip over literals */

  if (inescq)
    {
    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
      {
      inescq = FALSE;
      ptr++;
      }
    continue;
    }

  /* Skip over # comments and whitespace in extended mode. */

  if ((options & PCRE2_EXTENDED) != 0)
    {
    PCRE2_SPTR wscptr = ptr;
    while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
    if (c == CHAR_NUMBER_SIGN)
      {
      ptr++;
      while (ptr < cb->end_pattern)
        {
        if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
          {                          /* IS_NEWLINE sets cb->nllen. */
          ptr += cb->nllen;
          break;
          }
        ptr++;
#ifdef SUPPORT_UNICODE
        if (utf) FORWARDCHAR(ptr);
#endif
        }
      }

    /* If we skipped any characters, restart the loop. Otherwise, we didn't see
    a comment. */

    if (ptr > wscptr)
      {
      ptr--;     /* The loop increment moves forward again */
      continue;
      }
    }

  /* Process the next pattern item. */

  switch(c)
    {
    default:              /* Most characters are just skipped */
    break;

    /* Skip escapes except for \Q */

    case CHAR_BACKSLASH:
    errorcode = 0;
    escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode, options,
      FALSE, cb);
    if (errorcode != 0) goto FAILED;
    if (escape == ESC_Q) inescq = TRUE;
    break;

    /* Skip a character class. The syntax is complicated so we have to
    replicate some of what happens when a class is processed for real. */

    case CHAR_LEFT_SQUARE_BRACKET:
    if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0 ||
        PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
      {
      ptr += 6;
      break;
      }

    /* If the first character is '^', set the negation flag (not actually used
    here, except to recognize only one ^) and skip it. If the first few
    characters (either before or after ^) are \Q\E or \E we skip them too. This
    makes for compatibility with Perl. */

    negate_class = FALSE;
    for (;;)
      {
      c = *(++ptr);   /* First character in class */
      if (c == CHAR_BACKSLASH)
        {
        if (ptr[1] == CHAR_E)
          ptr++;
        else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
          ptr += 3;
        else
          break;
        }
      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
        negate_class = TRUE;
      else break;
      }

    /* With PCRE2_ALLOW_EMPTY_CLASS an immediate ']' ends the class; otherwise
    it is treated below as the first (data) character of the class. */

    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
        (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
      break;

    /* Loop for the contents of the class */

    for (;;)
      {
      PCRE2_SPTR tempptr;

      if (c == CHAR_NULL && ptr >= cb->end_pattern)
        {
        errorcode = ERR6;  /* Missing terminating ']' */
        goto FAILED;
        }

#ifdef SUPPORT_UNICODE
      if (utf && HAS_EXTRALEN(c))
        {                           /* Braces are required because the */
        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
        }
#endif

      /* Inside \Q...\E everything is literal except \E */

      if (inescq)
        {
        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
          {
          inescq = FALSE;                             /* Reset literal state */
          ptr++;                                      /* Skip the 'E' */
          }
        goto CONTINUE_CLASS;
        }

      /* Skip POSIX class names. */
      if (c == CHAR_LEFT_SQUARE_BRACKET &&
          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
        {
        ptr = tempptr + 1;
        }
      else if (c == CHAR_BACKSLASH)
        {
        errorcode = 0;
        escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode,
          options, TRUE, cb);
        if (errorcode != 0) goto FAILED;
        if (escape == ESC_Q) inescq = TRUE;
        }

      CONTINUE_CLASS:
      c = *(++ptr);
      if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
      }     /* End of class-processing loop */
    break;

    /* This is the real work of this function - handling parentheses. */

    case CHAR_LEFT_PARENTHESIS:
    nest_depth++;

    if (ptr[1] != CHAR_QUESTION_MARK)
      {
      /* A plain group: it captures unless PCRE2_NO_AUTO_CAPTURE is set. */

      if (ptr[1] != CHAR_ASTERISK)
        {
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) cb->bracount++;
        }

      /* (*something) - skip over a name, and then just skip to closing ket
      unless PCRE2_ALT_VERBNAMES is set, in which case we have to process
      escapes in the string after a verb name terminated by a colon. */

      else
        {
        ptr += 2;
        while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++;
        if (*ptr == CHAR_COLON && (options & PCRE2_ALT_VERBNAMES) != 0)
          {
          ptr++;
          if (process_verb_name(&ptr, NULL, &errorcode, options, utf, cb) < 0)
            goto FAILED;
          }
        else
          {
          while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
            ptr++;
          }
        nest_depth--;   /* The whole item has been passed here; undo the ++ */
        }
      }

    /* Handle (?...) groups */

    else switch(ptr[2])
      {
      default:
      ptr += 2;
      if (ptr[0] == CHAR_R ||                           /* (?R) */
          ptr[0] == CHAR_NUMBER_SIGN ||                 /* (?#) */
          IS_DIGIT(ptr[0]) ||                           /* (?n) */
          (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1])))   /* (?-n) */
        {
        skiptoket = ptr[0];   /* Non-zero; also remembered for the final error */
        break;
        }

      /* Handle (?| and (?imsxJU: which are the only other valid forms. Both
      need a new block on the nest stack. */

      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
      else if (++top_nest >= end_nests)
        {
        errorcode = ERR84;
        goto FAILED;
        }
      top_nest->nest_depth = nest_depth;
      top_nest->flags = 0;
      if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED;
      if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES;

      if (*ptr == CHAR_VERTICAL_LINE)
        {
        top_nest->reset_group = (uint16_t)cb->bracount;
        top_nest->max_group = (uint16_t)cb->bracount;
        top_nest->flags |= NSF_RESET;
        cb->external_flags |= PCRE2_DUPCAPUSED;
        break;
        }

      /* Scan options */

      top_nest->reset_group = 0;
      top_nest->max_group = 0;

      set = unset = 0;
      optset = &set;

      /* Need only track (?x: and (?J: at this stage */

      while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
        {
        switch (*ptr++)
          {
          case CHAR_MINUS: optset = &unset; break;

          case CHAR_x: *optset |= PCRE2_EXTENDED; break;

          case CHAR_J:
          *optset |= PCRE2_DUPNAMES;
          cb->external_flags |= PCRE2_JCHANGED;
          break;

          case CHAR_i:
          case CHAR_m:
          case CHAR_s:
          case CHAR_U:
          break;

          default:
          errorcode = ERR11;
          ptr--;    /* Correct the offset */
          goto FAILED;
          }
        }

      options = (options | set) & (~unset);

      /* If the options ended with ')' this is not the start of a nested
      group with option changes, so the options change at this level. If the
      previous level set up a nest block, discard the one we have just created.
      Otherwise adjust it for the previous level. */

      if (*ptr == CHAR_RIGHT_PARENTHESIS)
        {
        nest_depth--;
        if (top_nest > (nest_save *)(cb->start_workspace) &&
            (top_nest-1)->nest_depth == nest_depth) top_nest --;
        else top_nest->nest_depth = nest_depth;
        }
      break;

      /* Skip over a numerical or string argument for a callout. */

      case CHAR_C:
      ptr += 2;
      if (ptr[1] == CHAR_RIGHT_PARENTHESIS) break;
      if (IS_DIGIT(ptr[1]))
        {
        while (IS_DIGIT(ptr[1])) ptr++;
        }

      /* Handle a string argument */

      else
        {
        ptr++;
        delimiter = 0;
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
          {
          if (*ptr == PRIV(callout_start_delims)[i])
            {
            delimiter = PRIV(callout_end_delims)[i];
            break;
            }
          }

        if (delimiter == 0)
          {
          errorcode = ERR82;
          goto FAILED;
          }

        /* Find the closing delimiter; a doubled delimiter is passed over as
        part of the string. */

        start = ptr;
        do
          {
          if (++ptr >= cb->end_pattern)
            {
            errorcode = ERR81;
            ptr = start;   /* To give a more useful message */
            goto FAILED;
            }
          if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
          }
        while (ptr[0] != delimiter);
        }

      /* Check terminating ) */

      if (ptr[1] != CHAR_RIGHT_PARENTHESIS)
        {
        errorcode = ERR39;
        ptr++;
        goto FAILED;
        }
      break;

      /* Conditional group */

      case CHAR_LEFT_PARENTHESIS:
      if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
        {
        /* The condition's own parenthesis is counted as a nesting level here
        and stepped over, so that its closing ket balances. */

        nest_depth++;
        ptr += 2;
        break;
        }

      /* Must be an assertion or a callout */

      switch(ptr[4])
        {
        case CHAR_LESS_THAN_SIGN:
        if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
          goto MISSING_ASSERTION;
        /* Fall through */

        /* Advance only to the '?'; the loop increment then lands on the inner
        '(' which is processed as a group start on the next iteration. */

        case CHAR_C:
        case CHAR_EXCLAMATION_MARK:
        case CHAR_EQUALS_SIGN:
        ptr++;
        break;

        default:
        MISSING_ASSERTION:
        ptr += 3;            /* To improve error message */
        errorcode = ERR28;
        goto FAILED;
        }
      break;

      /* Items that start with (? followed by one of these characters need no
      further processing here; just step past the two characters. */

      case CHAR_COLON:
      case CHAR_GREATER_THAN_SIGN:
      case CHAR_EQUALS_SIGN:
      case CHAR_EXCLAMATION_MARK:
      case CHAR_AMPERSAND:
      case CHAR_PLUS:
      ptr += 2;
      break;

      /* (?P< defines a name; any other (?P item is just stepped past. */

      case CHAR_P:
      if (ptr[3] != CHAR_LESS_THAN_SIGN)
        {
        ptr += 3;
        break;
        }
      ptr++;
      c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
      goto DEFINE_NAME;

      /* (?<= and (?<! are stepped past; otherwise (?< defines a name. */

      case CHAR_LESS_THAN_SIGN:
      if (ptr[3] == CHAR_EQUALS_SIGN || ptr[3] == CHAR_EXCLAMATION_MARK)
        {
        ptr += 3;
        break;
        }
      c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
      goto DEFINE_NAME;

      case CHAR_APOSTROPHE:
      c = CHAR_APOSTROPHE;    /* Terminator */

      /* Come here with c holding the character that must terminate the
      name. */

      DEFINE_NAME:
      name = ptr = ptr + 3;

      if (*ptr == c)          /* Empty name */
        {
        errorcode = ERR62;
        goto FAILED;
        }

      if (IS_DIGIT(*ptr))
        {
        errorcode = ERR44;   /* Group name must start with non-digit */
        goto FAILED;
        }

      if (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) == 0)
        {
        errorcode = ERR24;
        goto FAILED;
        }

      /* Advance ptr, set namelen and check its length. */
      READ_NAME(ctype_word, ERR48, errorcode);

      if (*ptr != c)
        {
        errorcode = ERR42;
        goto FAILED;
        }

      if (cb->names_found >= MAX_NAME_COUNT)
        {
        errorcode = ERR49;
        goto FAILED;
        }

      /* Keep track of the largest name entry size seen so far. */

      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);

      /* We have a valid name for this capturing group. */

      cb->bracount++;

      /* Scan the list to check for duplicates. For duplicate names, if the
      number is the same, break the loop, which causes the name to be
      discarded; otherwise, if DUPNAMES is not set, give an error.
      If it is set, allow the name with a different number, but continue
      scanning in case this is a duplicate with the same number. For
      non-duplicate names, give an error if the number is duplicated. */

      isdupname = FALSE;
      ng = cb->named_groups;
      for (i = 0; i < cb->names_found; i++, ng++)
        {
        if (namelen == ng->length &&
            PRIV(strncmp)(name, ng->name, (size_t)namelen) == 0)
          {
          if (ng->number == cb->bracount) break;
          if ((options & PCRE2_DUPNAMES) == 0)
            {
            errorcode = ERR43;
            goto FAILED;
            }
          isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
          cb->dupnames = TRUE;              /* Duplicate names exist */
          }
        else if (ng->number == cb->bracount)
          {
          errorcode = ERR65;
          goto FAILED;
          }
        }

      if (i < cb->names_found) break;   /* Ignore duplicate with same number */

      /* Increase the list size if necessary */

      if (cb->names_found >= cb->named_group_list_size)
        {
        uint32_t newsize = cb->named_group_list_size * 2;
        named_group *newspace =
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
          cb->cx->memctl.memory_data);
        if (newspace == NULL)
          {
          errorcode = ERR21;
          goto FAILED;
          }

        /* Copy the existing entries. The old list is freed only when its size
        is greater than NAMED_GROUP_LIST_SIZE (presumably the initial list is
        not heap memory - confirm against the caller). */

        memcpy(newspace, cb->named_groups,
          cb->named_group_list_size * sizeof(named_group));
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
          cb->cx->memctl.free((void *)cb->named_groups,
          cb->cx->memctl.memory_data);
        cb->named_groups = newspace;
        cb->named_group_list_size = newsize;
        }

      /* Add this name to the list */

      cb->named_groups[cb->names_found].name = name;
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
      cb->named_groups[cb->names_found].number = cb->bracount;
      cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
      cb->names_found++;
      break;
      }        /* End of (? switch */
    break;     /* End of ( handling */

    /* At an alternation, reset the capture count if we are in a (?| group. */

    case CHAR_VERTICAL_LINE:
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
        (top_nest->flags & NSF_RESET) != 0)
      {
      if (cb->bracount > top_nest->max_group)
        top_nest->max_group = (uint16_t)cb->bracount;
      cb->bracount = top_nest->reset_group;
      }
    break;

    /* At a right parenthesis, reset the capture count to the maximum if we
    are in a (?| group and/or reset the extended option. */

    case CHAR_RIGHT_PARENTHESIS:
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
      {
      if ((top_nest->flags & NSF_RESET) != 0 &&
          top_nest->max_group > cb->bracount)
        cb->bracount = top_nest->max_group;
      if ((top_nest->flags & NSF_EXTENDED) != 0) options |= PCRE2_EXTENDED;
        else options &= ~PCRE2_EXTENDED;
      if ((top_nest->flags & NSF_DUPNAMES) != 0) options |= PCRE2_DUPNAMES;
        else options &= ~PCRE2_DUPNAMES;
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
        else top_nest--;
      }
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
      {
      errorcode = ERR22;
      goto FAILED;
      }
    nest_depth--;
    break;
    }
  }

/* The whole pattern has been scanned. If every group was closed, record the
final capture count and return success without touching *ptrptr. */

if (nest_depth == 0)
  {
  cb->final_bracount = cb->bracount;
  return 0;
  }

/* We give a special error for a missing closing parentheses after (?# because
it might otherwise be hard to see where the missing character is. */

errorcode = (skiptoket == CHAR_NUMBER_SIGN)? ERR18 : ERR14;

FAILED:
*ptrptr = ptr;
return errorcode;
}



/*************************************************
*           Compile one branch                   *
*************************************************/

/* Scan the pattern, compiling it into a vector. If the options are
changed during the branch, the pointer is used to change the external options
bits.
This function is used during the pre-compile phase when we are trying 3832 to find out the amount of memory needed, as well as during the real compile 3833 phase. The value of lengthptr distinguishes the two phases. 3834 3835 Arguments: 3836 optionsptr pointer to the option bits 3837 codeptr points to the pointer to the current code point 3838 ptrptr points to the current pattern pointer 3839 errorcodeptr points to error code variable 3840 firstcuptr place to put the first required code unit 3841 firstcuflagsptr place to put the first code unit flags, or a negative number 3842 reqcuptr place to put the last required code unit 3843 reqcuflagsptr place to put the last required code unit flags, or a negative number 3844 bcptr points to current branch chain 3845 cond_depth conditional nesting depth 3846 cb contains pointers to tables etc. 3847 lengthptr NULL during the real compile phase 3848 points to length accumulator during pre-compile phase 3849 3850 Returns: TRUE on success 3851 FALSE, with *errorcodeptr set non-zero on error 3852 */ 3853 3854 static BOOL 3855 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, 3856 PCRE2_SPTR *ptrptr, int *errorcodeptr, 3857 uint32_t *firstcuptr, int32_t *firstcuflagsptr, 3858 uint32_t *reqcuptr, int32_t *reqcuflagsptr, 3859 branch_chain *bcptr, int cond_depth, 3860 compile_block *cb, size_t *lengthptr) 3861 { 3862 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ 3863 int bravalue = 0; 3864 uint32_t greedy_default, greedy_non_default; 3865 uint32_t repeat_type, op_type; 3866 uint32_t options = *optionsptr; /* May change dynamically */ 3867 uint32_t firstcu, reqcu; 3868 int32_t firstcuflags, reqcuflags; 3869 uint32_t zeroreqcu, zerofirstcu; 3870 int32_t zeroreqcuflags, zerofirstcuflags; 3871 int32_t req_caseopt, reqvary, tempreqvary; 3872 int after_manual_callout = 0; 3873 int escape; 3874 size_t length_prevgroup = 0; 3875 register uint32_t c; 3876 register PCRE2_UCHAR *code = *codeptr; 3877 PCRE2_UCHAR 
*last_code = code; 3878 PCRE2_UCHAR *orig_code = code; 3879 PCRE2_UCHAR *tempcode; 3880 BOOL inescq = FALSE; 3881 BOOL groupsetfirstcu = FALSE; 3882 PCRE2_SPTR ptr = *ptrptr; 3883 PCRE2_SPTR tempptr; 3884 PCRE2_UCHAR *previous = NULL; 3885 PCRE2_UCHAR *previous_callout = NULL; 3886 uint8_t classbits[32]; 3887 3888 /* We can fish out the UTF setting once and for all into a BOOL, but we must 3889 not do this for other options (e.g. PCRE2_EXTENDED) because they may change 3890 dynamically as we process the pattern. */ 3891 3892 #ifdef SUPPORT_UNICODE 3893 BOOL utf = (options & PCRE2_UTF) != 0; 3894 #if PCRE2_CODE_UNIT_WIDTH != 32 3895 PCRE2_UCHAR utf_units[6]; /* For setting up multi-cu chars */ 3896 #endif 3897 3898 #else /* No UTF support */ 3899 BOOL utf = FALSE; 3900 #endif 3901 3902 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define 3903 class_uchardata always so that it can be passed to add_to_class() always, 3904 though it will not be used in non-UTF 8-bit cases. This avoids having to supply 3905 alternative calls for the different cases. */ 3906 3907 PCRE2_UCHAR *class_uchardata; 3908 #ifdef SUPPORT_WIDE_CHARS 3909 BOOL xclass; 3910 PCRE2_UCHAR *class_uchardata_base; 3911 #endif 3912 3913 /* Set up the default and non-default settings for greediness */ 3914 3915 greedy_default = ((options & PCRE2_UNGREEDY) != 0); 3916 greedy_non_default = greedy_default ^ 1; 3917 3918 /* Initialize no first unit, no required unit. REQ_UNSET means "no char 3919 matching encountered yet". It gets changed to REQ_NONE if we hit something that 3920 matches a non-fixed first unit; reqcu just remains unset if we never find one. 3921 3922 When we hit a repeat whose minimum is zero, we may have to adjust these values 3923 to take the zero repeat into account. This is implemented by setting them to 3924 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual 3925 item types that can be repeated set these backoff variables appropriately. 
*/ 3926 3927 firstcu = reqcu = zerofirstcu = zeroreqcu = 0; 3928 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET; 3929 3930 /* The variable req_caseopt contains either the REQ_CASELESS value or zero, 3931 according to the current setting of the caseless flag. The REQ_CASELESS value 3932 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables 3933 to record the case status of the value. This is used only for ASCII characters. 3934 */ 3935 3936 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0; 3937 3938 /* Switch on next character until the end of the branch */ 3939 3940 for (;; ptr++) 3941 { 3942 BOOL negate_class; 3943 BOOL should_flip_negation; 3944 BOOL match_all_or_no_wide_chars; 3945 BOOL possessive_quantifier; 3946 BOOL is_quantifier; 3947 BOOL is_recurse; 3948 BOOL is_dupname; 3949 BOOL reset_bracount; 3950 int class_has_8bitchar; 3951 int class_one_char; 3952 #ifdef SUPPORT_WIDE_CHARS 3953 BOOL xclass_has_prop; 3954 #endif 3955 int recno; /* Must be signed */ 3956 int refsign; /* Must be signed */ 3957 int terminator; /* Must be signed */ 3958 unsigned int mclength; 3959 unsigned int tempbracount; 3960 uint32_t ec; 3961 uint32_t newoptions; 3962 uint32_t skipunits; 3963 uint32_t subreqcu, subfirstcu; 3964 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */ 3965 PCRE2_UCHAR mcbuffer[8]; 3966 3967 /* Come here to restart the loop. */ 3968 3969 REDO_LOOP: 3970 3971 /* Get next character in the pattern */ 3972 3973 c = *ptr; 3974 3975 /* If we are at the end of a nested substitution, revert to the outer level 3976 string. Nesting only happens one or two levels deep, and the inserted string 3977 is always zero terminated. 
*/ 3978 3979 if (c == CHAR_NULL && cb->nestptr[0] != NULL) 3980 { 3981 ptr = cb->nestptr[0]; 3982 cb->nestptr[0] = cb->nestptr[1]; 3983 cb->nestptr[1] = NULL; 3984 c = *ptr; 3985 } 3986 3987 /* If we are in the pre-compile phase, accumulate the length used for the 3988 previous cycle of this loop. */ 3989 3990 if (lengthptr != NULL) 3991 { 3992 if (code > cb->start_workspace + cb->workspace_size - 3993 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ 3994 { 3995 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)? 3996 ERR52 : ERR86; 3997 goto FAILED; 3998 } 3999 4000 /* There is at least one situation where code goes backwards: this is the 4001 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, 4002 the class is simply eliminated. However, it is created first, so we have to 4003 allow memory for it. Therefore, don't ever reduce the length at this point. 4004 */ 4005 4006 if (code < last_code) code = last_code; 4007 4008 /* Paranoid check for integer overflow */ 4009 4010 if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code)) 4011 { 4012 *errorcodeptr = ERR20; 4013 goto FAILED; 4014 } 4015 *lengthptr += (size_t)(code - last_code); 4016 4017 /* If "previous" is set and it is not at the start of the work space, move 4018 it back to there, in order to avoid filling up the work space. Otherwise, 4019 if "previous" is NULL, reset the current code pointer to the start. */ 4020 4021 if (previous != NULL) 4022 { 4023 if (previous > orig_code) 4024 { 4025 memmove(orig_code, previous, (size_t)CU2BYTES(code - previous)); 4026 code -= previous - orig_code; 4027 previous = orig_code; 4028 } 4029 } 4030 else code = orig_code; 4031 4032 /* Remember where this code item starts so we can pick up the length 4033 next time round. */ 4034 4035 last_code = code; 4036 } 4037 4038 /* Before doing anything else we must handle all the special items that do 4039 nothing, and which may come between an item and its quantifier. 
Otherwise, 4040 when auto-callouts are enabled, a callout gets incorrectly inserted before 4041 the quantifier is recognized. After recognizing a "do nothing" item, restart 4042 the loop in case another one follows. */ 4043 4044 /* If c is not NULL we are not at the end of the pattern. If it is NULL, we 4045 may still be in the pattern with a NULL data item. In these cases, if we are 4046 in \Q...\E, check for the \E that ends the literal string; if not, we have a 4047 literal character. If not in \Q...\E, an isolated \E is ignored. */ 4048 4049 if (c != CHAR_NULL || ptr < cb->end_pattern) 4050 { 4051 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) 4052 { 4053 inescq = FALSE; 4054 ptr++; 4055 continue; 4056 } 4057 else if (inescq) /* Literal character */ 4058 { 4059 if (previous_callout != NULL) 4060 { 4061 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ 4062 complete_callout(previous_callout, ptr, cb); 4063 previous_callout = NULL; 4064 } 4065 if ((options & PCRE2_AUTO_CALLOUT) != 0) 4066 { 4067 previous_callout = code; 4068 code = auto_callout(code, ptr, cb); 4069 } 4070 goto NORMAL_CHAR; 4071 } 4072 4073 /* Check for the start of a \Q...\E sequence. We must do this here rather 4074 than later in case it is immediately followed by \E, which turns it into a 4075 "do nothing" sequence. */ 4076 4077 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q) 4078 { 4079 inescq = TRUE; 4080 ptr++; 4081 continue; 4082 } 4083 } 4084 4085 /* In extended mode, skip white space and #-comments that end at newline. */ 4086 4087 if ((options & PCRE2_EXTENDED) != 0) 4088 { 4089 PCRE2_SPTR wscptr = ptr; 4090 while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr); 4091 if (c == CHAR_NUMBER_SIGN) 4092 { 4093 ptr++; 4094 while (ptr < cb->end_pattern) 4095 { 4096 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ 4097 { /* IS_NEWLINE sets cb->nllen. 
*/ 4098 ptr += cb->nllen; 4099 break; 4100 } 4101 ptr++; 4102 #ifdef SUPPORT_UNICODE 4103 if (utf) FORWARDCHAR(ptr); 4104 #endif 4105 } 4106 } 4107 4108 /* If we skipped any characters, restart the loop. Otherwise, we didn't see 4109 a comment. */ 4110 4111 if (ptr > wscptr) goto REDO_LOOP; 4112 } 4113 4114 /* Skip over (?# comments. */ 4115 4116 if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK && 4117 ptr[2] == CHAR_NUMBER_SIGN) 4118 { 4119 ptr += 3; 4120 while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; 4121 if (*ptr != CHAR_RIGHT_PARENTHESIS) 4122 { 4123 *errorcodeptr = ERR18; 4124 goto FAILED; 4125 } 4126 continue; 4127 } 4128 4129 /* End of processing "do nothing" items. See if the next thing is a 4130 quantifier. */ 4131 4132 is_quantifier = 4133 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || 4134 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1)); 4135 4136 /* Fill in length of a previous callout and create an auto callout if 4137 required, except when the next thing is a quantifier or when processing a 4138 property substitution string for \w etc in UCP mode. */ 4139 4140 if (!is_quantifier && cb->nestptr[0] == NULL) 4141 { 4142 if (previous_callout != NULL && after_manual_callout-- <= 0) 4143 { 4144 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ 4145 complete_callout(previous_callout, ptr, cb); 4146 previous_callout = NULL; 4147 } 4148 4149 if ((options & PCRE2_AUTO_CALLOUT) != 0) 4150 { 4151 previous_callout = code; 4152 code = auto_callout(code, ptr, cb); 4153 } 4154 } 4155 4156 /* Process the next pattern item. 
*/ 4157 4158 switch(c) 4159 { 4160 /* ===================================================================*/ 4161 /* The branch terminates at string end or | or ) */ 4162 4163 case CHAR_NULL: 4164 if (ptr < cb->end_pattern) goto NORMAL_CHAR; /* Zero data character */ 4165 /* Fall through */ 4166 4167 case CHAR_VERTICAL_LINE: 4168 case CHAR_RIGHT_PARENTHESIS: 4169 *firstcuptr = firstcu; 4170 *firstcuflagsptr = firstcuflags; 4171 *reqcuptr = reqcu; 4172 *reqcuflagsptr = reqcuflags; 4173 *codeptr = code; 4174 *ptrptr = ptr; 4175 if (lengthptr != NULL) 4176 { 4177 if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code)) 4178 { 4179 *errorcodeptr = ERR20; 4180 goto FAILED; 4181 } 4182 *lengthptr += (size_t)(code - last_code); /* To include callout length */ 4183 } 4184 return TRUE; 4185 4186 4187 /* ===================================================================*/ 4188 /* Handle single-character metacharacters. In multiline mode, ^ disables 4189 the setting of any following char as a first character. */ 4190 4191 case CHAR_CIRCUMFLEX_ACCENT: 4192 previous = NULL; 4193 if ((options & PCRE2_MULTILINE) != 0) 4194 { 4195 if (firstcuflags == REQ_UNSET) 4196 zerofirstcuflags = firstcuflags = REQ_NONE; 4197 *code++ = OP_CIRCM; 4198 } 4199 else *code++ = OP_CIRC; 4200 break; 4201 4202 case CHAR_DOLLAR_SIGN: 4203 previous = NULL; 4204 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL; 4205 break; 4206 4207 /* There can never be a first char if '.' is first, whatever happens about 4208 repeats. The value of reqcu doesn't change either. */ 4209 4210 case CHAR_DOT: 4211 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 4212 zerofirstcu = firstcu; 4213 zerofirstcuflags = firstcuflags; 4214 zeroreqcu = reqcu; 4215 zeroreqcuflags = reqcuflags; 4216 previous = code; 4217 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; 4218 break; 4219 4220 4221 /* ===================================================================*/ 4222 /* Character classes. 
If the included characters are all < 256, we build a 4223 32-byte bitmap of the permitted characters, except in the special case 4224 where there is only one such character. For negated classes, we build the 4225 map as usual, then invert it at the end. However, we use a different opcode 4226 so that data characters > 255 can be handled correctly. 4227 4228 If the class contains characters outside the 0-255 range, a different 4229 opcode is compiled. It may optionally have a bit map for characters < 256, 4230 but those above are are explicitly listed afterwards. A flag byte tells 4231 whether the bitmap is present, and whether this is a negated class or not. 4232 4233 An isolated ']' character is not treated specially, so is just another data 4234 character. In earlier versions of PCRE that used the original API there was 4235 a "JavaScript compatibility mode" in which it gave an error. However, 4236 JavaScript itself has changed in this respect so there is no longer any 4237 need for this special handling. 4238 4239 In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is 4240 used for "start of word" and "end of word". As these are otherwise illegal 4241 sequences, we don't break anything by recognizing them. They are replaced 4242 by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top 4243 nesting level, as no other inserted sequences will contains these oddities. 4244 Sequences like [a[:<:]] are erroneous and are handled by the normal code 4245 below. */ 4246 4247 case CHAR_LEFT_SQUARE_BRACKET: 4248 if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0) 4249 { 4250 cb->nestptr[0] = ptr + 7; 4251 ptr = sub_start_of_word; 4252 goto REDO_LOOP; 4253 } 4254 4255 if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0) 4256 { 4257 cb->nestptr[0] = ptr + 7; 4258 ptr = sub_end_of_word; 4259 goto REDO_LOOP; 4260 } 4261 4262 /* Handle a real character class. 
*/ 4263 4264 previous = code; 4265 4266 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if 4267 they are encountered at the top level, so we'll do that too. */ 4268 4269 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 4270 ptr[1] == CHAR_EQUALS_SIGN) && 4271 check_posix_syntax(ptr, &tempptr)) 4272 { 4273 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13; 4274 goto FAILED; 4275 } 4276 4277 /* If the first character is '^', set the negation flag and skip it. Also, 4278 if the first few characters (either before or after ^) are \Q\E or \E we 4279 skip them too. This makes for compatibility with Perl. */ 4280 4281 negate_class = FALSE; 4282 for (;;) 4283 { 4284 c = *(++ptr); 4285 if (c == CHAR_BACKSLASH) 4286 { 4287 if (ptr[1] == CHAR_E) 4288 ptr++; 4289 else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0) 4290 ptr += 3; 4291 else 4292 break; 4293 } 4294 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) 4295 negate_class = TRUE; 4296 else break; 4297 } 4298 4299 /* Empty classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. Otherwise, 4300 an initial ']' is taken as a data character -- the code below handles 4301 that. When empty classes are allowed, [] must always fail, so generate 4302 OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */ 4303 4304 if (c == CHAR_RIGHT_SQUARE_BRACKET && 4305 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) 4306 { 4307 *code++ = negate_class? OP_ALLANY : OP_FAIL; 4308 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 4309 zerofirstcu = firstcu; 4310 zerofirstcuflags = firstcuflags; 4311 break; 4312 } 4313 4314 /* If a non-extended class contains a negative special such as \S, we need 4315 to flip the negation flag at the end, so that support for characters > 255 4316 works correctly (they are all included in the class). An extended class may 4317 need to insert specific matching or non-matching code for wide characters. 
4318 */ 4319 4320 should_flip_negation = match_all_or_no_wide_chars = FALSE; 4321 4322 /* Extended class (xclass) will be used when characters > 255 4323 might match. */ 4324 4325 #ifdef SUPPORT_WIDE_CHARS 4326 xclass = FALSE; 4327 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ 4328 class_uchardata_base = class_uchardata; /* Save the start */ 4329 #endif 4330 4331 /* For optimization purposes, we track some properties of the class: 4332 class_has_8bitchar will be non-zero if the class contains at least one 256 4333 character with a code point less than 256; class_one_char will be 1 if the 4334 class contains just one character; xclass_has_prop will be TRUE if Unicode 4335 property checks are present in the class. */ 4336 4337 class_has_8bitchar = 0; 4338 class_one_char = 0; 4339 #ifdef SUPPORT_WIDE_CHARS 4340 xclass_has_prop = FALSE; 4341 #endif 4342 4343 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map 4344 in a temporary bit of memory, in case the class contains fewer than two 4345 8-bit characters because in that case the compiled code doesn't use the bit 4346 map. */ 4347 4348 memset(classbits, 0, 32 * sizeof(uint8_t)); 4349 4350 /* Process characters until ] is reached. As the test is at the end of the 4351 loop, an initial ] is taken as a data character. At the start of the loop, 4352 c contains the first code unit of the character. If it is zero, check for 4353 the end of the pattern, to allow binary zero as data. 
*/ 4354 4355 for(;;) 4356 { 4357 PCRE2_SPTR oldptr; 4358 #ifdef EBCDIC 4359 BOOL range_is_literal = TRUE; 4360 #endif 4361 4362 if (c == CHAR_NULL && ptr >= cb->end_pattern) 4363 { 4364 *errorcodeptr = ERR6; /* Missing terminating ']' */ 4365 goto FAILED; 4366 } 4367 4368 #ifdef SUPPORT_UNICODE 4369 if (utf && HAS_EXTRALEN(c)) 4370 { /* Braces are required because the */ 4371 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ 4372 } 4373 #endif 4374 4375 /* Inside \Q...\E everything is literal except \E */ 4376 4377 if (inescq) 4378 { 4379 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */ 4380 { 4381 inescq = FALSE; /* Reset literal state */ 4382 ptr++; /* Skip the 'E' */ 4383 goto CONTINUE_CLASS; /* Carry on with next char */ 4384 } 4385 goto CHECK_RANGE; /* Could be range if \E follows */ 4386 } 4387 4388 /* Handle POSIX class names. Perl allows a negation extension of the 4389 form [:^name:]. A square bracket that doesn't match the syntax is 4390 treated as a literal. We also recognize the POSIX constructions 4391 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 4392 5.6 and 5.8 do. */ 4393 4394 if (c == CHAR_LEFT_SQUARE_BRACKET && 4395 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 4396 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr)) 4397 { 4398 BOOL local_negate = FALSE; 4399 int posix_class, taboffset, tabopt; 4400 register const uint8_t *cbits = cb->cbits; 4401 uint8_t pbits[32]; 4402 4403 if (ptr[1] != CHAR_COLON) 4404 { 4405 *errorcodeptr = ERR13; 4406 goto FAILED; 4407 } 4408 4409 ptr += 2; 4410 if (*ptr == CHAR_CIRCUMFLEX_ACCENT) 4411 { 4412 local_negate = TRUE; 4413 should_flip_negation = TRUE; /* Note negative special */ 4414 ptr++; 4415 } 4416 4417 posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); 4418 if (posix_class < 0) 4419 { 4420 *errorcodeptr = ERR30; 4421 goto FAILED; 4422 } 4423 4424 /* If matching is caseless, upper and lower are converted to 4425 alpha. 
This relies on the fact that the class table starts with 4426 alpha, lower, upper as the first 3 entries. */ 4427 4428 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) 4429 posix_class = 0; 4430 4431 /* When PCRE2_UCP is set, some of the POSIX classes are converted to 4432 different escape sequences that use Unicode properties \p or \P. Others 4433 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP 4434 directly. UCP support is not available unless UTF support is.*/ 4435 4436 #ifdef SUPPORT_UNICODE 4437 if ((options & PCRE2_UCP) != 0) 4438 { 4439 unsigned int ptype = 0; 4440 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0); 4441 4442 /* The posix_substitutes table specifies which POSIX classes can be 4443 converted to \p or \P items. This can only happen at top nestling 4444 level, as there will never be a POSIX class in a string that is 4445 substituted for something else. */ 4446 4447 if (posix_substitutes[pc] != NULL) 4448 { 4449 cb->nestptr[0] = tempptr + 1; 4450 ptr = posix_substitutes[pc] - 1; 4451 goto CONTINUE_CLASS; 4452 } 4453 4454 /* There are three other classes that generate special property calls 4455 that are recognized only in an XCLASS. */ 4456 4457 else switch(posix_class) 4458 { 4459 case PC_GRAPH: 4460 ptype = PT_PXGRAPH; 4461 /* Fall through */ 4462 case PC_PRINT: 4463 if (ptype == 0) ptype = PT_PXPRINT; 4464 /* Fall through */ 4465 case PC_PUNCT: 4466 if (ptype == 0) ptype = PT_PXPUNCT; 4467 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; 4468 *class_uchardata++ = (PCRE2_UCHAR)ptype; 4469 *class_uchardata++ = 0; 4470 xclass_has_prop = TRUE; 4471 ptr = tempptr + 1; 4472 goto CONTINUE_CLASS; 4473 4474 /* For the other POSIX classes (ascii, xdigit) we are going to fall 4475 through to the non-UCP case and build a bit map for characters with 4476 code points less than 256. 
However, if we are in a negated POSIX 4477 class, characters with code points greater than 255 must either all 4478 match or all not match, depending on whether the whole class is not 4479 or is negated. For example, for [[:^ascii:]... they must all match, 4480 whereas for [^[:^xdigit:]... they must not. 4481 4482 In the special case where there are no xclass items, this is 4483 automatically handled by the use of OP_CLASS or OP_NCLASS, but an 4484 explicit range is needed for OP_XCLASS. Setting a flag here causes 4485 the range to be generated later when it is known that OP_XCLASS is 4486 required. */ 4487 4488 default: 4489 match_all_or_no_wide_chars |= local_negate; 4490 break; 4491 } 4492 } 4493 #endif /* SUPPORT_UNICODE */ 4494 4495 /* In the non-UCP case, or when UCP makes no difference, we build the 4496 bit map for the POSIX class in a chunk of local store because we may be 4497 adding and subtracting from it, and we don't want to subtract bits that 4498 may be in the main map already. At the end we or the result into the 4499 bit map that is being built. */ 4500 4501 posix_class *= 3; 4502 4503 /* Copy in the first table (always present) */ 4504 4505 memcpy(pbits, cbits + posix_class_maps[posix_class], 4506 32 * sizeof(uint8_t)); 4507 4508 /* If there is a second table, add or remove it as required. */ 4509 4510 taboffset = posix_class_maps[posix_class + 1]; 4511 tabopt = posix_class_maps[posix_class + 2]; 4512 4513 if (taboffset >= 0) 4514 { 4515 if (tabopt >= 0) 4516 for (c = 0; c < 32; c++) pbits[c] |= cbits[(int)c + taboffset]; 4517 else 4518 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[(int)c + taboffset]; 4519 } 4520 4521 /* Now see if we need to remove any special characters. An option 4522 value of 1 removes vertical space and 2 removes underscore. 
*/ 4523 4524 if (tabopt < 0) tabopt = -tabopt; 4525 if (tabopt == 1) pbits[1] &= ~0x3c; 4526 else if (tabopt == 2) pbits[11] &= 0x7f; 4527 4528 /* Add the POSIX table or its complement into the main table that is 4529 being built and we are done. */ 4530 4531 if (local_negate) 4532 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; 4533 else 4534 for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; 4535 4536 ptr = tempptr + 1; 4537 /* Every class contains at least one < 256 character. */ 4538 class_has_8bitchar = 1; 4539 /* Every class contains at least two characters. */ 4540 class_one_char = 2; 4541 goto CONTINUE_CLASS; /* End of POSIX syntax handling */ 4542 } 4543 4544 /* Backslash may introduce a single character, or it may introduce one 4545 of the specials, which just set a flag. The sequence \b is a special 4546 case. Inside a class (and only there) it is treated as backspace. We 4547 assume that other escapes have more than one character in them, so 4548 speculatively set both class_has_8bitchar and class_one_char bigger 4549 than one. Unrecognized escapes fall through and are faulted. 
*/ 4550 4551 if (c == CHAR_BACKSLASH) 4552 { 4553 escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr, 4554 options, TRUE, cb); 4555 if (*errorcodeptr != 0) goto FAILED; 4556 if (escape == 0) /* Escaped single char */ 4557 { 4558 c = ec; 4559 #ifdef EBCDIC 4560 range_is_literal = FALSE; 4561 #endif 4562 } 4563 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ 4564 else if (escape == ESC_N) /* \N is not supported in a class */ 4565 { 4566 *errorcodeptr = ERR71; 4567 goto FAILED; 4568 } 4569 else if (escape == ESC_Q) /* Handle start of quoted string */ 4570 { 4571 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) 4572 { 4573 ptr += 2; /* avoid empty string */ 4574 } 4575 else inescq = TRUE; 4576 goto CONTINUE_CLASS; 4577 } 4578 else if (escape == ESC_E) goto CONTINUE_CLASS; /* Ignore orphan \E */ 4579 4580 else /* Handle \d-type escapes */ 4581 { 4582 register const uint8_t *cbits = cb->cbits; 4583 /* Every class contains at least two < 256 characters. */ 4584 class_has_8bitchar++; 4585 /* Every class contains at least two characters. */ 4586 class_one_char += 2; 4587 4588 switch (escape) 4589 { 4590 #ifdef SUPPORT_UNICODE 4591 case ESC_du: /* These are the values given for \d etc */ 4592 case ESC_DU: /* when PCRE2_UCP is set. We replace the */ 4593 case ESC_wu: /* escape sequence with an appropriate \p */ 4594 case ESC_WU: /* or \P to test Unicode properties instead */ 4595 case ESC_su: /* of the default ASCII testing. This might be */ 4596 case ESC_SU: /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */ 4597 cb->nestptr[1] = cb->nestptr[0]; 4598 cb->nestptr[0] = ptr; 4599 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ 4600 class_has_8bitchar--; /* Undo! 
*/ 4601 break; 4602 #endif 4603 case ESC_d: 4604 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; 4605 break; 4606 4607 case ESC_D: 4608 should_flip_negation = TRUE; 4609 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; 4610 break; 4611 4612 case ESC_w: 4613 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; 4614 break; 4615 4616 case ESC_W: 4617 should_flip_negation = TRUE; 4618 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; 4619 break; 4620 4621 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl 4622 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was 4623 previously set by something earlier in the character class. 4624 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so 4625 we could just adjust the appropriate bit. From PCRE 8.34 we no 4626 longer treat \s and \S specially. */ 4627 4628 case ESC_s: 4629 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; 4630 break; 4631 4632 case ESC_S: 4633 should_flip_negation = TRUE; 4634 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; 4635 break; 4636 4637 /* The rest apply in both UCP and non-UCP cases. 
*/ 4638 4639 case ESC_h: 4640 (void)add_list_to_class(classbits, &class_uchardata, options, cb, 4641 PRIV(hspace_list), NOTACHAR); 4642 break; 4643 4644 case ESC_H: 4645 (void)add_not_list_to_class(classbits, &class_uchardata, options, 4646 cb, PRIV(hspace_list)); 4647 break; 4648 4649 case ESC_v: 4650 (void)add_list_to_class(classbits, &class_uchardata, options, cb, 4651 PRIV(vspace_list), NOTACHAR); 4652 break; 4653 4654 case ESC_V: 4655 (void)add_not_list_to_class(classbits, &class_uchardata, options, 4656 cb, PRIV(vspace_list)); 4657 break; 4658 4659 case ESC_p: 4660 case ESC_P: 4661 #ifdef SUPPORT_UNICODE 4662 { 4663 BOOL negated; 4664 unsigned int ptype = 0, pdata = 0; 4665 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb)) 4666 goto FAILED; 4667 *class_uchardata++ = ((escape == ESC_p) != negated)? 4668 XCL_PROP : XCL_NOTPROP; 4669 *class_uchardata++ = ptype; 4670 *class_uchardata++ = pdata; 4671 xclass_has_prop = TRUE; 4672 class_has_8bitchar--; /* Undo! */ 4673 } 4674 break; 4675 #else 4676 *errorcodeptr = ERR45; 4677 goto FAILED; 4678 #endif 4679 /* Unrecognized escapes are faulted. */ 4680 4681 default: 4682 *errorcodeptr = ERR7; 4683 goto FAILED; 4684 } 4685 4686 /* Handled \d-type escape */ 4687 4688 goto CONTINUE_CLASS; 4689 } 4690 4691 /* Control gets here if the escape just defined a single character. 4692 This is in c and may be greater than 256. */ 4693 4694 escape = 0; 4695 } /* End of backslash handling */ 4696 4697 /* A character may be followed by '-' to form a range. However, Perl does 4698 not permit ']' to be the end of the range. A '-' character at the end is 4699 treated as a literal. Perl ignores orphaned \E sequences entirely. The 4700 code for handling \Q and \E is messy. 
*/ 4701 4702 CHECK_RANGE: 4703 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) 4704 { 4705 inescq = FALSE; 4706 ptr += 2; 4707 } 4708 oldptr = ptr; 4709 4710 /* Remember if \r or \n were explicitly used */ 4711 4712 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; 4713 4714 /* Check for range */ 4715 4716 if (!inescq && ptr[1] == CHAR_MINUS) 4717 { 4718 uint32_t d; 4719 ptr += 2; 4720 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2; 4721 4722 /* If we hit \Q (not followed by \E) at this point, go into escaped 4723 mode. */ 4724 4725 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q) 4726 { 4727 ptr += 2; 4728 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) 4729 { ptr += 2; continue; } 4730 inescq = TRUE; 4731 break; 4732 } 4733 4734 /* Minus (hyphen) at the end of a class is treated as a literal, so put 4735 back the pointer and jump to handle the character that preceded it. */ 4736 4737 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) 4738 { 4739 ptr = oldptr; 4740 goto CLASS_SINGLE_CHARACTER; 4741 } 4742 4743 /* Otherwise, we have a potential range; pick up the next character */ 4744 4745 #ifdef SUPPORT_UNICODE 4746 if (utf) 4747 { /* Braces are required because the */ 4748 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ 4749 } 4750 else 4751 #endif 4752 d = *ptr; /* Not UTF mode */ 4753 4754 /* The second part of a range can be a single-character escape 4755 sequence, but not any of the other escapes. Perl treats a hyphen as a 4756 literal in such circumstances. However, in Perl's warning mode, a 4757 warning is given, so PCRE now faults it as it is almost certainly a 4758 mistake on the user's part. 
*/ 4759 4760 if (!inescq) 4761 { 4762 if (d == CHAR_BACKSLASH) 4763 { 4764 int descape; 4765 descape = PRIV(check_escape)(&ptr, cb->end_pattern, &d, 4766 errorcodeptr, options, TRUE, cb); 4767 if (*errorcodeptr != 0) goto FAILED; 4768 #ifdef EBCDIC 4769 range_is_literal = FALSE; 4770 #endif 4771 /* 0 means a character was put into d; \b is backspace; any other 4772 special causes an error. */ 4773 4774 if (descape != 0) 4775 { 4776 if (descape == ESC_b) d = CHAR_BS; else 4777 { 4778 *errorcodeptr = ERR50; 4779 goto FAILED; 4780 } 4781 } 4782 } 4783 4784 /* A hyphen followed by a POSIX class is treated in the same way. */ 4785 4786 else if (d == CHAR_LEFT_SQUARE_BRACKET && 4787 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 4788 ptr[1] == CHAR_EQUALS_SIGN) && 4789 check_posix_syntax(ptr, &tempptr)) 4790 { 4791 *errorcodeptr = ERR50; 4792 goto FAILED; 4793 } 4794 } 4795 4796 /* Check that the two values are in the correct order. Optimize 4797 one-character ranges. */ 4798 4799 if (d < c) 4800 { 4801 *errorcodeptr = ERR8; 4802 goto FAILED; 4803 } 4804 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */ 4805 4806 /* We have found a character range, so single character optimizations 4807 cannot be done anymore. Any value greater than 1 indicates that there 4808 is more than one character. */ 4809 4810 class_one_char = 2; 4811 4812 /* Remember an explicit \r or \n, and add the range to the class. */ 4813 4814 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; 4815 4816 /* In an EBCDIC environment, Perl treats alphabetic ranges specially 4817 because there are holes in the encoding, and simply using the range A-Z 4818 (for example) would include the characters in the holes. This applies 4819 only to literal ranges; [\xC1-\xE9] is different to [A-Z]. 
*/ 4820 4821 #ifdef EBCDIC 4822 if (range_is_literal && 4823 (cb->ctypes[c] & ctype_letter) != 0 && 4824 (cb->ctypes[d] & ctype_letter) != 0 && 4825 (c <= CHAR_z) == (d <= CHAR_z)) 4826 { 4827 uint32_t uc = (c <= CHAR_z)? 0 : 64; 4828 uint32_t C = c - uc; 4829 uint32_t D = d - uc; 4830 4831 if (C <= CHAR_i) 4832 { 4833 class_has_8bitchar += 4834 add_to_class(classbits, &class_uchardata, options, cb, C + uc, 4835 ((D < CHAR_i)? D : CHAR_i) + uc); 4836 C = CHAR_j; 4837 } 4838 4839 if (C <= D && C <= CHAR_r) 4840 { 4841 class_has_8bitchar += 4842 add_to_class(classbits, &class_uchardata, options, cb, C + uc, 4843 ((D < CHAR_r)? D : CHAR_r) + uc); 4844 C = CHAR_s; 4845 } 4846 4847 if (C <= D) 4848 { 4849 class_has_8bitchar += 4850 add_to_class(classbits, &class_uchardata, options, cb, C + uc, 4851 D + uc); 4852 } 4853 } 4854 else 4855 #endif 4856 class_has_8bitchar += 4857 add_to_class(classbits, &class_uchardata, options, cb, c, d); 4858 goto CONTINUE_CLASS; /* Go get the next char in the class */ 4859 } 4860 4861 /* Handle a single character - we can get here for a normal non-escape 4862 char, or after \ that introduces a single character or for an apparent 4863 range that isn't. Only the value 1 matters for class_one_char, so don't 4864 increase it if it is already 2 or more ... just in case there's a class 4865 with a zillion characters in it. */ 4866 4867 CLASS_SINGLE_CHARACTER: 4868 if (class_one_char < 2) class_one_char++; 4869 4870 /* If class_one_char is 1 and xclass_has_prop is false, we have the first 4871 single character in the class, and there have been no prior ranges, or 4872 XCLASS items generated by escapes. If this is the final character in the 4873 class, we can optimize by turning the item into a 1-character OP_CHAR[I] 4874 if it's positive, or OP_NOT[I] if it's negative. In the positive case, it 4875 can cause firstcu to be set. Otherwise, there can be no first char if 4876 this item is first, whatever repeat count may follow. 
In the case of 4877 reqcu, save the previous value for reinstating. */ 4878 4879 if (!inescq && 4880 #ifdef SUPPORT_UNICODE 4881 !xclass_has_prop && 4882 #endif 4883 class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) 4884 { 4885 ptr++; 4886 zeroreqcu = reqcu; 4887 zeroreqcuflags = reqcuflags; 4888 4889 if (negate_class) 4890 { 4891 #ifdef SUPPORT_UNICODE 4892 int d; 4893 #endif 4894 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 4895 zerofirstcu = firstcu; 4896 zerofirstcuflags = firstcuflags; 4897 4898 /* For caseless UTF mode, check whether this character has more than 4899 one other case. If so, generate a special OP_NOTPROP item instead of 4900 OP_NOTI. */ 4901 4902 #ifdef SUPPORT_UNICODE 4903 if (utf && (options & PCRE2_CASELESS) != 0 && 4904 (d = UCD_CASESET(c)) != 0) 4905 { 4906 *code++ = OP_NOTPROP; 4907 *code++ = PT_CLIST; 4908 *code++ = d; 4909 } 4910 else 4911 #endif 4912 /* Char has only one other case, or UCP not available */ 4913 4914 { 4915 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT; 4916 code += PUTCHAR(c, code); 4917 } 4918 4919 /* We are finished with this character class */ 4920 4921 goto END_CLASS; 4922 } 4923 4924 /* For a single, positive character, get the value into mcbuffer, and 4925 then we can handle this with the normal one-character code. */ 4926 4927 mclength = PUTCHAR(c, mcbuffer); 4928 goto ONE_CHAR; 4929 } /* End of 1-char optimization */ 4930 4931 /* There is more than one character in the class, or an XCLASS item 4932 has been generated. Add this character to the class. */ 4933 4934 class_has_8bitchar += 4935 add_to_class(classbits, &class_uchardata, options, cb, c, c); 4936 4937 /* Continue to the next character in the class. Closing square bracket 4938 not within \Q..\E ends the class. A NULL character terminates a 4939 nested substitution string, but may be a data character in the main 4940 pattern (tested at the start of this loop). 
*/ 4941 4942 CONTINUE_CLASS: 4943 c = *(++ptr); 4944 if (c == CHAR_NULL && cb->nestptr[0] != NULL) 4945 { 4946 ptr = cb->nestptr[0]; 4947 cb->nestptr[0] = cb->nestptr[1]; 4948 cb->nestptr[1] = NULL; 4949 c = *(++ptr); 4950 } 4951 4952 #ifdef SUPPORT_WIDE_CHARS 4953 /* If any wide characters have been encountered, set xclass = TRUE. Then, 4954 in the pre-compile phase, accumulate the length of the wide characters 4955 and reset the pointer. This is so that very large classes that contain a 4956 zillion wide characters do not overwrite the work space (which is on the 4957 stack). */ 4958 4959 if (class_uchardata > class_uchardata_base) 4960 { 4961 xclass = TRUE; 4962 if (lengthptr != NULL) 4963 { 4964 *lengthptr += class_uchardata - class_uchardata_base; 4965 class_uchardata = class_uchardata_base; 4966 } 4967 } 4968 #endif 4969 /* An unescaped ] ends the class */ 4970 4971 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; 4972 } /* End of main class-processing loop */ 4973 4974 /* If this is the first thing in the branch, there can be no first char 4975 setting, whatever the repeat count. Any reqcu setting must remain 4976 unchanged after any kind of repeat. */ 4977 4978 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 4979 zerofirstcu = firstcu; 4980 zerofirstcuflags = firstcuflags; 4981 zeroreqcu = reqcu; 4982 zeroreqcuflags = reqcuflags; 4983 4984 /* If there are characters with values > 255, or Unicode property settings 4985 (\p or \P), we have to compile an extended class, with its own opcode, 4986 unless there were no property settings and there was a negated special such 4987 as \S in the class, and PCRE2_UCP is not set, because in that case all 4988 characters > 255 are in or not in the class, so any that were explicitly 4989 given as well can be ignored. 
4990 4991 In the UCP case, if certain negated POSIX classes ([:^ascii:] or 4992 [^:xdigit:]) were present in a class, we either have to match or not match 4993 all wide characters (depending on whether the whole class is or is not 4994 negated). This requirement is indicated by match_all_or_no_wide_chars being 4995 true. We do this by including an explicit range, which works in both cases. 4996 4997 If, when generating an xclass, there are no characters < 256, we can omit 4998 the bitmap in the actual compiled code. */ 4999 5000 #ifdef SUPPORT_WIDE_CHARS 5001 #ifdef SUPPORT_UNICODE 5002 if (xclass && (xclass_has_prop || !should_flip_negation || 5003 (options & PCRE2_UCP) != 0)) 5004 #elif PCRE2_CODE_UNIT_WIDTH != 8 5005 if (xclass && (xclass_has_prop || !should_flip_negation)) 5006 #endif 5007 { 5008 if (match_all_or_no_wide_chars) 5009 { 5010 *class_uchardata++ = XCL_RANGE; 5011 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); 5012 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); 5013 } 5014 *class_uchardata++ = XCL_END; /* Marks the end of extra data */ 5015 *code++ = OP_XCLASS; 5016 code += LINK_SIZE; 5017 *code = negate_class? XCL_NOT:0; 5018 if (xclass_has_prop) *code |= XCL_HASPROP; 5019 5020 /* If the map is required, move up the extra data to make room for it; 5021 otherwise just move the code pointer to the end of the extra data. 
*/ 5022 5023 if (class_has_8bitchar > 0) 5024 { 5025 *code++ |= XCL_MAP; 5026 memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, 5027 CU2BYTES(class_uchardata - code)); 5028 if (negate_class && !xclass_has_prop) 5029 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; 5030 memcpy(code, classbits, 32); 5031 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); 5032 } 5033 else code = class_uchardata; 5034 5035 /* Now fill in the complete length of the item */ 5036 5037 PUT(previous, 1, (int)(code - previous)); 5038 break; /* End of class handling */ 5039 } 5040 #endif 5041 5042 /* If there are no characters > 255, or they are all to be included or 5043 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the 5044 whole class was negated and whether there were negative specials such as \S 5045 (non-UCP) in the class. Then copy the 32-byte map into the code vector, 5046 negating it if necessary. */ 5047 5048 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; 5049 if (lengthptr == NULL) /* Save time in the pre-compile phase */ 5050 { 5051 if (negate_class) 5052 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; 5053 memcpy(code, classbits, 32); 5054 } 5055 code += 32 / sizeof(PCRE2_UCHAR); 5056 5057 END_CLASS: 5058 break; 5059 5060 5061 /* ===================================================================*/ 5062 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this 5063 has been tested above. 
*/ 5064 5065 case CHAR_LEFT_CURLY_BRACKET: 5066 if (!is_quantifier) goto NORMAL_CHAR; 5067 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); 5068 if (*errorcodeptr != 0) goto FAILED; 5069 goto REPEAT; 5070 5071 case CHAR_ASTERISK: 5072 repeat_min = 0; 5073 repeat_max = -1; 5074 goto REPEAT; 5075 5076 case CHAR_PLUS: 5077 repeat_min = 1; 5078 repeat_max = -1; 5079 goto REPEAT; 5080 5081 case CHAR_QUESTION_MARK: 5082 repeat_min = 0; 5083 repeat_max = 1; 5084 5085 REPEAT: 5086 if (previous == NULL) 5087 { 5088 *errorcodeptr = ERR9; 5089 goto FAILED; 5090 } 5091 5092 if (repeat_min == 0) 5093 { 5094 firstcu = zerofirstcu; /* Adjust for zero repeat */ 5095 firstcuflags = zerofirstcuflags; 5096 reqcu = zeroreqcu; /* Ditto */ 5097 reqcuflags = zeroreqcuflags; 5098 } 5099 5100 /* Remember whether this is a variable length repeat */ 5101 5102 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; 5103 5104 op_type = 0; /* Default single-char op codes */ 5105 possessive_quantifier = FALSE; /* Default not possessive quantifier */ 5106 5107 /* Save start of previous item, in case we have to move it up in order to 5108 insert something before it. */ 5109 5110 tempcode = previous; 5111 5112 /* Before checking for a possessive quantifier, we must skip over 5113 whitespace and comments in extended mode because Perl allows white space at 5114 this point. */ 5115 5116 if ((options & PCRE2_EXTENDED) != 0) 5117 { 5118 ptr++; 5119 for (;;) 5120 { 5121 while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_space) != 0) ptr++; 5122 if (*ptr != CHAR_NUMBER_SIGN) break; 5123 ptr++; 5124 while (ptr < cb->end_pattern) 5125 { 5126 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ 5127 { /* IS_NEWLINE sets cb->nllen. 
*/ 5128 ptr += cb->nllen; 5129 break; 5130 } 5131 ptr++; 5132 #ifdef SUPPORT_UNICODE 5133 if (utf) FORWARDCHAR(ptr); 5134 #endif 5135 } /* Loop for comment characters */ 5136 } /* Loop for multiple comments */ 5137 ptr--; /* Last code unit of previous character. */ 5138 } 5139 5140 /* If the next character is '+', we have a possessive quantifier. This 5141 implies greediness, whatever the setting of the PCRE2_UNGREEDY option. 5142 If the next character is '?' this is a minimizing repeat, by default, 5143 but if PCRE2_UNGREEDY is set, it works the other way round. We change the 5144 repeat type to the non-default. */ 5145 5146 if (ptr[1] == CHAR_PLUS) 5147 { 5148 repeat_type = 0; /* Force greedy */ 5149 possessive_quantifier = TRUE; 5150 ptr++; 5151 } 5152 else if (ptr[1] == CHAR_QUESTION_MARK) 5153 { 5154 repeat_type = greedy_non_default; 5155 ptr++; 5156 } 5157 else repeat_type = greedy_default; 5158 5159 /* If the repeat is {1} we can ignore it. */ 5160 5161 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; 5162 5163 /* If previous was a recursion call, wrap it in atomic brackets so that 5164 previous becomes the atomic group. All recursions were so wrapped in the 5165 past, but it no longer happens for non-repeated recursions. In fact, the 5166 repeated ones could be re-implemented independently so as not to need this, 5167 but for the moment we rely on the code for repeating groups. */ 5168 5169 if (*previous == OP_RECURSE) 5170 { 5171 memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE)); 5172 *previous = OP_ONCE; 5173 PUT(previous, 1, 2 + 2*LINK_SIZE); 5174 previous[2 + 2*LINK_SIZE] = OP_KET; 5175 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); 5176 code += 2 + 2 * LINK_SIZE; 5177 length_prevgroup = 3 + 3*LINK_SIZE; 5178 } 5179 5180 /* Now handle repetition for the different types of item. */ 5181 5182 /* If previous was a character or negated character match, abolish the item 5183 and generate a repeat item instead. 
If a char item has a minimum of more 5184 than one, ensure that it is set in reqcu - it might not be if a sequence 5185 such as x{3} is the first thing in a branch because the x will have gone 5186 into firstcu instead. */ 5187 5188 if (*previous == OP_CHAR || *previous == OP_CHARI 5189 || *previous == OP_NOT || *previous == OP_NOTI) 5190 { 5191 switch (*previous) 5192 { 5193 default: /* Make compiler happy. */ 5194 case OP_CHAR: op_type = OP_STAR - OP_STAR; break; 5195 case OP_CHARI: op_type = OP_STARI - OP_STAR; break; 5196 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break; 5197 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break; 5198 } 5199 5200 /* Deal with UTF characters that take up more than one code unit. It's 5201 easier to write this out separately than try to macrify it. Use c to 5202 hold the length of the character in code units, plus UTF_LENGTH to flag 5203 that it's a length rather than a small character. */ 5204 5205 #ifdef MAYBE_UTF_MULTI 5206 if (utf && NOT_FIRSTCU(code[-1])) 5207 { 5208 PCRE2_UCHAR *lastchar = code - 1; 5209 BACKCHAR(lastchar); 5210 c = (int)(code - lastchar); /* Length of UTF character */ 5211 memcpy(utf_units, lastchar, CU2BYTES(c)); /* Save the char */ 5212 c |= UTF_LENGTH; /* Flag c as a length */ 5213 } 5214 else 5215 #endif /* MAYBE_UTF_MULTI */ 5216 5217 /* Handle the case of a single charater - either with no UTF support, or 5218 with UTF disabled, or for a single-code-unit UTF character. */ 5219 { 5220 c = code[-1]; 5221 if (*previous <= OP_CHARI && repeat_min > 1) 5222 { 5223 reqcu = c; 5224 reqcuflags = req_caseopt | cb->req_varyopt; 5225 } 5226 } 5227 5228 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ 5229 } 5230 5231 /* If previous was a character type match (\d or similar), abolish it and 5232 create a suitable repeat item. The code is shared with single-character 5233 repeats by setting op_type to add a suitable offset into repeat_type. 
Note 5234 the the Unicode property types will be present only when SUPPORT_UNICODE is 5235 defined, but we don't wrap the little bits of code here because it just 5236 makes it horribly messy. */ 5237 5238 else if (*previous < OP_EODN) 5239 { 5240 PCRE2_UCHAR *oldcode; 5241 int prop_type, prop_value; 5242 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ 5243 c = *previous; /* Save previous opcode */ 5244 if (c == OP_PROP || c == OP_NOTPROP) 5245 { 5246 prop_type = previous[1]; 5247 prop_value = previous[2]; 5248 } 5249 else 5250 { 5251 /* Come here from just above with a character in c */ 5252 OUTPUT_SINGLE_REPEAT: 5253 prop_type = prop_value = -1; 5254 } 5255 5256 /* At this point we either have prop_type == prop_value == -1 and either 5257 a code point or a character type that is not OP_[NOT]PROP in c, or we 5258 have OP_[NOT]PROP in c and prop_type/prop_value not negative. */ 5259 5260 oldcode = code; /* Save where we were */ 5261 code = previous; /* Usually overwrite previous item */ 5262 5263 /* If the maximum is zero then the minimum must also be zero; Perl allows 5264 this case, so we do too - by simply omitting the item altogether. */ 5265 5266 if (repeat_max == 0) goto END_REPEAT; 5267 5268 /* Combine the op_type with the repeat_type */ 5269 5270 repeat_type += op_type; 5271 5272 /* A minimum of zero is handled either as the special case * or ?, or as 5273 an UPTO, with the maximum given. */ 5274 5275 if (repeat_min == 0) 5276 { 5277 if (repeat_max == -1) *code++ = OP_STAR + repeat_type; 5278 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; 5279 else 5280 { 5281 *code++ = OP_UPTO + repeat_type; 5282 PUT2INC(code, 0, repeat_max); 5283 } 5284 } 5285 5286 /* A repeat minimum of 1 is optimized into some special cases. If the 5287 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is 5288 left in place and, if the maximum is greater than 1, we use OP_UPTO with 5289 one less than the maximum. 
*/ 5290 5291 else if (repeat_min == 1) 5292 { 5293 if (repeat_max == -1) 5294 *code++ = OP_PLUS + repeat_type; 5295 else 5296 { 5297 code = oldcode; /* Leave previous item in place */ 5298 if (repeat_max == 1) goto END_REPEAT; 5299 *code++ = OP_UPTO + repeat_type; 5300 PUT2INC(code, 0, repeat_max - 1); 5301 } 5302 } 5303 5304 /* The case {n,n} is just an EXACT, while the general case {n,m} is 5305 handled as an EXACT followed by an UPTO or STAR or QUERY. */ 5306 5307 else 5308 { 5309 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ 5310 PUT2INC(code, 0, repeat_min); 5311 5312 /* Unless repeat_max equals repeat_min, fill in the data for EXACT, and 5313 then generate the second opcode. In UTF mode, multi-code-unit 5314 characters have their length in c, with the UTF_LENGTH bit as a flag, 5315 and the code units in utf_units. For a repeated Unicode property match, 5316 there are two extra values that define the required property, and c 5317 never has the UTF_LENGTH bit set. */ 5318 5319 if (repeat_max != repeat_min) 5320 { 5321 #ifdef MAYBE_UTF_MULTI 5322 if (utf && (c & UTF_LENGTH) != 0) 5323 { 5324 memcpy(code, utf_units, CU2BYTES(c & 7)); 5325 code += c & 7; 5326 } 5327 else 5328 #endif /* MAYBE_UTF_MULTI */ 5329 { 5330 *code++ = c; 5331 if (prop_type >= 0) 5332 { 5333 *code++ = prop_type; 5334 *code++ = prop_value; 5335 } 5336 } 5337 5338 /* Now set up the following opcode */ 5339 5340 if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else 5341 { 5342 repeat_max -= repeat_min; 5343 if (repeat_max == 1) 5344 { 5345 *code++ = OP_QUERY + repeat_type; 5346 } 5347 else 5348 { 5349 *code++ = OP_UPTO + repeat_type; 5350 PUT2INC(code, 0, repeat_max); 5351 } 5352 } 5353 } 5354 } 5355 5356 /* Fill in the character or character type for the final opcode. 
*/ 5357 5358 #ifdef MAYBE_UTF_MULTI 5359 if (utf && (c & UTF_LENGTH) != 0) 5360 { 5361 memcpy(code, utf_units, CU2BYTES(c & 7)); 5362 code += c & 7; 5363 } 5364 else 5365 #endif /* MAYBEW_UTF_MULTI */ 5366 { 5367 *code++ = c; 5368 if (prop_type >= 0) 5369 { 5370 *code++ = prop_type; 5371 *code++ = prop_value; 5372 } 5373 } 5374 } 5375 5376 /* If previous was a character class or a back reference, we put the repeat 5377 stuff after it, but just skip the item if the repeat was {0,0}. */ 5378 5379 else if (*previous == OP_CLASS || *previous == OP_NCLASS || 5380 #ifdef SUPPORT_WIDE_CHARS 5381 *previous == OP_XCLASS || 5382 #endif 5383 *previous == OP_REF || *previous == OP_REFI || 5384 *previous == OP_DNREF || *previous == OP_DNREFI) 5385 { 5386 if (repeat_max == 0) 5387 { 5388 code = previous; 5389 goto END_REPEAT; 5390 } 5391 5392 if (repeat_min == 0 && repeat_max == -1) 5393 *code++ = OP_CRSTAR + repeat_type; 5394 else if (repeat_min == 1 && repeat_max == -1) 5395 *code++ = OP_CRPLUS + repeat_type; 5396 else if (repeat_min == 0 && repeat_max == 1) 5397 *code++ = OP_CRQUERY + repeat_type; 5398 else 5399 { 5400 *code++ = OP_CRRANGE + repeat_type; 5401 PUT2INC(code, 0, repeat_min); 5402 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ 5403 PUT2INC(code, 0, repeat_max); 5404 } 5405 } 5406 5407 /* If previous was a bracket group, we may have to replicate it in certain 5408 cases. Note that at this point we can encounter only the "basic" bracket 5409 opcodes such as BRA and CBRA, as this is the place where they get converted 5410 into the more special varieties such as BRAPOS and SBRA. A test for >= 5411 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK, 5412 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND. 5413 Originally, PCRE did not allow repetition of assertions, but now it does, 5414 for Perl compatibility. 
*/ 5415 5416 else if (*previous >= OP_ASSERT && *previous <= OP_COND) 5417 { 5418 register int i; 5419 int len = (int)(code - previous); 5420 PCRE2_UCHAR *bralink = NULL; 5421 PCRE2_UCHAR *brazeroptr = NULL; 5422 5423 /* Repeating a DEFINE group (or any group where the condition is always 5424 FALSE and there is only one branch) is pointless, but Perl allows the 5425 syntax, so we just ignore the repeat. */ 5426 5427 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && 5428 previous[GET(previous, 1)] != OP_ALT) 5429 goto END_REPEAT; 5430 5431 /* There is no sense in actually repeating assertions. The only potential 5432 use of repetition is in cases when the assertion is optional. Therefore, 5433 if the minimum is greater than zero, just ignore the repeat. If the 5434 maximum is not zero or one, set it to 1. */ 5435 5436 if (*previous < OP_ONCE) /* Assertion */ 5437 { 5438 if (repeat_min > 0) goto END_REPEAT; 5439 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1; 5440 } 5441 5442 /* The case of a zero minimum is special because of the need to stick 5443 OP_BRAZERO in front of it, and because the group appears once in the 5444 data, whereas in other cases it appears the minimum number of times. For 5445 this reason, it is simplest to treat this case separately, as otherwise 5446 the code gets far too messy. There are several special subcases when the 5447 minimum is zero. */ 5448 5449 if (repeat_min == 0) 5450 { 5451 /* If the maximum is also zero, we used to just omit the group from the 5452 output altogether, like this: 5453 5454 ** if (repeat_max == 0) 5455 ** { 5456 ** code = previous; 5457 ** goto END_REPEAT; 5458 ** } 5459 5460 However, that fails when a group or a subgroup within it is referenced 5461 as a subroutine from elsewhere in the pattern, so now we stick in 5462 OP_SKIPZERO in front of it so that it is skipped on execution. As we 5463 don't have a list of which groups are referenced, we cannot do this 5464 selectively. 
5465 5466 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO 5467 and do no more at this point. */ 5468 5469 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ 5470 { 5471 memmove(previous + 1, previous, CU2BYTES(len)); 5472 code++; 5473 if (repeat_max == 0) 5474 { 5475 *previous++ = OP_SKIPZERO; 5476 goto END_REPEAT; 5477 } 5478 brazeroptr = previous; /* Save for possessive optimizing */ 5479 *previous++ = OP_BRAZERO + repeat_type; 5480 } 5481 5482 /* If the maximum is greater than 1 and limited, we have to replicate 5483 in a nested fashion, sticking OP_BRAZERO before each set of brackets. 5484 The first one has to be handled carefully because it's the original 5485 copy, which has to be moved up. The remainder can be handled by code 5486 that is common with the non-zero minimum case below. We have to 5487 adjust the value or repeat_max, since one less copy is required. */ 5488 5489 else 5490 { 5491 int offset; 5492 memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); 5493 code += 2 + LINK_SIZE; 5494 *previous++ = OP_BRAZERO + repeat_type; 5495 *previous++ = OP_BRA; 5496 5497 /* We chain together the bracket offset fields that have to be 5498 filled in later when the ends of the brackets are reached. */ 5499 5500 offset = (bralink == NULL)? 0 : (int)(previous - bralink); 5501 bralink = previous; 5502 PUTINC(previous, 0, offset); 5503 } 5504 5505 repeat_max--; 5506 } 5507 5508 /* If the minimum is greater than zero, replicate the group as many 5509 times as necessary, and adjust the maximum to the number of subsequent 5510 copies that we need. */ 5511 5512 else 5513 { 5514 if (repeat_min > 1) 5515 { 5516 /* In the pre-compile phase, we don't actually do the replication. We 5517 just adjust the length as if we had. Do some paranoid checks for 5518 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit 5519 integer type when available, otherwise double. 
*/ 5520 5521 if (lengthptr != NULL) 5522 { 5523 size_t delta = (repeat_min - 1)*length_prevgroup; 5524 if ((INT64_OR_DOUBLE)(repeat_min - 1)* 5525 (INT64_OR_DOUBLE)length_prevgroup > 5526 (INT64_OR_DOUBLE)INT_MAX || 5527 OFLOW_MAX - *lengthptr < delta) 5528 { 5529 *errorcodeptr = ERR20; 5530 goto FAILED; 5531 } 5532 *lengthptr += delta; 5533 } 5534 5535 /* This is compiling for real. If there is a set first byte for 5536 the group, and we have not yet set a "required byte", set it. */ 5537 5538 else 5539 { 5540 if (groupsetfirstcu && reqcuflags < 0) 5541 { 5542 reqcu = firstcu; 5543 reqcuflags = firstcuflags; 5544 } 5545 for (i = 1; i < repeat_min; i++) 5546 { 5547 memcpy(code, previous, CU2BYTES(len)); 5548 code += len; 5549 } 5550 } 5551 } 5552 5553 if (repeat_max > 0) repeat_max -= repeat_min; 5554 } 5555 5556 /* This code is common to both the zero and non-zero minimum cases. If 5557 the maximum is limited, it replicates the group in a nested fashion, 5558 remembering the bracket starts on a stack. In the case of a zero minimum, 5559 the first one was set up above. In all cases the repeat_max now specifies 5560 the number of additional copies needed. Again, we must remember to 5561 replicate entries on the forward reference list. */ 5562 5563 if (repeat_max >= 0) 5564 { 5565 /* In the pre-compile phase, we don't actually do the replication. We 5566 just adjust the length as if we had. For each repetition we must add 1 5567 to the length for BRAZERO and for all but the last repetition we must 5568 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some 5569 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is 5570 a 64-bit integer type when available, otherwise double. 
*/ 5571 5572 if (lengthptr != NULL && repeat_max > 0) 5573 { 5574 size_t delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - 5575 2 - 2*LINK_SIZE; /* Last one doesn't nest */ 5576 if ((INT64_OR_DOUBLE)repeat_max * 5577 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) 5578 > (INT64_OR_DOUBLE)INT_MAX || 5579 OFLOW_MAX - *lengthptr < delta) 5580 { 5581 *errorcodeptr = ERR20; 5582 goto FAILED; 5583 } 5584 *lengthptr += delta; 5585 } 5586 5587 /* This is compiling for real */ 5588 5589 else for (i = repeat_max - 1; i >= 0; i--) 5590 { 5591 *code++ = OP_BRAZERO + repeat_type; 5592 5593 /* All but the final copy start a new nesting, maintaining the 5594 chain of brackets outstanding. */ 5595 5596 if (i != 0) 5597 { 5598 int offset; 5599 *code++ = OP_BRA; 5600 offset = (bralink == NULL)? 0 : (int)(code - bralink); 5601 bralink = code; 5602 PUTINC(code, 0, offset); 5603 } 5604 5605 memcpy(code, previous, CU2BYTES(len)); 5606 code += len; 5607 } 5608 5609 /* Now chain through the pending brackets, and fill in their length 5610 fields (which are holding the chain links pro tem). */ 5611 5612 while (bralink != NULL) 5613 { 5614 int oldlinkoffset; 5615 int offset = (int)(code - bralink + 1); 5616 PCRE2_UCHAR *bra = code - offset; 5617 oldlinkoffset = GET(bra, 1); 5618 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; 5619 *code++ = OP_KET; 5620 PUTINC(code, 0, offset); 5621 PUT(bra, 1, offset); 5622 } 5623 } 5624 5625 /* If the maximum is unlimited, set a repeater in the final copy. For 5626 ONCE brackets, that's all we need to do. However, possessively repeated 5627 ONCE brackets can be converted into non-capturing brackets, as the 5628 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to 5629 deal with possessive ONCEs specially. 5630 5631 Otherwise, when we are doing the actual compile phase, check to see 5632 whether this group is one that could match an empty string. 
If so, 5633 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so 5634 that runtime checking can be done. [This check is also applied to ONCE 5635 groups at runtime, but in a different way.] 5636 5637 Then, if the quantifier was possessive and the bracket is not a 5638 conditional, we convert the BRA code to the POS form, and the KET code to 5639 KETRPOS. (It turns out to be convenient at runtime to detect this kind of 5640 subpattern at both the start and at the end.) The use of special opcodes 5641 makes it possible to reduce greatly the stack usage in pcre2_match(). If 5642 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO. 5643 5644 Then, if the minimum number of matches is 1 or 0, cancel the possessive 5645 flag so that the default action below, of wrapping everything inside 5646 atomic brackets, does not happen. When the minimum is greater than 1, 5647 there will be earlier copies of the group, and so we still have to wrap 5648 the whole thing. */ 5649 5650 else 5651 { 5652 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE; 5653 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1); 5654 5655 /* Convert possessive ONCE brackets to non-capturing */ 5656 5657 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) && 5658 possessive_quantifier) *bracode = OP_BRA; 5659 5660 /* For non-possessive ONCE brackets, all we need to do is to 5661 set the KET. */ 5662 5663 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC) 5664 *ketcode = OP_KETRMAX + repeat_type; 5665 5666 /* Handle non-ONCE brackets and possessive ONCEs (which have been 5667 converted to non-capturing above). */ 5668 5669 else 5670 { 5671 /* In the compile phase, check whether the group could match an empty 5672 string. 
*/ 5673 5674 if (lengthptr == NULL) 5675 { 5676 PCRE2_UCHAR *scode = bracode; 5677 do 5678 { 5679 int count = 0; 5680 int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE, 5681 NULL, &count); 5682 if (rc < 0) 5683 { 5684 *errorcodeptr = ERR86; 5685 goto FAILED; 5686 } 5687 if (rc > 0) 5688 { 5689 *bracode += OP_SBRA - OP_BRA; 5690 break; 5691 } 5692 scode += GET(scode, 1); 5693 } 5694 while (*scode == OP_ALT); 5695 5696 /* A conditional group with only one branch has an implicit empty 5697 alternative branch. */ 5698 5699 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) 5700 *bracode = OP_SCOND; 5701 } 5702 5703 /* Handle possessive quantifiers. */ 5704 5705 if (possessive_quantifier) 5706 { 5707 /* For COND brackets, we wrap the whole thing in a possessively 5708 repeated non-capturing bracket, because we have not invented POS 5709 versions of the COND opcodes. */ 5710 5711 if (*bracode == OP_COND || *bracode == OP_SCOND) 5712 { 5713 int nlen = (int)(code - bracode); 5714 memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); 5715 code += 1 + LINK_SIZE; 5716 nlen += 1 + LINK_SIZE; 5717 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS; 5718 *code++ = OP_KETRPOS; 5719 PUTINC(code, 0, nlen); 5720 PUT(bracode, 1, nlen); 5721 } 5722 5723 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ 5724 5725 else 5726 { 5727 *bracode += 1; /* Switch to xxxPOS opcodes */ 5728 *ketcode = OP_KETRPOS; 5729 } 5730 5731 /* If the minimum is zero, mark it as possessive, then unset the 5732 possessive flag when the minimum is 0 or 1. */ 5733 5734 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; 5735 if (repeat_min < 2) possessive_quantifier = FALSE; 5736 } 5737 5738 /* Non-possessive quantifier */ 5739 5740 else *ketcode = OP_KETRMAX + repeat_type; 5741 } 5742 } 5743 } 5744 5745 /* If previous is OP_FAIL, it was generated by an empty class [] 5746 (PCRE2_ALLOW_EMPTY_CLASS is set). 
The other ways in which OP_FAIL can be 5747 generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a 5748 "nothing to repeat" error above. We can just ignore the repeat in empty 5749 class case. */ 5750 5751 else if (*previous == OP_FAIL) goto END_REPEAT; 5752 5753 /* Else there's some kind of shambles */ 5754 5755 else 5756 { 5757 *errorcodeptr = ERR10; 5758 goto FAILED; 5759 } 5760 5761 /* If the character following a repeat is '+', possessive_quantifier is 5762 TRUE. For some opcodes, there are special alternative opcodes for this 5763 case. For anything else, we wrap the entire repeated item inside OP_ONCE 5764 brackets. Logically, the '+' notation is just syntactic sugar, taken from 5765 Sun's Java package, but the special opcodes can optimize it. 5766 5767 Some (but not all) possessively repeated subpatterns have already been 5768 completely handled in the code just above. For them, possessive_quantifier 5769 is always FALSE at this stage. Note that the repeated item starts at 5770 tempcode, not at previous, which might be the first part of a string whose 5771 (former) last char we repeated. */ 5772 5773 if (possessive_quantifier) 5774 { 5775 int len; 5776 5777 /* Possessifying an EXACT quantifier has no effect, so we can ignore it. 5778 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, 5779 {5,}, or {5,10}). We skip over an EXACT item; if the length of what 5780 remains is greater than zero, there's a further opcode that can be 5781 handled. If not, do nothing, leaving the EXACT alone. */ 5782 5783 switch(*tempcode) 5784 { 5785 case OP_TYPEEXACT: 5786 tempcode += PRIV(OP_lengths)[*tempcode] + 5787 ((tempcode[1 + IMM2_SIZE] == OP_PROP 5788 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 5789 break; 5790 5791 /* CHAR opcodes are used for exacts whose count is 1. 
*/ 5792 5793 case OP_CHAR: 5794 case OP_CHARI: 5795 case OP_NOT: 5796 case OP_NOTI: 5797 case OP_EXACT: 5798 case OP_EXACTI: 5799 case OP_NOTEXACT: 5800 case OP_NOTEXACTI: 5801 tempcode += PRIV(OP_lengths)[*tempcode]; 5802 #ifdef SUPPORT_UNICODE 5803 if (utf && HAS_EXTRALEN(tempcode[-1])) 5804 tempcode += GET_EXTRALEN(tempcode[-1]); 5805 #endif 5806 break; 5807 5808 /* For the class opcodes, the repeat operator appears at the end; 5809 adjust tempcode to point to it. */ 5810 5811 case OP_CLASS: 5812 case OP_NCLASS: 5813 tempcode += 1 + 32/sizeof(PCRE2_UCHAR); 5814 break; 5815 5816 #ifdef SUPPORT_WIDE_CHARS 5817 case OP_XCLASS: 5818 tempcode += GET(tempcode, 1); 5819 break; 5820 #endif 5821 } 5822 5823 /* If tempcode is equal to code (which points to the end of the repeated 5824 item), it means we have skipped an EXACT item but there is no following 5825 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In 5826 all other cases, tempcode will be pointing to the repeat opcode, and will 5827 be less than code, so the value of len will be greater than 0. */ 5828 5829 len = (int)(code - tempcode); 5830 if (len > 0) 5831 { 5832 unsigned int repcode = *tempcode; 5833 5834 /* There is a table for possessifying opcodes, all of which are less 5835 than OP_CALLOUT. A zero entry means there is no possessified version. 5836 */ 5837 5838 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) 5839 *tempcode = opcode_possessify[repcode]; 5840 5841 /* For opcode without a special possessified version, wrap the item in 5842 ONCE brackets. */ 5843 5844 else 5845 { 5846 memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); 5847 code += 1 + LINK_SIZE; 5848 len += 1 + LINK_SIZE; 5849 tempcode[0] = OP_ONCE; 5850 *code++ = OP_KET; 5851 PUTINC(code, 0, len); 5852 PUT(tempcode, 1, len); 5853 } 5854 } 5855 } 5856 5857 /* In all case we no longer have a previous item. 
We also set the 5858 "follows varying string" flag for subsequently encountered reqcus if 5859 it isn't already set and we have just passed a varying length item. */ 5860 5861 END_REPEAT: 5862 previous = NULL; 5863 cb->req_varyopt |= reqvary; 5864 break; 5865 5866 5867 /* ===================================================================*/ 5868 /* Start of nested parenthesized sub-expression, or lookahead or lookbehind 5869 or option setting or condition or all the other extended parenthesis forms. 5870 We must save the current high-water-mark for the forward reference list so 5871 that we know where they start for this group. However, because the list may 5872 be extended when there are very many forward references (usually the result 5873 of a replicated inner group), we must use an offset rather than an absolute 5874 address. Note that (?# comments are dealt with at the top of the loop; 5875 they do not get this far. */ 5876 5877 case CHAR_LEFT_PARENTHESIS: 5878 ptr++; 5879 5880 /* Deal with various "verbs" that can be introduced by '*'. */ 5881 5882 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':' 5883 || (MAX_255(ptr[1]) && ((cb->ctypes[ptr[1]] & ctype_letter) != 0)))) 5884 { 5885 int i, namelen; 5886 int arglen = 0; 5887 const char *vn = verbnames; 5888 PCRE2_SPTR name = ptr + 1; 5889 PCRE2_SPTR arg = NULL; 5890 previous = NULL; 5891 ptr++; 5892 5893 /* Increment ptr, set namelen, check length */ 5894 5895 READ_NAME(ctype_letter, ERR60, *errorcodeptr); 5896 5897 /* It appears that Perl allows any characters whatsoever, other than 5898 a closing parenthesis, to appear in arguments, so we no longer insist on 5899 letters, digits, and underscores. Perl does not, however, do any 5900 interpretation within arguments, and has no means of including a closing 5901 parenthesis. PCRE supports escape processing but only when it is 5902 requested by an option. Note that check_escape() will not return values 5903 greater than the code unit maximum when not in UTF mode. 
*/ 5904 5905 if (*ptr == CHAR_COLON) 5906 { 5907 arg = ++ptr; 5908 5909 if ((options & PCRE2_ALT_VERBNAMES) == 0) 5910 { 5911 arglen = 0; 5912 while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) 5913 { 5914 ptr++; /* Check length as we go */ 5915 arglen++; /* along, to avoid the */ 5916 if ((unsigned int)arglen > MAX_MARK) /* possibility of overflow. */ 5917 { 5918 *errorcodeptr = ERR76; 5919 goto FAILED; 5920 } 5921 } 5922 } 5923 else 5924 { 5925 /* The length check is in process_verb_names() */ 5926 arglen = process_verb_name(&ptr, NULL, errorcodeptr, options, 5927 utf, cb); 5928 if (arglen < 0) goto FAILED; 5929 } 5930 } 5931 5932 if (*ptr != CHAR_RIGHT_PARENTHESIS) 5933 { 5934 *errorcodeptr = ERR60; 5935 goto FAILED; 5936 } 5937 5938 /* Scan the table of verb names */ 5939 5940 for (i = 0; i < verbcount; i++) 5941 { 5942 if (namelen == verbs[i].len && 5943 PRIV(strncmp_c8)(name, vn, namelen) == 0) 5944 { 5945 int setverb; 5946 5947 /* Check for open captures before ACCEPT and convert it to 5948 ASSERT_ACCEPT if in an assertion. */ 5949 5950 if (verbs[i].op == OP_ACCEPT) 5951 { 5952 open_capitem *oc; 5953 if (arglen != 0) 5954 { 5955 *errorcodeptr = ERR59; 5956 goto FAILED; 5957 } 5958 cb->had_accept = TRUE; 5959 5960 /* In the first pass, just accumulate the length required; 5961 otherwise hitting (*ACCEPT) inside many nested parentheses can 5962 cause workspace overflow. */ 5963 5964 for (oc = cb->open_caps; oc != NULL; oc = oc->next) 5965 { 5966 if (lengthptr != NULL) 5967 { 5968 *lengthptr += CU2BYTES(1) + IMM2_SIZE; 5969 } 5970 else 5971 { 5972 *code++ = OP_CLOSE; 5973 PUT2INC(code, 0, oc->number); 5974 } 5975 } 5976 setverb = *code++ = 5977 (cb->assert_depth > 0)? 
OP_ASSERT_ACCEPT : OP_ACCEPT; 5978 5979 /* Do not set firstcu after *ACCEPT */ 5980 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 5981 } 5982 5983 /* Handle other cases with/without an argument */ 5984 5985 else if (arglen == 0) /* There is no argument */ 5986 { 5987 if (verbs[i].op < 0) /* Argument is mandatory */ 5988 { 5989 *errorcodeptr = ERR66; 5990 goto FAILED; 5991 } 5992 setverb = *code++ = verbs[i].op; 5993 } 5994 5995 else /* An argument is present */ 5996 { 5997 if (verbs[i].op_arg < 0) /* Argument is forbidden */ 5998 { 5999 *errorcodeptr = ERR59; 6000 goto FAILED; 6001 } 6002 setverb = *code++ = verbs[i].op_arg; 6003 6004 /* Arguments can be very long, especially in 16- and 32-bit modes, 6005 and can overflow the workspace in the first pass. Instead of 6006 putting the argument into memory, we just update the length counter 6007 and set up an empty argument. */ 6008 6009 if (lengthptr != NULL) 6010 { 6011 *lengthptr += arglen; 6012 *code++ = 0; 6013 } 6014 else 6015 { 6016 *code++ = arglen; 6017 if ((options & PCRE2_ALT_VERBNAMES) != 0) 6018 { 6019 PCRE2_UCHAR *memcode = code; /* code is "register" */ 6020 (void)process_verb_name(&arg, &memcode, errorcodeptr, options, 6021 utf, cb); 6022 code = memcode; 6023 } 6024 else /* No argument processing */ 6025 { 6026 memcpy(code, arg, CU2BYTES(arglen)); 6027 code += arglen; 6028 } 6029 } 6030 6031 *code++ = 0; 6032 } 6033 6034 switch (setverb) 6035 { 6036 case OP_THEN: 6037 case OP_THEN_ARG: 6038 cb->external_flags |= PCRE2_HASTHEN; 6039 break; 6040 6041 case OP_PRUNE: 6042 case OP_PRUNE_ARG: 6043 case OP_SKIP: 6044 case OP_SKIP_ARG: 6045 cb->had_pruneorskip = TRUE; 6046 break; 6047 } 6048 6049 break; /* Found verb, exit loop */ 6050 } 6051 6052 vn += verbs[i].len + 1; 6053 } 6054 6055 if (i < verbcount) continue; /* Successfully handled a verb */ 6056 *errorcodeptr = ERR60; /* Verb not recognized */ 6057 goto FAILED; 6058 } 6059 6060 /* Initialization for "real" parentheses */ 6061 6062 newoptions 
= options; 6063 skipunits = 0; 6064 bravalue = OP_CBRA; 6065 reset_bracount = FALSE; 6066 6067 /* Deal with the extended parentheses; all are introduced by '?', and the 6068 appearance of any of them means that this is not a capturing group. */ 6069 6070 if (*ptr == CHAR_QUESTION_MARK) 6071 { 6072 int i, count; 6073 int namelen; /* Must be signed */ 6074 uint32_t index; 6075 uint32_t set, unset, *optset; 6076 named_group *ng; 6077 PCRE2_SPTR name; 6078 PCRE2_UCHAR *slot; 6079 6080 switch (*(++ptr)) 6081 { 6082 /* ------------------------------------------------------------ */ 6083 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ 6084 reset_bracount = TRUE; 6085 /* Fall through */ 6086 6087 /* ------------------------------------------------------------ */ 6088 case CHAR_COLON: /* Non-capturing bracket */ 6089 bravalue = OP_BRA; 6090 ptr++; 6091 break; 6092 6093 /* ------------------------------------------------------------ */ 6094 case CHAR_LEFT_PARENTHESIS: 6095 bravalue = OP_COND; /* Conditional group */ 6096 tempptr = ptr; 6097 6098 /* A condition can be an assertion, a number (referring to a numbered 6099 group's having been set), a name (referring to a named group), or 'R', 6100 referring to recursion. R<digits> and R&name are also permitted for 6101 recursion tests. 6102 6103 There are ways of testing a named group: (?(name)) is used by Python; 6104 Perl 5.10 onwards uses (?(<name>) or (?('name')). 6105 6106 There is one unfortunate ambiguity, caused by history. 'R' can be the 6107 recursive thing or the name 'R' (and similarly for 'R' followed by 6108 digits). We look for a name first; if not found, we try the other case. 6109 6110 For compatibility with auto-callouts, we allow a callout to be 6111 specified before a condition that is an assertion. First, check for the 6112 syntax of a callout; if found, adjust the temporary pointer that is 6113 used to check for an assertion condition. That's all that is needed! 
*/ 6114 6115 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C) 6116 { 6117 if (IS_DIGIT(ptr[3]) || ptr[3] == CHAR_RIGHT_PARENTHESIS) 6118 { 6119 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break; 6120 if (ptr[i] == CHAR_RIGHT_PARENTHESIS) 6121 tempptr += i + 1; 6122 } 6123 else 6124 { 6125 uint32_t delimiter = 0; 6126 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) 6127 { 6128 if (ptr[3] == PRIV(callout_start_delims)[i]) 6129 { 6130 delimiter = PRIV(callout_end_delims)[i]; 6131 break; 6132 } 6133 } 6134 if (delimiter != 0) 6135 { 6136 for (i = 4; ptr + i < cb->end_pattern; i++) 6137 { 6138 if (ptr[i] == delimiter) 6139 { 6140 if (ptr[i+1] == delimiter) i++; 6141 else 6142 { 6143 if (ptr[i+1] == CHAR_RIGHT_PARENTHESIS) tempptr += i + 2; 6144 break; 6145 } 6146 } 6147 } 6148 } 6149 } 6150 6151 /* tempptr should now be pointing to the opening parenthesis of the 6152 assertion condition. */ 6153 6154 if (*tempptr != CHAR_LEFT_PARENTHESIS) 6155 { 6156 *errorcodeptr = ERR28; 6157 goto FAILED; 6158 } 6159 } 6160 6161 /* For conditions that are assertions, check the syntax, and then exit 6162 the switch. This will take control down to where bracketed groups 6163 are processed. The assertion will be handled as part of the group, 6164 but we need to identify this case because the conditional assertion may 6165 not be quantifier. */ 6166 6167 if (tempptr[1] == CHAR_QUESTION_MARK && 6168 (tempptr[2] == CHAR_EQUALS_SIGN || 6169 tempptr[2] == CHAR_EXCLAMATION_MARK || 6170 (tempptr[2] == CHAR_LESS_THAN_SIGN && 6171 (tempptr[3] == CHAR_EQUALS_SIGN || 6172 tempptr[3] == CHAR_EXCLAMATION_MARK)))) 6173 { 6174 cb->iscondassert = TRUE; 6175 break; 6176 } 6177 6178 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all 6179 need to skip at least 1+IMM2_SIZE bytes at the start of the group. 
*/ 6180 6181 code[1+LINK_SIZE] = OP_CREF; 6182 skipunits = 1+IMM2_SIZE; 6183 refsign = -1; /* => not a number */ 6184 namelen = -1; /* => not a name; must set to avoid warning */ 6185 name = NULL; /* Always set to avoid warning */ 6186 recno = 0; /* Always set to avoid warning */ 6187 6188 /* Point at character after (?( */ 6189 6190 ptr++; 6191 6192 /* Check for (?(VERSION[>]=n.m), which is a facility whereby indirect 6193 users of PCRE2 via an application can discover which release of PCRE2 6194 is being used. */ 6195 6196 if (PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && 6197 ptr[7] != CHAR_RIGHT_PARENTHESIS) 6198 { 6199 BOOL ge = FALSE; 6200 int major = 0; 6201 int minor = 0; 6202 6203 ptr += 7; 6204 if (*ptr == CHAR_GREATER_THAN_SIGN) 6205 { 6206 ge = TRUE; 6207 ptr++; 6208 } 6209 6210 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT 6211 references its argument twice. */ 6212 6213 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) 6214 { 6215 *errorcodeptr = ERR79; 6216 goto FAILED; 6217 } 6218 6219 while (IS_DIGIT(*ptr)) major = major * 10 + *ptr++ - '0'; 6220 if (*ptr == CHAR_DOT) 6221 { 6222 ptr++; 6223 while (IS_DIGIT(*ptr)) minor = minor * 10 + *ptr++ - '0'; 6224 if (minor < 10) minor *= 10; 6225 } 6226 6227 if (*ptr != CHAR_RIGHT_PARENTHESIS || minor > 99) 6228 { 6229 *errorcodeptr = ERR79; 6230 goto FAILED; 6231 } 6232 6233 if (ge) 6234 code[1+LINK_SIZE] = ((PCRE2_MAJOR > major) || 6235 (PCRE2_MAJOR == major && PCRE2_MINOR >= minor))? 6236 OP_TRUE : OP_FALSE; 6237 else 6238 code[1+LINK_SIZE] = (PCRE2_MAJOR == major && PCRE2_MINOR == minor)? 6239 OP_TRUE : OP_FALSE; 6240 6241 ptr++; 6242 skipunits = 1; 6243 break; /* End of condition processing */ 6244 } 6245 6246 /* Check for a test for recursion in a named group. 
*/ 6247 6248 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND) 6249 { 6250 terminator = -1; 6251 ptr += 2; 6252 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ 6253 } 6254 6255 /* Check for a test for a named group's having been set, using the Perl 6256 syntax (?(<name>) or (?('name'), and also allow for the original PCRE 6257 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */ 6258 6259 else if (*ptr == CHAR_LESS_THAN_SIGN) 6260 { 6261 terminator = CHAR_GREATER_THAN_SIGN; 6262 ptr++; 6263 } 6264 else if (*ptr == CHAR_APOSTROPHE) 6265 { 6266 terminator = CHAR_APOSTROPHE; 6267 ptr++; 6268 } 6269 else 6270 { 6271 terminator = CHAR_NULL; 6272 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++; 6273 else if (IS_DIGIT(*ptr)) refsign = 0; 6274 } 6275 6276 /* Handle a number */ 6277 6278 if (refsign >= 0) 6279 { 6280 while (IS_DIGIT(*ptr)) 6281 { 6282 if (recno > INT_MAX / 10 - 1) /* Integer overflow */ 6283 { 6284 while (IS_DIGIT(*ptr)) ptr++; 6285 *errorcodeptr = ERR61; 6286 goto FAILED; 6287 } 6288 recno = recno * 10 + (int)(*ptr - CHAR_0); 6289 ptr++; 6290 } 6291 } 6292 6293 /* Otherwise we expect to read a name; anything else is an error. When 6294 the referenced name is one of a number of duplicates, a different 6295 opcode is used and it needs more memory. Unfortunately we cannot tell 6296 whether this is the case in the first pass, so we have to allow for 6297 more memory always. In the second pass, the additional to skipunits 6298 happens later. 
*/ 6299 6300 else 6301 { 6302 if (IS_DIGIT(*ptr)) 6303 { 6304 *errorcodeptr = ERR44; /* Group name must start with non-digit */ 6305 goto FAILED; 6306 } 6307 if (!MAX_255(*ptr) || (cb->ctypes[*ptr] & ctype_word) == 0) 6308 { 6309 *errorcodeptr = ERR28; /* Assertion expected */ 6310 goto FAILED; 6311 } 6312 name = ptr; 6313 /* Increment ptr, set namelen, check length */ 6314 READ_NAME(ctype_word, ERR48, *errorcodeptr); 6315 if (lengthptr != NULL) skipunits += IMM2_SIZE; 6316 } 6317 6318 /* Check the terminator */ 6319 6320 if ((terminator > 0 && *ptr++ != (PCRE2_UCHAR)terminator) || 6321 *ptr++ != CHAR_RIGHT_PARENTHESIS) 6322 { 6323 ptr--; /* Error offset */ 6324 *errorcodeptr = ERR26; /* Malformed number or name */ 6325 goto FAILED; 6326 } 6327 6328 /* Do no further checking in the pre-compile phase. */ 6329 6330 if (lengthptr != NULL) break; 6331 6332 /* In the real compile we do the work of looking for the actual 6333 reference. If refsign is not negative, it means we have a number in 6334 recno. */ 6335 6336 if (refsign >= 0) 6337 { 6338 if (recno <= 0) 6339 { 6340 *errorcodeptr = ERR35; 6341 goto FAILED; 6342 } 6343 if (refsign != 0) recno = (refsign == CHAR_MINUS)? 6344 (cb->bracount + 1) - recno : recno + cb->bracount; 6345 if (recno <= 0 || (uint32_t)recno > cb->final_bracount) 6346 { 6347 *errorcodeptr = ERR15; 6348 goto FAILED; 6349 } 6350 PUT2(code, 2+LINK_SIZE, recno); 6351 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; 6352 break; 6353 } 6354 6355 /* Otherwise look for the name. */ 6356 6357 slot = cb->name_table; 6358 for (i = 0; i < cb->names_found; i++) 6359 { 6360 if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0) break; 6361 slot += cb->name_entry_size; 6362 } 6363 6364 /* Found the named subpattern. If the name is duplicated, add one to 6365 the opcode to change CREF/RREF into DNCREF/DNRREF and insert 6366 appropriate data values. Otherwise, just insert the unique subpattern 6367 number. 
*/ 6368 6369 if (i < cb->names_found) 6370 { 6371 int offset = i; /* Offset of first name found */ 6372 6373 count = 0; 6374 for (;;) 6375 { 6376 recno = GET2(slot, 0); /* Number for last found */ 6377 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; 6378 count++; 6379 if (++i >= cb->names_found) break; 6380 slot += cb->name_entry_size; 6381 if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) != 0 || 6382 (slot+IMM2_SIZE)[namelen] != 0) break; 6383 } 6384 6385 if (count > 1) 6386 { 6387 PUT2(code, 2+LINK_SIZE, offset); 6388 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); 6389 skipunits += IMM2_SIZE; 6390 code[1+LINK_SIZE]++; 6391 } 6392 else /* Not a duplicated name */ 6393 { 6394 PUT2(code, 2+LINK_SIZE, recno); 6395 } 6396 } 6397 6398 /* If terminator == CHAR_NULL it means that the name followed directly 6399 after the opening parenthesis [e.g. (?(abc)...] and in this case there 6400 are some further alternatives to try. For the cases where terminator != 6401 CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ] 6402 we have now checked all the possibilities, so give an error. */ 6403 6404 else if (terminator != CHAR_NULL) 6405 { 6406 *errorcodeptr = ERR15; 6407 goto FAILED; 6408 } 6409 6410 /* Check for (?(R) for recursion. Allow digits after R to specify a 6411 specific group number. */ 6412 6413 else if (*name == CHAR_R) 6414 { 6415 recno = 0; 6416 for (i = 1; i < namelen; i++) 6417 { 6418 if (!IS_DIGIT(name[i])) 6419 { 6420 *errorcodeptr = ERR15; /* Non-existent subpattern */ 6421 goto FAILED; 6422 } 6423 if (recno > INT_MAX / 10 - 1) /* Integer overflow */ 6424 { 6425 *errorcodeptr = ERR61; 6426 goto FAILED; 6427 } 6428 recno = recno * 10 + name[i] - CHAR_0; 6429 } 6430 if (recno == 0) recno = RREF_ANY; 6431 code[1+LINK_SIZE] = OP_RREF; /* Change test type */ 6432 PUT2(code, 2+LINK_SIZE, recno); 6433 } 6434 6435 /* Similarly, check for the (?(DEFINE) "condition", which is always 6436 false. 
During compilation we set OP_DEFINE to distinguish this from 6437 other OP_FALSE conditions so that it can be checked for having only one 6438 branch, but after that the opcode is changed to OP_FALSE. */ 6439 6440 else if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) 6441 { 6442 code[1+LINK_SIZE] = OP_DEFINE; 6443 skipunits = 1; 6444 } 6445 6446 /* Reference to an unidentified subpattern. */ 6447 6448 else 6449 { 6450 *errorcodeptr = ERR15; 6451 goto FAILED; 6452 } 6453 break; 6454 6455 6456 /* ------------------------------------------------------------ */ 6457 case CHAR_EQUALS_SIGN: /* Positive lookahead */ 6458 bravalue = OP_ASSERT; 6459 cb->assert_depth += 1; 6460 ptr++; 6461 break; 6462 6463 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird 6464 thing to do, but Perl allows all assertions to be quantified, and when 6465 they contain capturing parentheses there may be a potential use for 6466 this feature. Not that that applies to a quantified (?!) but we allow 6467 it for uniformity. 
*/ 6468 6469 /* ------------------------------------------------------------ */ 6470 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */ 6471 ptr++; 6472 if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK && 6473 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK && 6474 (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2))) 6475 { 6476 *code++ = OP_FAIL; 6477 previous = NULL; 6478 continue; 6479 } 6480 bravalue = OP_ASSERT_NOT; 6481 cb->assert_depth += 1; 6482 break; 6483 6484 6485 /* ------------------------------------------------------------ */ 6486 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */ 6487 switch (ptr[1]) 6488 { 6489 case CHAR_EQUALS_SIGN: /* Positive lookbehind */ 6490 bravalue = OP_ASSERTBACK; 6491 cb->assert_depth += 1; 6492 ptr += 2; 6493 break; 6494 6495 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ 6496 bravalue = OP_ASSERTBACK_NOT; 6497 cb->assert_depth += 1; 6498 ptr += 2; 6499 break; 6500 6501 /* Must be a name definition - as the syntax was checked in the 6502 pre-pass, we can assume here that it is valid. Skip over the name 6503 and go to handle the numbered group. */ 6504 6505 default: 6506 while (*(++ptr) != CHAR_GREATER_THAN_SIGN); 6507 ptr++; 6508 goto NUMBERED_GROUP; 6509 } 6510 break; 6511 6512 6513 /* ------------------------------------------------------------ */ 6514 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */ 6515 bravalue = OP_ONCE; 6516 ptr++; 6517 break; 6518 6519 6520 /* ------------------------------------------------------------ */ 6521 case CHAR_C: /* Callout */ 6522 previous_callout = code; /* Save for later completion */ 6523 after_manual_callout = 1; /* Skip one item before completing */ 6524 ptr++; /* Character after (?C */ 6525 6526 /* A callout may have a string argument, delimited by one of a fixed 6527 number of characters, or an undelimited numerical argument, or no 6528 argument, which is the same as (?C0). Different opcodes are used for 6529 the two cases. 
*/ 6530 6531 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr)) 6532 { 6533 uint32_t delimiter = 0; 6534 6535 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) 6536 { 6537 if (*ptr == PRIV(callout_start_delims)[i]) 6538 { 6539 delimiter = PRIV(callout_end_delims)[i]; 6540 break; 6541 } 6542 } 6543 6544 if (delimiter == 0) 6545 { 6546 *errorcodeptr = ERR82; 6547 goto FAILED; 6548 } 6549 6550 /* During the pre-compile phase, we parse the string and update the 6551 length. There is no need to generate any code. (In fact, the string 6552 has already been parsed in the pre-pass that looks for named 6553 parentheses, but it does no harm to leave this code in.) */ 6554 6555 if (lengthptr != NULL) /* Only check the string */ 6556 { 6557 PCRE2_SPTR start = ptr; 6558 do 6559 { 6560 if (++ptr >= cb->end_pattern) 6561 { 6562 *errorcodeptr = ERR81; 6563 ptr = start; /* To give a more useful message */ 6564 goto FAILED; 6565 } 6566 if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2; 6567 } 6568 while (ptr[0] != delimiter); 6569 6570 /* Start points to the opening delimiter, ptr points to the 6571 closing delimiter. We must allow for including the delimiter and 6572 for the terminating zero. Any doubled delimiters within the string 6573 make this an overestimate, but it is not worth bothering about. */ 6574 6575 (*lengthptr) += (ptr - start) + 2 + (1 + 4*LINK_SIZE); 6576 } 6577 6578 /* In the real compile we can copy the string, knowing that it is 6579 syntactically OK. The starting delimiter is included so that the 6580 client can discover it if they want. We also pass the start offset to 6581 help a script language give better error messages. 
*/ 6582 6583 else 6584 { 6585 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); 6586 *callout_string++ = *ptr++; 6587 PUT(code, 1 + 3*LINK_SIZE, (int)(ptr - cb->start_pattern)); /* Start offset */ 6588 for(;;) 6589 { 6590 if (*ptr == delimiter) 6591 { 6592 if (ptr[1] == delimiter) ptr++; else break; 6593 } 6594 *callout_string++ = *ptr++; 6595 } 6596 *callout_string++ = CHAR_NULL; 6597 code[0] = OP_CALLOUT_STR; 6598 PUT(code, 1, (int)(ptr + 2 - cb->start_pattern)); /* Next offset */ 6599 PUT(code, 1 + LINK_SIZE, 0); /* Default length */ 6600 PUT(code, 1 + 2*LINK_SIZE, /* Compute size */ 6601 (int)(callout_string - code)); 6602 code = callout_string; 6603 } 6604 6605 /* Advance to what should be the closing parenthesis, which is 6606 checked below. */ 6607 6608 ptr++; 6609 } 6610 6611 /* Handle a callout with an optional numerical argument, which must be 6612 less than or equal to 255. A missing argument gives 0. */ 6613 6614 else 6615 { 6616 int n = 0; 6617 code[0] = OP_CALLOUT; /* Numerical callout */ 6618 while (IS_DIGIT(*ptr)) 6619 { 6620 n = n * 10 + *ptr++ - CHAR_0; 6621 if (n > 255) 6622 { 6623 *errorcodeptr = ERR38; 6624 goto FAILED; 6625 } 6626 } 6627 PUT(code, 1, (int)(ptr - cb->start_pattern + 1)); /* Next offset */ 6628 PUT(code, 1 + LINK_SIZE, 0); /* Default length */ 6629 code[1 + 2*LINK_SIZE] = n; /* Callout number */ 6630 code += PRIV(OP_lengths)[OP_CALLOUT]; 6631 } 6632 6633 /* Both formats must have a closing parenthesis */ 6634 6635 if (*ptr != CHAR_RIGHT_PARENTHESIS) 6636 { 6637 *errorcodeptr = ERR39; 6638 goto FAILED; 6639 } 6640 6641 /* Callouts cannot be quantified. 
*/ 6642 6643 previous = NULL; 6644 continue; 6645 6646 6647 /* ------------------------------------------------------------ */ 6648 case CHAR_P: /* Python-style named subpattern handling */ 6649 if (*(++ptr) == CHAR_EQUALS_SIGN || 6650 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */ 6651 { 6652 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN; 6653 terminator = CHAR_RIGHT_PARENTHESIS; 6654 goto NAMED_REF_OR_RECURSE; 6655 } 6656 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */ 6657 { 6658 *errorcodeptr = ERR41; 6659 goto FAILED; 6660 } 6661 /* Fall through to handle (?P< as (?< is handled */ 6662 6663 6664 /* ------------------------------------------------------------ */ 6665 case CHAR_APOSTROPHE: /* Define a name - note fall through above */ 6666 6667 /* The syntax was checked and the list of names was set up in the 6668 pre-pass, so there is nothing to be done now except to skip over the 6669 name. */ 6670 6671 terminator = (*ptr == CHAR_LESS_THAN_SIGN)? 6672 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; 6673 while (*(++ptr) != (unsigned int)terminator); 6674 ptr++; 6675 goto NUMBERED_GROUP; /* Set up numbered group */ 6676 6677 6678 /* ------------------------------------------------------------ */ 6679 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */ 6680 terminator = CHAR_RIGHT_PARENTHESIS; 6681 is_recurse = TRUE; 6682 /* Fall through */ 6683 6684 /* We come here from the Python syntax above that handles both 6685 references (?P=name) and recursion (?P>name), as well as falling 6686 through from the Perl recursion syntax (?&name). We also come here from 6687 the Perl \k<name> or \k'name' back reference syntax and the \k{name} 6688 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. 
*/ 6689 6690 NAMED_REF_OR_RECURSE: 6691 name = ++ptr; 6692 if (IS_DIGIT(*ptr)) 6693 { 6694 *errorcodeptr = ERR44; /* Group name must start with non-digit */ 6695 goto FAILED; 6696 } 6697 /* Increment ptr, set namelen, check length */ 6698 READ_NAME(ctype_word, ERR48, *errorcodeptr); 6699 6700 /* In the pre-compile phase, do a syntax check. */ 6701 6702 if (lengthptr != NULL) 6703 { 6704 if (namelen == 0) 6705 { 6706 *errorcodeptr = ERR62; 6707 goto FAILED; 6708 } 6709 if (*ptr != (PCRE2_UCHAR)terminator) 6710 { 6711 *errorcodeptr = ERR42; 6712 goto FAILED; 6713 } 6714 } 6715 6716 /* Scan the list of names generated in the pre-pass in order to get 6717 a number and whether or not this name is duplicated. */ 6718 6719 recno = 0; 6720 is_dupname = FALSE; 6721 ng = cb->named_groups; 6722 6723 for (i = 0; i < cb->names_found; i++, ng++) 6724 { 6725 if (namelen == ng->length && 6726 PRIV(strncmp)(name, ng->name, namelen) == 0) 6727 { 6728 open_capitem *oc; 6729 is_dupname = ng->isdup; 6730 recno = ng->number; 6731 6732 /* For a recursion, that's all that is needed. We can now go to the 6733 code that handles numerical recursion. */ 6734 6735 if (is_recurse) goto HANDLE_RECURSION; 6736 6737 /* For a back reference, update the back reference map and the 6738 maximum back reference. Then for each group we must check to see if 6739 it is recursive, that is, it is inside the group that it 6740 references. A flag is set so that the group can be made atomic. */ 6741 6742 cb->backref_map |= (recno < 32)? (1u << recno) : 1; 6743 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; 6744 6745 for (oc = cb->open_caps; oc != NULL; oc = oc->next) 6746 { 6747 if (oc->number == recno) 6748 { 6749 oc->flag = TRUE; 6750 break; 6751 } 6752 } 6753 } 6754 } 6755 6756 /* If the name was not found we have a bad reference. 
*/ 6757 6758 if (recno == 0) 6759 { 6760 *errorcodeptr = ERR15; 6761 goto FAILED; 6762 } 6763 6764 /* If a back reference name is not duplicated, we can handle it as a 6765 numerical reference. */ 6766 6767 if (!is_dupname) goto HANDLE_REFERENCE; 6768 6769 /* If a back reference name is duplicated, we generate a different 6770 opcode to a numerical back reference. In the second pass we must search 6771 for the index and count in the final name table. */ 6772 6773 count = 0; 6774 index = 0; 6775 6776 if (lengthptr == NULL) 6777 { 6778 slot = cb->name_table; 6779 for (i = 0; i < cb->names_found; i++) 6780 { 6781 if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0 && 6782 slot[IMM2_SIZE+namelen] == 0) 6783 { 6784 if (count == 0) index = i; 6785 count++; 6786 } 6787 slot += cb->name_entry_size; 6788 } 6789 6790 if (count == 0) 6791 { 6792 *errorcodeptr = ERR15; 6793 goto FAILED; 6794 } 6795 } 6796 6797 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 6798 previous = code; 6799 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; 6800 PUT2INC(code, 0, index); 6801 PUT2INC(code, 0, count); 6802 continue; /* End of back ref handling */ 6803 6804 6805 /* ------------------------------------------------------------ */ 6806 case CHAR_R: /* Recursion, same as (?0) */ 6807 recno = 0; 6808 if (*(++ptr) != CHAR_RIGHT_PARENTHESIS) 6809 { 6810 *errorcodeptr = ERR29; 6811 goto FAILED; 6812 } 6813 goto HANDLE_RECURSION; 6814 6815 6816 /* ------------------------------------------------------------ */ 6817 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */ 6818 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: 6819 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 6820 { 6821 terminator = CHAR_RIGHT_PARENTHESIS; 6822 6823 /* Come here from the \g<...> and \g'...' code (Oniguruma 6824 compatibility). However, the syntax has been checked to ensure that 6825 the ... 
are a (signed) number, so that neither ERR63 nor ERR29 will 6826 be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY 6827 ever be taken. */ 6828 6829 HANDLE_NUMERICAL_RECURSION: 6830 6831 if ((refsign = *ptr) == CHAR_PLUS) 6832 { 6833 ptr++; 6834 if (!IS_DIGIT(*ptr)) 6835 { 6836 *errorcodeptr = ERR63; 6837 goto FAILED; 6838 } 6839 } 6840 else if (refsign == CHAR_MINUS) 6841 { 6842 if (!IS_DIGIT(ptr[1])) 6843 goto OTHER_CHAR_AFTER_QUERY; 6844 ptr++; 6845 } 6846 6847 recno = 0; 6848 while (IS_DIGIT(*ptr)) 6849 { 6850 if (recno > INT_MAX / 10 - 1) /* Integer overflow */ 6851 { 6852 while (IS_DIGIT(*ptr)) ptr++; 6853 *errorcodeptr = ERR61; 6854 goto FAILED; 6855 } 6856 recno = recno * 10 + *ptr++ - CHAR_0; 6857 } 6858 6859 if (*ptr != (PCRE2_UCHAR)terminator) 6860 { 6861 *errorcodeptr = ERR29; 6862 goto FAILED; 6863 } 6864 6865 if (refsign == CHAR_MINUS) 6866 { 6867 if (recno == 0) 6868 { 6869 *errorcodeptr = ERR58; 6870 goto FAILED; 6871 } 6872 recno = (int)(cb->bracount + 1) - recno; 6873 if (recno <= 0) 6874 { 6875 *errorcodeptr = ERR15; 6876 goto FAILED; 6877 } 6878 } 6879 else if (refsign == CHAR_PLUS) 6880 { 6881 if (recno == 0) 6882 { 6883 *errorcodeptr = ERR58; 6884 goto FAILED; 6885 } 6886 recno += cb->bracount; 6887 } 6888 6889 if ((uint32_t)recno > cb->final_bracount) 6890 { 6891 *errorcodeptr = ERR15; 6892 goto FAILED; 6893 } 6894 6895 /* Come here from code above that handles a named recursion. 6896 We insert the number of the called group after OP_RECURSE. At the 6897 end of compiling the pattern is scanned and these numbers are 6898 replaced by offsets within the pattern. It is done like this to avoid 6899 problems with forward references and adjusting offsets when groups 6900 are duplicated and moved (as discovered in previous implementations). 6901 Note that a recursion does not have a set first character (relevant 6902 if it is repeated, because it will then be wrapped with ONCE 6903 brackets). 
*/ 6904 6905 HANDLE_RECURSION: 6906 previous = code; 6907 *code = OP_RECURSE; 6908 PUT(code, 1, recno); 6909 code += 1 + LINK_SIZE; 6910 groupsetfirstcu = FALSE; 6911 cb->had_recurse = TRUE; 6912 } 6913 6914 /* Can't determine a first byte now */ 6915 6916 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 6917 continue; 6918 6919 6920 /* ------------------------------------------------------------ */ 6921 default: /* Other characters: check option setting */ 6922 OTHER_CHAR_AFTER_QUERY: 6923 set = unset = 0; 6924 optset = &set; 6925 6926 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON) 6927 { 6928 switch (*ptr++) 6929 { 6930 case CHAR_MINUS: optset = &unset; break; 6931 6932 case CHAR_J: /* Record that it changed in the external options */ 6933 *optset |= PCRE2_DUPNAMES; 6934 cb->external_flags |= PCRE2_JCHANGED; 6935 break; 6936 6937 case CHAR_i: *optset |= PCRE2_CASELESS; break; 6938 case CHAR_m: *optset |= PCRE2_MULTILINE; break; 6939 case CHAR_s: *optset |= PCRE2_DOTALL; break; 6940 case CHAR_x: *optset |= PCRE2_EXTENDED; break; 6941 case CHAR_U: *optset |= PCRE2_UNGREEDY; break; 6942 6943 default: *errorcodeptr = ERR11; 6944 ptr--; /* Correct the offset */ 6945 goto FAILED; 6946 } 6947 } 6948 6949 /* Set up the changed option bits, but don't change anything yet. */ 6950 6951 newoptions = (options | set) & (~unset); 6952 6953 /* If the options ended with ')' this is not the start of a nested 6954 group with option changes, so the options change at this level. They 6955 must also be passed back for use in subsequent branches. Reset the 6956 greedy defaults and the case value for firstcu and reqcu. */ 6957 6958 if (*ptr == CHAR_RIGHT_PARENTHESIS) 6959 { 6960 *optionsptr = options = newoptions; 6961 greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0); 6962 greedy_non_default = greedy_default ^ 1; 6963 req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? 
REQ_CASELESS:0; 6964 previous = NULL; /* This item can't be repeated */ 6965 continue; /* It is complete */ 6966 } 6967 6968 /* If the options ended with ':' we are heading into a nested group 6969 with possible change of options. Such groups are non-capturing and are 6970 not assertions of any kind. All we need to do is skip over the ':'; 6971 the newoptions value is handled below. */ 6972 6973 bravalue = OP_BRA; 6974 ptr++; 6975 } /* End of switch for character following (? */ 6976 } /* End of (? handling */ 6977 6978 /* Opening parenthesis not followed by '*' or '?'. If PCRE2_NO_AUTO_CAPTURE 6979 is set, all unadorned brackets become non-capturing and behave like (?:...) 6980 brackets. */ 6981 6982 else if ((options & PCRE2_NO_AUTO_CAPTURE) != 0) 6983 { 6984 bravalue = OP_BRA; 6985 } 6986 6987 /* Else we have a capturing group. */ 6988 6989 else 6990 { 6991 NUMBERED_GROUP: 6992 cb->bracount += 1; 6993 PUT2(code, 1+LINK_SIZE, cb->bracount); 6994 skipunits = IMM2_SIZE; 6995 } 6996 6997 /* Process nested bracketed regex. First check for parentheses nested too 6998 deeply. */ 6999 7000 if ((cb->parens_depth += 1) > (int)(cb->cx->parens_nest_limit)) 7001 { 7002 *errorcodeptr = ERR19; 7003 goto FAILED; 7004 } 7005 7006 /* All assertions used not to be repeatable, but this was changed for Perl 7007 compatibility. All kinds can now be repeated except for assertions that are 7008 conditions (Perl also forbids these to be repeated). We copy code into a 7009 non-register variable (tempcode) in order to be able to pass its address 7010 because some compilers complain otherwise. At the start of a conditional 7011 group whose condition is an assertion, cb->iscondassert is set. We unset it 7012 here so as to allow assertions later in the group to be quantified. 
*/ 7013 7014 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT && 7015 cb->iscondassert) 7016 { 7017 previous = NULL; 7018 cb->iscondassert = FALSE; 7019 } 7020 else 7021 { 7022 previous = code; 7023 } 7024 7025 *code = bravalue; 7026 tempcode = code; 7027 tempreqvary = cb->req_varyopt; /* Save value before bracket */ 7028 tempbracount = cb->bracount; /* Save value before bracket */ 7029 length_prevgroup = 0; /* Initialize for pre-compile phase */ 7030 7031 if (!compile_regex( 7032 newoptions, /* The complete new option state */ 7033 &tempcode, /* Where to put code (updated) */ 7034 &ptr, /* Input pointer (updated) */ 7035 errorcodeptr, /* Where to put an error message */ 7036 (bravalue == OP_ASSERTBACK || 7037 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ 7038 reset_bracount, /* True if (?| group */ 7039 skipunits, /* Skip over bracket number */ 7040 cond_depth + 7041 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */ 7042 &subfirstcu, /* For possible first char */ 7043 &subfirstcuflags, 7044 &subreqcu, /* For possible last char */ 7045 &subreqcuflags, 7046 bcptr, /* Current branch chain */ 7047 cb, /* Compile data block */ 7048 (lengthptr == NULL)? NULL : /* Actual compile phase */ 7049 &length_prevgroup /* Pre-compile phase */ 7050 )) 7051 goto FAILED; 7052 7053 cb->parens_depth -= 1; 7054 7055 /* If this was an atomic group and there are no capturing groups within it, 7056 generate OP_ONCE_NC instead of OP_ONCE. */ 7057 7058 if (bravalue == OP_ONCE && cb->bracount <= tempbracount) 7059 *code = OP_ONCE_NC; 7060 7061 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) 7062 cb->assert_depth -= 1; 7063 7064 /* At the end of compiling, code is still pointing to the start of the 7065 group, while tempcode has been updated to point past the end of the group. 7066 The pattern pointer (ptr) is on the bracket. 
7067 7068 If this is a conditional bracket, check that there are no more than 7069 two branches in the group, or just one if it's a DEFINE group. We do this 7070 in the real compile phase, not in the pre-pass, where the whole group may 7071 not be available. */ 7072 7073 if (bravalue == OP_COND && lengthptr == NULL) 7074 { 7075 PCRE2_UCHAR *tc = code; 7076 int condcount = 0; 7077 7078 do { 7079 condcount++; 7080 tc += GET(tc,1); 7081 } 7082 while (*tc != OP_KET); 7083 7084 /* A DEFINE group is never obeyed inline (the "condition" is always 7085 false). It must have only one branch. Having checked this, change the 7086 opcode to OP_FALSE. */ 7087 7088 if (code[LINK_SIZE+1] == OP_DEFINE) 7089 { 7090 if (condcount > 1) 7091 { 7092 *errorcodeptr = ERR54; 7093 goto FAILED; 7094 } 7095 code[LINK_SIZE+1] = OP_FALSE; 7096 bravalue = OP_DEFINE; /* Just a flag to suppress char handling below */ 7097 } 7098 7099 /* A "normal" conditional group. If there is just one branch, we must not 7100 make use of its firstcu or reqcu, because this is equivalent to an 7101 empty second branch. */ 7102 7103 else 7104 { 7105 if (condcount > 2) 7106 { 7107 *errorcodeptr = ERR27; 7108 goto FAILED; 7109 } 7110 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE; 7111 } 7112 } 7113 7114 /* At the end of a group, it's an error if we hit end of pattern or 7115 any non-closing parenthesis. This check also happens in the pre-scan, 7116 so should not trigger here, but leave this code as an insurance. */ 7117 7118 if (*ptr != CHAR_RIGHT_PARENTHESIS) 7119 { 7120 *errorcodeptr = ERR14; 7121 goto FAILED; 7122 } 7123 7124 /* In the pre-compile phase, update the length by the length of the group, 7125 less the brackets at either end. 
Then reduce the compiled code to just a 7126 set of non-capturing brackets so that it doesn't use much memory if it is 7127 duplicated by a quantifier.*/ 7128 7129 if (lengthptr != NULL) 7130 { 7131 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) 7132 { 7133 *errorcodeptr = ERR20; 7134 goto FAILED; 7135 } 7136 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; 7137 code++; /* This already contains bravalue */ 7138 PUTINC(code, 0, 1 + LINK_SIZE); 7139 *code++ = OP_KET; 7140 PUTINC(code, 0, 1 + LINK_SIZE); 7141 break; /* No need to waste time with special character handling */ 7142 } 7143 7144 /* Otherwise update the main code pointer to the end of the group. */ 7145 7146 code = tempcode; 7147 7148 /* For a DEFINE group, required and first character settings are not 7149 relevant. */ 7150 7151 if (bravalue == OP_DEFINE) break; 7152 7153 /* Handle updating of the required and first characters for other types of 7154 group. Update for normal brackets of all kinds, and conditions with two 7155 branches (see code above). If the bracket is followed by a quantifier with 7156 zero repeat, we have to back off. Hence the definition of zeroreqcu and 7157 zerofirstcu outside the main loop so that they can be accessed for the 7158 back off. */ 7159 7160 zeroreqcu = reqcu; 7161 zeroreqcuflags = reqcuflags; 7162 zerofirstcu = firstcu; 7163 zerofirstcuflags = firstcuflags; 7164 groupsetfirstcu = FALSE; 7165 7166 if (bravalue >= OP_ONCE) 7167 { 7168 /* If we have not yet set a firstcu in this branch, take it from the 7169 subpattern, remembering that it was set here so that a repeat of more 7170 than one can replicate it as reqcu if necessary. If the subpattern has 7171 no firstcu, set "none" for the whole branch. In both cases, a zero 7172 repeat forces firstcu to "none". 
*/ 7173 7174 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET) 7175 { 7176 if (subfirstcuflags >= 0) 7177 { 7178 firstcu = subfirstcu; 7179 firstcuflags = subfirstcuflags; 7180 groupsetfirstcu = TRUE; 7181 } 7182 else firstcuflags = REQ_NONE; 7183 zerofirstcuflags = REQ_NONE; 7184 } 7185 7186 /* If firstcu was previously set, convert the subpattern's firstcu 7187 into reqcu if there wasn't one, using the vary flag that was in 7188 existence beforehand. */ 7189 7190 else if (subfirstcuflags >= 0 && subreqcuflags < 0) 7191 { 7192 subreqcu = subfirstcu; 7193 subreqcuflags = subfirstcuflags | tempreqvary; 7194 } 7195 7196 /* If the subpattern set a required byte (or set a first byte that isn't 7197 really the first byte - see above), set it. */ 7198 7199 if (subreqcuflags >= 0) 7200 { 7201 reqcu = subreqcu; 7202 reqcuflags = subreqcuflags; 7203 } 7204 } 7205 7206 /* For a forward assertion, we take the reqcu, if set. This can be 7207 helpful if the pattern that follows the assertion doesn't set a different 7208 char. For example, it's useful for /(?=abcde).+/. We can't set firstcu 7209 for an assertion, however because it leads to incorrect effect for patterns 7210 such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead 7211 of a firstcu. This is overcome by a scan at the end if there's no 7212 firstcu, looking for an asserted first char. */ 7213 7214 else if (bravalue == OP_ASSERT && subreqcuflags >= 0) 7215 { 7216 reqcu = subreqcu; 7217 reqcuflags = subreqcuflags; 7218 } 7219 break; /* End of processing '(' */ 7220 7221 7222 /* ===================================================================*/ 7223 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values 7224 are arranged to be the negation of the corresponding OP_values in the 7225 default case when PCRE2_UCP is not set. For the back references, the values 7226 are negative the reference number. 
Only back references and those types 7227 that consume a character may be repeated. We can test for values between 7228 ESC_b and ESC_Z for the latter; this may have to change if any new ones are 7229 ever created. 7230 7231 Note: \Q and \E are handled at the start of the character-processing loop, 7232 not here. */ 7233 7234 case CHAR_BACKSLASH: 7235 tempptr = ptr; 7236 escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr, 7237 options, FALSE, cb); 7238 if (*errorcodeptr != 0) goto FAILED; 7239 7240 if (escape == 0) /* The escape coded a single character */ 7241 c = ec; 7242 else 7243 { 7244 /* For metasequences that actually match a character, we disable the 7245 setting of a first character if it hasn't already been set. */ 7246 7247 if (firstcuflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z) 7248 firstcuflags = REQ_NONE; 7249 7250 /* Set values to reset to if this is followed by a zero repeat. */ 7251 7252 zerofirstcu = firstcu; 7253 zerofirstcuflags = firstcuflags; 7254 zeroreqcu = reqcu; 7255 zeroreqcuflags = reqcuflags; 7256 7257 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' 7258 is a subroutine call by number (Oniguruma syntax). In fact, the value 7259 ESC_g is returned only for these cases. So we don't need to check for < 7260 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is 7261 -n, and for the Perl syntax \g{name} the result is ESC_k (as 7262 that is a synonym for a named back reference). */ 7263 7264 if (escape == ESC_g) 7265 { 7266 PCRE2_SPTR p; 7267 uint32_t cf; 7268 7269 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? 7270 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; 7271 7272 /* These two statements stop the compiler for warning about possibly 7273 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In 7274 fact, because we do the check for a number below, the paths that 7275 would actually be in error are never taken. 
*/ 7276 7277 skipunits = 0; 7278 reset_bracount = FALSE; 7279 7280 /* If it's not a signed or unsigned number, treat it as a name. */ 7281 7282 cf = ptr[1]; 7283 if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf)) 7284 { 7285 is_recurse = TRUE; 7286 goto NAMED_REF_OR_RECURSE; 7287 } 7288 7289 /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus 7290 or a digit. */ 7291 7292 p = ptr + 2; 7293 while (IS_DIGIT(*p)) p++; 7294 if (*p != (PCRE2_UCHAR)terminator) 7295 { 7296 *errorcodeptr = ERR57; 7297 goto FAILED; 7298 } 7299 ptr++; 7300 goto HANDLE_NUMERICAL_RECURSION; 7301 } 7302 7303 /* \k<name> or \k'name' is a back reference by name (Perl syntax). 7304 We also support \k{name} (.NET syntax). */ 7305 7306 if (escape == ESC_k) 7307 { 7308 if ((ptr[1] != CHAR_LESS_THAN_SIGN && 7309 ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET)) 7310 { 7311 *errorcodeptr = ERR69; 7312 goto FAILED; 7313 } 7314 is_recurse = FALSE; 7315 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? 7316 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? 7317 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; 7318 goto NAMED_REF_OR_RECURSE; 7319 } 7320 7321 /* Back references are handled specially; must disable firstcu if 7322 not set to cope with cases like (?=(\w+))\1: which would otherwise set 7323 ':' later. */ 7324 7325 if (escape < 0) 7326 { 7327 open_capitem *oc; 7328 recno = -escape; 7329 7330 /* Come here from named backref handling when the reference is to a 7331 single group (i.e. not to a duplicated name). */ 7332 7333 HANDLE_REFERENCE: 7334 if (recno > (int)cb->final_bracount) 7335 { 7336 *errorcodeptr = ERR15; 7337 goto FAILED; 7338 } 7339 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; 7340 previous = code; 7341 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; 7342 PUT2INC(code, 0, recno); 7343 cb->backref_map |= (recno < 32)? 
(1u << recno) : 1; 7344 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; 7345 7346 /* Check to see if this back reference is recursive, that it, it 7347 is inside the group that it references. A flag is set so that the 7348 group can be made atomic. */ 7349 7350 for (oc = cb->open_caps; oc != NULL; oc = oc->next) 7351 { 7352 if (oc->number == recno) 7353 { 7354 oc->flag = TRUE; 7355 break; 7356 } 7357 } 7358 } 7359 7360 /* So are Unicode property matches, if supported. */ 7361 7362 #ifdef SUPPORT_UNICODE 7363 else if (escape == ESC_P || escape == ESC_p) 7364 { 7365 BOOL negated; 7366 unsigned int ptype = 0, pdata = 0; 7367 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb)) 7368 goto FAILED; 7369 previous = code; 7370 *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP; 7371 *code++ = ptype; 7372 *code++ = pdata; 7373 } 7374 #else 7375 7376 /* If Unicode properties are not supported, \X, \P, and \p are not 7377 allowed. */ 7378 7379 else if (escape == ESC_X || escape == ESC_P || escape == ESC_p) 7380 { 7381 *errorcodeptr = ERR45; 7382 goto FAILED; 7383 } 7384 #endif 7385 7386 /* The use of \C can be locked out. */ 7387 7388 #ifdef NEVER_BACKSLASH_C 7389 else if (escape == ESC_C) 7390 { 7391 *errorcodeptr = ERR85; 7392 goto FAILED; 7393 } 7394 #else 7395 else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0) 7396 { 7397 *errorcodeptr = ERR83; 7398 goto FAILED; 7399 } 7400 #endif 7401 7402 /* For the rest (including \X when Unicode properties are supported), we 7403 can obtain the OP value by negating the escape value in the default 7404 situation when PCRE2_UCP is not set. When it *is* set, we substitute 7405 Unicode property tests. Note that \b and \B do a one-character 7406 lookbehind, and \A also behaves as if it does. 
*/ 7407 7408 else 7409 { 7410 if (escape == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */ 7411 if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) && 7412 cb->max_lookbehind == 0) 7413 cb->max_lookbehind = 1; 7414 #ifdef SUPPORT_UNICODE 7415 if (escape >= ESC_DU && escape <= ESC_wu) 7416 { 7417 cb->nestptr[1] = cb->nestptr[0]; /* Back up if at 2nd level */ 7418 cb->nestptr[0] = ptr + 1; /* Where to resume */ 7419 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ 7420 } 7421 else 7422 #endif 7423 /* In non-UTF mode, and for both 32-bit modes, we turn \C into 7424 OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in 7425 lookbehinds. */ 7426 7427 { 7428 previous = (escape > ESC_b && escape < ESC_Z)? code : NULL; 7429 #if PCRE2_CODE_UNIT_WIDTH == 32 7430 *code++ = (escape == ESC_C)? OP_ALLANY : escape; 7431 #else 7432 *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape; 7433 #endif 7434 } 7435 } 7436 continue; 7437 } 7438 7439 /* We have a data character whose value is in c. In UTF-8 mode it may have 7440 a value > 127. We set its representation in the length/buffer, and then 7441 handle it as a data character. */ 7442 7443 mclength = PUTCHAR(c, mcbuffer); 7444 goto ONE_CHAR; 7445 7446 7447 /* ===================================================================*/ 7448 /* Handle a literal character. It is guaranteed not to be whitespace or # 7449 when the extended flag is set. If we are in a UTF mode, it may be a 7450 multi-unit literal character. */ 7451 7452 default: 7453 NORMAL_CHAR: 7454 mclength = 1; 7455 mcbuffer[0] = c; 7456 7457 #ifdef SUPPORT_UNICODE 7458 if (utf && HAS_EXTRALEN(c)) 7459 ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); 7460 #endif 7461 7462 /* At this point we have the character's bytes in mcbuffer, and the length 7463 in mclength. When not in UTF mode, the length is always 1. 
*/ 7464 7465 ONE_CHAR: 7466 previous = code; 7467 7468 /* For caseless UTF mode, check whether this character has more than one 7469 other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ 7470 7471 #ifdef SUPPORT_UNICODE 7472 if (utf && (options & PCRE2_CASELESS) != 0) 7473 { 7474 GETCHAR(c, mcbuffer); 7475 if ((c = UCD_CASESET(c)) != 0) 7476 { 7477 *code++ = OP_PROP; 7478 *code++ = PT_CLIST; 7479 *code++ = c; 7480 if (firstcuflags == REQ_UNSET) 7481 firstcuflags = zerofirstcuflags = REQ_NONE; 7482 break; 7483 } 7484 } 7485 #endif 7486 7487 /* Caseful matches, or not one of the multicase characters. */ 7488 7489 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR; 7490 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; 7491 7492 /* Remember if \r or \n were seen */ 7493 7494 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) 7495 cb->external_flags |= PCRE2_HASCRORLF; 7496 7497 /* Set the first and required bytes appropriately. If no previous first 7498 byte, set it from this character, but revert to none on a zero repeat. 7499 Otherwise, leave the firstcu value alone, and don't change it on a zero 7500 repeat. */ 7501 7502 if (firstcuflags == REQ_UNSET) 7503 { 7504 zerofirstcuflags = REQ_NONE; 7505 zeroreqcu = reqcu; 7506 zeroreqcuflags = reqcuflags; 7507 7508 /* If the character is more than one byte long, we can set firstcu 7509 only if it is not to be matched caselessly. */ 7510 7511 if (mclength == 1 || req_caseopt == 0) 7512 { 7513 firstcu = mcbuffer[0] | req_caseopt; 7514 firstcu = mcbuffer[0]; 7515 firstcuflags = req_caseopt; 7516 7517 if (mclength != 1) 7518 { 7519 reqcu = code[-1]; 7520 reqcuflags = cb->req_varyopt; 7521 } 7522 } 7523 else firstcuflags = reqcuflags = REQ_NONE; 7524 } 7525 7526 /* firstcu was previously set; we can set reqcu only if the length is 7527 1 or the matching is caseful. 
*/ 7528 7529 else 7530 { 7531 zerofirstcu = firstcu; 7532 zerofirstcuflags = firstcuflags; 7533 zeroreqcu = reqcu; 7534 zeroreqcuflags = reqcuflags; 7535 if (mclength == 1 || req_caseopt == 0) 7536 { 7537 reqcu = code[-1]; 7538 reqcuflags = req_caseopt | cb->req_varyopt; 7539 } 7540 } 7541 7542 break; /* End of literal character handling */ 7543 } 7544 } /* end of big loop */ 7545 7546 /* Control never reaches here by falling through, only by a goto for all the 7547 error states. Pass back the position in the pattern so that it can be displayed 7548 to the user for diagnosing the error. */ 7549 7550 FAILED: 7551 *ptrptr = ptr; 7552 return FALSE; 7553 } 7554 7555 7556 7557 /************************************************* 7558 * Compile regex: a sequence of alternatives * 7559 *************************************************/ 7560 7561 /* On entry, ptr is pointing past the bracket character, but on return it 7562 points to the closing bracket, or vertical bar, or end of string. The code 7563 variable is pointing at the byte into which the BRA operator has been stored. 7564 This function is used during the pre-compile phase when we are trying to find 7565 out the amount of memory needed, as well as during the real compile phase. The 7566 value of lengthptr distinguishes the two phases. 
7567 7568 Arguments: 7569 options option bits, including any changes for this subpattern 7570 codeptr -> the address of the current code pointer 7571 ptrptr -> the address of the current pattern pointer 7572 errorcodeptr -> pointer to error code variable 7573 lookbehind TRUE if this is a lookbehind assertion 7574 reset_bracount TRUE to reset the count for each branch 7575 skipunits skip this many code units at start (for brackets and OP_COND) 7576 cond_depth depth of nesting for conditional subpatterns 7577 firstcuptr place to put the first required code unit 7578 firstcuflagsptr place to put the first code unit flags, or a negative number 7579 reqcuptr place to put the last required code unit 7580 reqcuflagsptr place to put the last required code unit flags, or a negative number 7581 bcptr pointer to the chain of currently open branches 7582 cb points to the data block with tables pointers etc. 7583 lengthptr NULL during the real compile phase 7584 points to length accumulator during pre-compile phase 7585 7586 Returns: TRUE on success 7587 */ 7588 7589 static BOOL 7590 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr, 7591 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, uint32_t skipunits, 7592 int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr, 7593 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr, 7594 compile_block *cb, size_t *lengthptr) 7595 { 7596 PCRE2_SPTR ptr = *ptrptr; 7597 PCRE2_UCHAR *code = *codeptr; 7598 PCRE2_UCHAR *last_branch = code; 7599 PCRE2_UCHAR *start_bracket = code; 7600 PCRE2_UCHAR *reverse_count = NULL; 7601 open_capitem capitem; 7602 int capnumber = 0; 7603 uint32_t firstcu, reqcu; 7604 int32_t firstcuflags, reqcuflags; 7605 uint32_t branchfirstcu, branchreqcu; 7606 int32_t branchfirstcuflags, branchreqcuflags; 7607 size_t length; 7608 unsigned int orig_bracount; 7609 unsigned int max_bracount; 7610 branch_chain bc; 7611 7612 /* If set, call the external function that checks 
for stack availability. */ 7613 7614 if (cb->cx->stack_guard != NULL && 7615 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data)) 7616 { 7617 *errorcodeptr= ERR33; 7618 return FALSE; 7619 } 7620 7621 /* Miscellaneous initialization */ 7622 7623 bc.outer = bcptr; 7624 bc.current_branch = code; 7625 7626 firstcu = reqcu = 0; 7627 firstcuflags = reqcuflags = REQ_UNSET; 7628 7629 /* Accumulate the length for use in the pre-compile phase. Start with the 7630 length of the BRA and KET and any extra code units that are required at the 7631 beginning. We accumulate in a local variable to save frequent testing of 7632 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the 7633 start and end of each alternative, because compiled items are discarded during 7634 the pre-compile phase so that the work space is not exceeded. */ 7635 7636 length = 2 + 2*LINK_SIZE + skipunits; 7637 7638 /* WARNING: If the above line is changed for any reason, you must also change 7639 the code that abstracts option settings at the start of the pattern and makes 7640 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the 7641 pre-compile phase to find out whether or not anything has yet been compiled. 7642 7643 If this is a capturing subpattern, add to the chain of open capturing items 7644 so that we can detect them if (*ACCEPT) is encountered. This is also used to 7645 detect groups that contain recursive back references to themselves. Note that 7646 only OP_CBRA need be tested here; changing this opcode to one of its variants, 7647 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. 
*/ 7648 7649 if (*code == OP_CBRA) 7650 { 7651 capnumber = GET2(code, 1 + LINK_SIZE); 7652 capitem.number = capnumber; 7653 capitem.next = cb->open_caps; 7654 capitem.flag = FALSE; 7655 cb->open_caps = &capitem; 7656 } 7657 7658 /* Offset is set zero to mark that this bracket is still open */ 7659 7660 PUT(code, 1, 0); 7661 code += 1 + LINK_SIZE + skipunits; 7662 7663 /* Loop for each alternative branch */ 7664 7665 orig_bracount = max_bracount = cb->bracount; 7666 7667 for (;;) 7668 { 7669 /* For a (?| group, reset the capturing bracket count so that each branch 7670 uses the same numbers. */ 7671 7672 if (reset_bracount) cb->bracount = orig_bracount; 7673 7674 /* Set up dummy OP_REVERSE if lookbehind assertion */ 7675 7676 if (lookbehind) 7677 { 7678 *code++ = OP_REVERSE; 7679 reverse_count = code; 7680 PUTINC(code, 0, 0); 7681 length += 1 + LINK_SIZE; 7682 } 7683 7684 /* Now compile the branch; in the pre-compile phase its length gets added 7685 into the length. */ 7686 7687 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstcu, 7688 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc, 7689 cond_depth, cb, (lengthptr == NULL)? NULL : &length)) 7690 { 7691 *ptrptr = ptr; 7692 return FALSE; 7693 } 7694 7695 /* Keep the highest bracket count in case (?| was used and some branch 7696 has fewer than the rest. */ 7697 7698 if (cb->bracount > max_bracount) max_bracount = cb->bracount; 7699 7700 /* In the real compile phase, there is some post-processing to be done. */ 7701 7702 if (lengthptr == NULL) 7703 { 7704 /* If this is the first branch, the firstcu and reqcu values for the 7705 branch become the values for the regex. 
*/ 7706 7707 if (*last_branch != OP_ALT) 7708 { 7709 firstcu = branchfirstcu; 7710 firstcuflags = branchfirstcuflags; 7711 reqcu = branchreqcu; 7712 reqcuflags = branchreqcuflags; 7713 } 7714 7715 /* If this is not the first branch, the first char and reqcu have to 7716 match the values from all the previous branches, except that if the 7717 previous value for reqcu didn't have REQ_VARY set, it can still match, 7718 and we set REQ_VARY for the regex. */ 7719 7720 else 7721 { 7722 /* If we previously had a firstcu, but it doesn't match the new branch, 7723 we have to abandon the firstcu for the regex, but if there was 7724 previously no reqcu, it takes on the value of the old firstcu. */ 7725 7726 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu) 7727 { 7728 if (firstcuflags >= 0) 7729 { 7730 if (reqcuflags < 0) 7731 { 7732 reqcu = firstcu; 7733 reqcuflags = firstcuflags; 7734 } 7735 } 7736 firstcuflags = REQ_NONE; 7737 } 7738 7739 /* If we (now or from before) have no firstcu, a firstcu from the 7740 branch becomes a reqcu if there isn't a branch reqcu. */ 7741 7742 if (firstcuflags < 0 && branchfirstcuflags >= 0 && 7743 branchreqcuflags < 0) 7744 { 7745 branchreqcu = branchfirstcu; 7746 branchreqcuflags = branchfirstcuflags; 7747 } 7748 7749 /* Now ensure that the reqcus match */ 7750 7751 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) || 7752 reqcu != branchreqcu) 7753 reqcuflags = REQ_NONE; 7754 else 7755 { 7756 reqcu = branchreqcu; 7757 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */ 7758 } 7759 } 7760 7761 /* If lookbehind, check that this branch matches a fixed-length string, and 7762 put the length into the OP_REVERSE item. Temporarily mark the end of the 7763 branch with OP_END. If the branch contains OP_RECURSE, the result is 7764 FFL_LATER (a negative value) because there may be forward references that 7765 we can't check here. Set a flag to cause another lookbehind check at the 7766 end. 
Why not do it all at the end? Because common errors can be picked up 7767 here and the offset of the problem can be shown. */ 7768 7769 if (lookbehind) 7770 { 7771 int fixed_length; 7772 int count = 0; 7773 *code = OP_END; 7774 fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0, 7775 FALSE, cb, NULL, &count); 7776 if (fixed_length == FFL_LATER) 7777 { 7778 cb->check_lookbehind = TRUE; 7779 } 7780 else if (fixed_length < 0) 7781 { 7782 *errorcodeptr = fixed_length_errors[-fixed_length]; 7783 *ptrptr = ptr; 7784 return FALSE; 7785 } 7786 else 7787 { 7788 if (fixed_length > cb->max_lookbehind) 7789 cb->max_lookbehind = fixed_length; 7790 PUT(reverse_count, 0, fixed_length); 7791 } 7792 } 7793 } 7794 7795 /* Reached end of expression, either ')' or end of pattern. In the real 7796 compile phase, go back through the alternative branches and reverse the chain 7797 of offsets, with the field in the BRA item now becoming an offset to the 7798 first alternative. If there are no alternatives, it points to the end of the 7799 group. The length in the terminating ket is always the length of the whole 7800 bracketed item. Return leaving the pointer at the terminating char. */ 7801 7802 if (*ptr != CHAR_VERTICAL_LINE) 7803 { 7804 if (lengthptr == NULL) 7805 { 7806 size_t branch_length = code - last_branch; 7807 do 7808 { 7809 size_t prev_length = GET(last_branch, 1); 7810 PUT(last_branch, 1, branch_length); 7811 branch_length = prev_length; 7812 last_branch -= branch_length; 7813 } 7814 while (branch_length > 0); 7815 } 7816 7817 /* Fill in the ket */ 7818 7819 *code = OP_KET; 7820 PUT(code, 1, (int)(code - start_bracket)); 7821 code += 1 + LINK_SIZE; 7822 7823 /* If it was a capturing subpattern, check to see if it contained any 7824 recursive back references. If so, we must wrap it in atomic brackets. In 7825 any event, remove the block from the chain. 
*/ 7826 7827 if (capnumber > 0) 7828 { 7829 if (cb->open_caps->flag) 7830 { 7831 memmove(start_bracket + 1 + LINK_SIZE, start_bracket, 7832 CU2BYTES(code - start_bracket)); 7833 *start_bracket = OP_ONCE; 7834 code += 1 + LINK_SIZE; 7835 PUT(start_bracket, 1, (int)(code - start_bracket)); 7836 *code = OP_KET; 7837 PUT(code, 1, (int)(code - start_bracket)); 7838 code += 1 + LINK_SIZE; 7839 length += 2 + 2*LINK_SIZE; 7840 } 7841 cb->open_caps = cb->open_caps->next; 7842 } 7843 7844 /* Retain the highest bracket number, in case resetting was used. */ 7845 7846 cb->bracount = max_bracount; 7847 7848 /* Set values to pass back */ 7849 7850 *codeptr = code; 7851 *ptrptr = ptr; 7852 *firstcuptr = firstcu; 7853 *firstcuflagsptr = firstcuflags; 7854 *reqcuptr = reqcu; 7855 *reqcuflagsptr = reqcuflags; 7856 if (lengthptr != NULL) 7857 { 7858 if (OFLOW_MAX - *lengthptr < length) 7859 { 7860 *errorcodeptr = ERR20; 7861 return FALSE; 7862 } 7863 *lengthptr += length; 7864 } 7865 return TRUE; 7866 } 7867 7868 /* Another branch follows. In the pre-compile phase, we can move the code 7869 pointer back to where it was for the start of the first branch. (That is, 7870 pretend that each branch is the only one.) 7871 7872 In the real compile phase, insert an ALT node. Its length field points back 7873 to the previous branch while the bracket remains open. At the end the chain 7874 is reversed. It's done like this so that the start of the bracket has a 7875 zero offset until it is closed, making it possible to detect recursion. 
*/ 7876 7877 if (lengthptr != NULL) 7878 { 7879 code = *codeptr + 1 + LINK_SIZE + skipunits; 7880 length += 1 + LINK_SIZE; 7881 } 7882 else 7883 { 7884 *code = OP_ALT; 7885 PUT(code, 1, (int)(code - last_branch)); 7886 bc.current_branch = last_branch = code; 7887 code += 1 + LINK_SIZE; 7888 } 7889 7890 /* Advance past the vertical bar */ 7891 7892 ptr++; 7893 } 7894 /* Control never reaches here */ 7895 } 7896 7897 7898 7899 /************************************************* 7900 * Check for anchored pattern * 7901 *************************************************/ 7902 7903 /* Try to find out if this is an anchored regular expression. Consider each 7904 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket 7905 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then 7906 it's anchored. However, if this is a multiline pattern, then only OP_SOD will 7907 be found, because ^ generates OP_CIRCM in that mode. 7908 7909 We can also consider a regex to be anchored if OP_SOM starts all its branches. 7910 This is the code for \G, which means "match at start of match position, taking 7911 into account the match offset". 7912 7913 A branch is also implicitly anchored if it starts with .* and DOTALL is set, 7914 because that will try the rest of the pattern at all possible matching points, 7915 so there is no point trying again.... er .... 7916 7917 .... except when the .* appears inside capturing parentheses, and there is a 7918 subsequent back reference to those parentheses. We haven't enough information 7919 to catch that case precisely. 7920 7921 At first, the best we could do was to detect when .* was in capturing brackets 7922 and the highest back reference was greater than or equal to that level. 7923 However, by keeping a bitmap of the first 31 back references, we can catch some 7924 of the more common cases more precisely. 7925 7926 ... 
A second exception is when the .* appears inside an atomic group, because 7927 this prevents the number of characters it matches from being adjusted. 7928 7929 Arguments: 7930 code points to start of the compiled pattern 7931 bracket_map a bitmap of which brackets we are inside while testing; this 7932 handles up to substring 31; after that we just have to take 7933 the less precise approach 7934 cb points to the compile data block 7935 atomcount atomic group level 7936 7937 Returns: TRUE or FALSE 7938 */ 7939 7940 static BOOL 7941 is_anchored(register PCRE2_SPTR code, unsigned int bracket_map, 7942 compile_block *cb, int atomcount) 7943 { 7944 do { 7945 PCRE2_SPTR scode = first_significant_code( 7946 code + PRIV(OP_lengths)[*code], FALSE); 7947 register int op = *scode; 7948 7949 /* Non-capturing brackets */ 7950 7951 if (op == OP_BRA || op == OP_BRAPOS || 7952 op == OP_SBRA || op == OP_SBRAPOS) 7953 { 7954 if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; 7955 } 7956 7957 /* Capturing brackets */ 7958 7959 else if (op == OP_CBRA || op == OP_CBRAPOS || 7960 op == OP_SCBRA || op == OP_SCBRAPOS) 7961 { 7962 int n = GET2(scode, 1+LINK_SIZE); 7963 int new_map = bracket_map | ((n < 32)? (1u << n) : 1); 7964 if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE; 7965 } 7966 7967 /* Positive forward assertions and conditions */ 7968 7969 else if (op == OP_ASSERT || op == OP_COND) 7970 { 7971 if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; 7972 } 7973 7974 /* Atomic groups */ 7975 7976 else if (op == OP_ONCE || op == OP_ONCE_NC) 7977 { 7978 if (!is_anchored(scode, bracket_map, cb, atomcount + 1)) 7979 return FALSE; 7980 } 7981 7982 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and 7983 it isn't in brackets that are or may be referenced or inside an atomic 7984 group. There is also an option that disables auto-anchoring. 
*/ 7985 7986 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || 7987 op == OP_TYPEPOSSTAR)) 7988 { 7989 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || 7990 atomcount > 0 || cb->had_pruneorskip || 7991 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) 7992 return FALSE; 7993 } 7994 7995 /* Check for explicit anchoring */ 7996 7997 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; 7998 7999 code += GET(code, 1); 8000 } 8001 while (*code == OP_ALT); /* Loop for each alternative */ 8002 return TRUE; 8003 } 8004 8005 8006 8007 /************************************************* 8008 * Check for starting with ^ or .* * 8009 *************************************************/ 8010 8011 /* This is called to find out if every branch starts with ^ or .* so that 8012 "first char" processing can be done to speed things up in multiline 8013 matching and for non-DOTALL patterns that start with .* (which must start at 8014 the beginning or after \n). As in the case of is_anchored() (see above), we 8015 have to take account of back references to capturing brackets that contain .* 8016 because in that case we can't make the assumption. Also, the appearance of .* 8017 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not 8018 count, because once again the assumption no longer holds. 
8019 8020 Arguments: 8021 code points to start of the compiled pattern or a group 8022 bracket_map a bitmap of which brackets we are inside while testing; this 8023 handles up to substring 31; after that we just have to take 8024 the less precise approach 8025 cb points to the compile data 8026 atomcount atomic group level 8027 8028 Returns: TRUE or FALSE 8029 */ 8030 8031 static BOOL 8032 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, 8033 int atomcount) 8034 { 8035 do { 8036 PCRE2_SPTR scode = first_significant_code( 8037 code + PRIV(OP_lengths)[*code], FALSE); 8038 register int op = *scode; 8039 8040 /* If we are at the start of a conditional assertion group, *both* the 8041 conditional assertion *and* what follows the condition must satisfy the test 8042 for start of line. Other kinds of condition fail. Note that there may be an 8043 auto-callout at the start of a condition. */ 8044 8045 if (op == OP_COND) 8046 { 8047 scode += 1 + LINK_SIZE; 8048 8049 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; 8050 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); 8051 8052 switch (*scode) 8053 { 8054 case OP_CREF: 8055 case OP_DNCREF: 8056 case OP_RREF: 8057 case OP_DNRREF: 8058 case OP_FAIL: 8059 case OP_FALSE: 8060 case OP_TRUE: 8061 return FALSE; 8062 8063 default: /* Assertion */ 8064 if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; 8065 do scode += GET(scode, 1); while (*scode == OP_ALT); 8066 scode += 1 + LINK_SIZE; 8067 break; 8068 } 8069 scode = first_significant_code(scode, FALSE); 8070 op = *scode; 8071 } 8072 8073 /* Non-capturing brackets */ 8074 8075 if (op == OP_BRA || op == OP_BRAPOS || 8076 op == OP_SBRA || op == OP_SBRAPOS) 8077 { 8078 if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; 8079 } 8080 8081 /* Capturing brackets */ 8082 8083 else if (op == OP_CBRA || op == OP_CBRAPOS || 8084 op == OP_SCBRA || op == OP_SCBRAPOS) 8085 { 8086 int n = 
GET2(scode, 1+LINK_SIZE); 8087 int new_map = bracket_map | ((n < 32)? (1u << n) : 1); 8088 if (!is_startline(scode, new_map, cb, atomcount)) return FALSE; 8089 } 8090 8091 /* Positive forward assertions */ 8092 8093 else if (op == OP_ASSERT) 8094 { 8095 if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; 8096 } 8097 8098 /* Atomic brackets */ 8099 8100 else if (op == OP_ONCE || op == OP_ONCE_NC) 8101 { 8102 if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE; 8103 } 8104 8105 /* .* means "start at start or after \n" if it isn't in atomic brackets or 8106 brackets that may be referenced, as long as the pattern does not contain 8107 *PRUNE or *SKIP, because these break the feature. Consider, for example, 8108 /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the 8109 start of a line. There is also an option that disables this optimization. */ 8110 8111 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) 8112 { 8113 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || 8114 atomcount > 0 || cb->had_pruneorskip || 8115 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) 8116 return FALSE; 8117 } 8118 8119 /* Check for explicit circumflex; anything else gives a FALSE result. Note 8120 in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC 8121 because the number of characters matched by .* cannot be adjusted inside 8122 them. 
*/ 8123 8124 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; 8125 8126 /* Move on to the next alternative */ 8127 8128 code += GET(code, 1); 8129 } 8130 while (*code == OP_ALT); /* Loop for each alternative */ 8131 return TRUE; 8132 } 8133 8134 8135 8136 /************************************************* 8137 * Check for asserted fixed first code unit * 8138 *************************************************/ 8139 8140 /* During compilation, the "first code unit" settings from forward assertions 8141 are discarded, because they can cause conflicts with actual literals that 8142 follow. However, if we end up without a first code unit setting for an 8143 unanchored pattern, it is worth scanning the regex to see if there is an 8144 initial asserted first code unit. If all branches start with the same asserted 8145 code unit, or with a non-conditional bracket all of whose alternatives start 8146 with the same asserted code unit (recurse ad lib), then we return that code 8147 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with 8148 REQ_NONE in the flags. 8149 8150 Arguments: 8151 code points to start of compiled pattern 8152 flags points to the first code unit flags 8153 inassert TRUE if in an assertion 8154 8155 Returns: the fixed first code unit, or 0 with REQ_NONE in flags 8156 */ 8157 8158 static uint32_t 8159 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, BOOL inassert) 8160 { 8161 register uint32_t c = 0; 8162 int cflags = REQ_NONE; 8163 8164 *flags = REQ_NONE; 8165 do { 8166 uint32_t d; 8167 int dflags; 8168 int xl = (*code == OP_CBRA || *code == OP_SCBRA || 8169 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 
IMM2_SIZE:0; 8170 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE); 8171 register PCRE2_UCHAR op = *scode; 8172 8173 switch(op) 8174 { 8175 default: 8176 return 0; 8177 8178 case OP_BRA: 8179 case OP_BRAPOS: 8180 case OP_CBRA: 8181 case OP_SCBRA: 8182 case OP_CBRAPOS: 8183 case OP_SCBRAPOS: 8184 case OP_ASSERT: 8185 case OP_ONCE: 8186 case OP_ONCE_NC: 8187 d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT); 8188 if (dflags < 0) 8189 return 0; 8190 if (cflags < 0) { c = d; cflags = dflags; } 8191 else if (c != d || cflags != dflags) return 0; 8192 break; 8193 8194 case OP_EXACT: 8195 scode += IMM2_SIZE; 8196 /* Fall through */ 8197 8198 case OP_CHAR: 8199 case OP_PLUS: 8200 case OP_MINPLUS: 8201 case OP_POSPLUS: 8202 if (!inassert) return 0; 8203 if (cflags < 0) { c = scode[1]; cflags = 0; } 8204 else if (c != scode[1]) return 0; 8205 break; 8206 8207 case OP_EXACTI: 8208 scode += IMM2_SIZE; 8209 /* Fall through */ 8210 8211 case OP_CHARI: 8212 case OP_PLUSI: 8213 case OP_MINPLUSI: 8214 case OP_POSPLUSI: 8215 if (!inassert) return 0; 8216 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; } 8217 else if (c != scode[1]) return 0; 8218 break; 8219 } 8220 8221 code += GET(code, 1); 8222 } 8223 while (*code == OP_ALT); 8224 8225 *flags = cflags; 8226 return c; 8227 } 8228 8229 8230 8231 /************************************************* 8232 * Add an entry to the name/number table * 8233 *************************************************/ 8234 8235 /* This function is called between compiling passes to add an entry to the 8236 name/number table, maintaining alphabetical order. Checking for permitted 8237 and forbidden duplicates has already been done. 
8238 8239 Arguments: 8240 cb the compile data block 8241 name the name to add 8242 length the length of the name 8243 groupno the group number 8244 8245 Returns: nothing 8246 */ 8247 8248 static void 8249 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length, 8250 unsigned int groupno) 8251 { 8252 int i; 8253 PCRE2_UCHAR *slot = cb->name_table; 8254 8255 for (i = 0; i < cb->names_found; i++) 8256 { 8257 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length)); 8258 if (crc == 0 && slot[IMM2_SIZE+length] != 0) 8259 crc = -1; /* Current name is a substring */ 8260 8261 /* Make space in the table and break the loop for an earlier name. For a 8262 duplicate or later name, carry on. We do this for duplicates so that in the 8263 simple case (when ?(| is not used) they are in order of their numbers. In all 8264 cases they are in the order in which they appear in the pattern. */ 8265 8266 if (crc < 0) 8267 { 8268 memmove(slot + cb->name_entry_size, slot, 8269 CU2BYTES((cb->names_found - i) * cb->name_entry_size)); 8270 break; 8271 } 8272 8273 /* Continue the loop for a later or duplicate name */ 8274 8275 slot += cb->name_entry_size; 8276 } 8277 8278 PUT2(slot, 0, groupno); 8279 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length)); 8280 cb->names_found++; 8281 8282 /* Add a terminating zero and fill the rest of the slot with zeroes so that 8283 the memory is all initialized. Otherwise valgrind moans about uninitialized 8284 memory when saving serialized compiled patterns. */ 8285 8286 memset(slot + IMM2_SIZE + length, 0, 8287 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE)); 8288 } 8289 8290 8291 8292 /************************************************* 8293 * External function to compile a pattern * 8294 *************************************************/ 8295 8296 /* This function reads a regular expression in the form of a string and returns 8297 a pointer to a block of store holding a compiled version of the expression. 
8298 8299 Arguments: 8300 pattern the regular expression 8301 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED 8302 options option bits 8303 errorptr pointer to errorcode 8304 erroroffset pointer to error offset 8305 ccontext points to a compile context or is NULL 8306 8307 Returns: pointer to compiled data block, or NULL on error, 8308 with errorcode and erroroffset set 8309 */ 8310 8311 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION 8312 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, 8313 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) 8314 { 8315 BOOL utf; /* Set TRUE for UTF mode */ 8316 pcre2_real_code *re = NULL; /* What we will return */ 8317 compile_block cb; /* "Static" compile-time data */ 8318 const uint8_t *tables; /* Char tables base pointer */ 8319 8320 PCRE2_UCHAR *code; /* Current pointer in compiled code */ 8321 PCRE2_SPTR codestart; /* Start of compiled code */ 8322 PCRE2_SPTR ptr; /* Current pointer in pattern */ 8323 8324 size_t length = 1; /* Allow or final END opcode */ 8325 size_t usedlength; /* Actual length used */ 8326 size_t re_blocksize; /* Size of memory block */ 8327 8328 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */ 8329 uint32_t firstcu, reqcu; /* Value of first/req code unit */ 8330 uint32_t setflags = 0; /* NL and BSR set flags */ 8331 8332 uint32_t skipatstart; /* When checking (*UTF) etc */ 8333 uint32_t limit_match = UINT32_MAX; /* Unset match limits */ 8334 uint32_t limit_recursion = UINT32_MAX; 8335 8336 int newline = 0; /* Unset; can be set by the pattern */ 8337 int bsr = 0; /* Unset; can be set by the pattern */ 8338 int errorcode = 0; /* Initialize to avoid compiler warn */ 8339 8340 /* Comments at the head of this file explain about these variables. 
*/ 8341 8342 PCRE2_UCHAR *copied_pattern = NULL; 8343 PCRE2_UCHAR stack_copied_pattern[COPIED_PATTERN_SIZE]; 8344 named_group named_groups[NAMED_GROUP_LIST_SIZE]; 8345 8346 /* The workspace is used in different ways in the different compiling phases. 8347 It needs to be 16-bit aligned for the preliminary group scan, and 32-bit 8348 aligned for the group information cache. */ 8349 8350 uint32_t c32workspace[C32_WORK_SIZE]; 8351 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c32workspace; 8352 8353 8354 /* -------------- Check arguments and set up the pattern ----------------- */ 8355 8356 /* There must be error code and offset pointers. */ 8357 8358 if (errorptr == NULL || erroroffset == NULL) return NULL; 8359 *errorptr = ERR0; 8360 *erroroffset = 0; 8361 8362 /* There must be a pattern! */ 8363 8364 if (pattern == NULL) 8365 { 8366 *errorptr = ERR16; 8367 return NULL; 8368 } 8369 8370 /* Check that all undefined public option bits are zero. */ 8371 8372 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) 8373 { 8374 *errorptr = ERR17; 8375 return NULL; 8376 } 8377 8378 /* A NULL compile context means "use a default context" */ 8379 8380 if (ccontext == NULL) 8381 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); 8382 8383 /* A zero-terminated pattern is indicated by the special length value 8384 PCRE2_ZERO_TERMINATED. Otherwise, we make a copy of the pattern and add a zero, 8385 to ensure that it is always possible to look one code unit beyond the end of 8386 the pattern's characters. In both cases, check that the pattern is overlong. 
*/ 8387 8388 if (patlen == PCRE2_ZERO_TERMINATED) 8389 { 8390 patlen = PRIV(strlen)(pattern); 8391 if (patlen > ccontext->max_pattern_length) 8392 { 8393 *errorptr = ERR88; 8394 return NULL; 8395 } 8396 } 8397 else 8398 { 8399 if (patlen > ccontext->max_pattern_length) 8400 { 8401 *errorptr = ERR88; 8402 return NULL; 8403 } 8404 if (patlen < COPIED_PATTERN_SIZE) 8405 copied_pattern = stack_copied_pattern; 8406 else 8407 { 8408 copied_pattern = ccontext->memctl.malloc(CU2BYTES(patlen + 1), 8409 ccontext->memctl.memory_data); 8410 if (copied_pattern == NULL) 8411 { 8412 *errorptr = ERR21; 8413 return NULL; 8414 } 8415 } 8416 memcpy(copied_pattern, pattern, CU2BYTES(patlen)); 8417 copied_pattern[patlen] = 0; 8418 pattern = copied_pattern; 8419 } 8420 8421 /* ------------ Initialize the "static" compile data -------------- */ 8422 8423 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables); 8424 8425 cb.lcc = tables + lcc_offset; /* Individual */ 8426 cb.fcc = tables + fcc_offset; /* character */ 8427 cb.cbits = tables + cbits_offset; /* tables */ 8428 cb.ctypes = tables + ctypes_offset; 8429 8430 cb.assert_depth = 0; 8431 cb.bracount = cb.final_bracount = 0; 8432 cb.cx = ccontext; 8433 cb.dupnames = FALSE; 8434 cb.end_pattern = pattern + patlen; 8435 cb.nestptr[0] = cb.nestptr[1] = NULL; 8436 cb.external_flags = 0; 8437 cb.external_options = options; 8438 cb.groupinfo = c32workspace; 8439 cb.had_recurse = FALSE; 8440 cb.iscondassert = FALSE; 8441 cb.max_lookbehind = 0; 8442 cb.name_entry_size = 0; 8443 cb.name_table = NULL; 8444 cb.named_groups = named_groups; 8445 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; 8446 cb.names_found = 0; 8447 cb.open_caps = NULL; 8448 cb.parens_depth = 0; 8449 cb.req_varyopt = 0; 8450 cb.start_code = cworkspace; 8451 cb.start_pattern = pattern; 8452 cb.start_workspace = cworkspace; 8453 cb.workspace_size = COMPILE_WORK_SIZE; 8454 8455 /* Maximum back reference and backref bitmap. 
The bitmap records up to 31 back 8456 references to help in deciding whether (.*) can be treated as anchored or not. 8457 */ 8458 8459 cb.top_backref = 0; 8460 cb.backref_map = 0; 8461 8462 /* --------------- Start looking at the pattern --------------- */ 8463 8464 /* Check for global one-time option settings at the start of the pattern, and 8465 remember the offset to the actual regex. */ 8466 8467 ptr = pattern; 8468 skipatstart = 0; 8469 8470 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && 8471 ptr[skipatstart+1] == CHAR_ASTERISK) 8472 { 8473 unsigned int i; 8474 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) 8475 { 8476 pso *p = pso_list + i; 8477 8478 if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0) 8479 { 8480 uint32_t c, pp; 8481 8482 skipatstart += p->length + 2; 8483 switch(p->type) 8484 { 8485 case PSO_OPT: 8486 cb.external_options |= p->value; 8487 break; 8488 8489 case PSO_FLG: 8490 setflags |= p->value; 8491 break; 8492 8493 case PSO_NL: 8494 newline = p->value; 8495 setflags |= PCRE2_NL_SET; 8496 break; 8497 8498 case PSO_BSR: 8499 bsr = p->value; 8500 setflags |= PCRE2_BSR_SET; 8501 break; 8502 8503 case PSO_LIMM: 8504 case PSO_LIMR: 8505 c = 0; 8506 pp = skipatstart; 8507 if (!IS_DIGIT(ptr[pp])) 8508 { 8509 errorcode = ERR60; 8510 ptr += pp; 8511 goto HAD_ERROR; 8512 } 8513 while (IS_DIGIT(ptr[pp])) 8514 { 8515 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ 8516 c = c*10 + (ptr[pp++] - CHAR_0); 8517 } 8518 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) 8519 { 8520 errorcode = ERR60; 8521 ptr += pp; 8522 goto HAD_ERROR; 8523 } 8524 if (p->type == PSO_LIMM) limit_match = c; 8525 else limit_recursion = c; 8526 skipatstart += pp - skipatstart; 8527 break; 8528 } 8529 break; /* Out of the table scan loop */ 8530 } 8531 } 8532 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ 8533 } 8534 8535 /* End of pattern-start options; advance to start of real regex. 
*/ 8536 8537 ptr += skipatstart; 8538 8539 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */ 8540 8541 #ifndef SUPPORT_UNICODE 8542 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0) 8543 { 8544 errorcode = ERR32; 8545 goto HAD_ERROR; 8546 } 8547 #endif 8548 8549 /* Check UTF. We have the original options in 'options', with that value as 8550 modified by (*UTF) etc in cb->external_options. */ 8551 8552 utf = (cb.external_options & PCRE2_UTF) != 0; 8553 if (utf) 8554 { 8555 if ((options & PCRE2_NEVER_UTF) != 0) 8556 { 8557 errorcode = ERR74; 8558 goto HAD_ERROR; 8559 } 8560 if ((options & PCRE2_NO_UTF_CHECK) == 0 && 8561 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) 8562 goto HAD_UTF_ERROR; 8563 } 8564 8565 /* Check UCP lockout. */ 8566 8567 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == 8568 (PCRE2_UCP|PCRE2_NEVER_UCP)) 8569 { 8570 errorcode = ERR75; 8571 goto HAD_ERROR; 8572 } 8573 8574 /* Process the BSR setting. */ 8575 8576 if (bsr == 0) bsr = ccontext->bsr_convention; 8577 8578 /* Process the newline setting. */ 8579 8580 if (newline == 0) newline = ccontext->newline_convention; 8581 cb.nltype = NLTYPE_FIXED; 8582 switch(newline) 8583 { 8584 case PCRE2_NEWLINE_CR: 8585 cb.nllen = 1; 8586 cb.nl[0] = CHAR_CR; 8587 break; 8588 8589 case PCRE2_NEWLINE_LF: 8590 cb.nllen = 1; 8591 cb.nl[0] = CHAR_NL; 8592 break; 8593 8594 case PCRE2_NEWLINE_CRLF: 8595 cb.nllen = 2; 8596 cb.nl[0] = CHAR_CR; 8597 cb.nl[1] = CHAR_NL; 8598 break; 8599 8600 case PCRE2_NEWLINE_ANY: 8601 cb.nltype = NLTYPE_ANY; 8602 break; 8603 8604 case PCRE2_NEWLINE_ANYCRLF: 8605 cb.nltype = NLTYPE_ANYCRLF; 8606 break; 8607 8608 default: 8609 errorcode = ERR56; 8610 goto HAD_ERROR; 8611 } 8612 8613 /* Before we do anything else, do a pre-scan of the pattern in order to 8614 discover the named groups and their numerical equivalents, so that this 8615 information is always available for the remaining processing. 
*/ 8616 8617 errorcode = scan_for_captures(&ptr, cb.external_options, &cb); 8618 if (errorcode != 0) goto HAD_ERROR; 8619 8620 /* For obscure debugging this code can be enabled. */ 8621 8622 #if 0 8623 { 8624 int i; 8625 named_group *ng = cb.named_groups; 8626 fprintf(stderr, "+++Captures: %d\n", cb.final_bracount); 8627 for (i = 0; i < cb.names_found; i++, ng++) 8628 { 8629 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name); 8630 } 8631 } 8632 #endif 8633 8634 /* Reset current bracket count to zero and current pointer to the start of the 8635 pattern. */ 8636 8637 cb.bracount = 0; 8638 ptr = pattern + skipatstart; 8639 8640 /* Pretend to compile the pattern while actually just accumulating the amount 8641 of memory required in the 'length' variable. This behaviour is triggered by 8642 passing a non-NULL final argument to compile_regex(). We pass a block of 8643 workspace (cworkspace) for it to compile parts of the pattern into; the 8644 compiled code is discarded when it is no longer needed, so hopefully this 8645 workspace will never overflow, though there is a test for its doing so. 8646 8647 On error, errorcode will be set non-zero, so we don't need to look at the 8648 result of the function. The initial options have been put into the cb block so 8649 that they can be changed if an option setting is found within the regex right 8650 at the beginning. Bringing initial option settings outside can help speed up 8651 starting point checks. We still have to pass a separate options variable (the 8652 first argument) because that may change as the pattern is processed. 
*/ 8653 8654 code = cworkspace; 8655 *code = OP_BRA; 8656 8657 (void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE, 8658 FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, 8659 &cb, &length); 8660 8661 if (errorcode != 0) goto HAD_ERROR; 8662 if (length > MAX_PATTERN_SIZE) 8663 { 8664 errorcode = ERR20; 8665 goto HAD_ERROR; 8666 } 8667 8668 /* Compute the size of, and then get and initialize, the data block for storing 8669 the compiled pattern and names table. Integer overflow should no longer be 8670 possible because nowadays we limit the maximum value of cb.names_found and 8671 cb.name_entry_size. */ 8672 8673 re_blocksize = sizeof(pcre2_real_code) + 8674 CU2BYTES(length + cb.names_found * cb.name_entry_size); 8675 re = (pcre2_real_code *) 8676 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data); 8677 if (re == NULL) 8678 { 8679 errorcode = ERR21; 8680 goto HAD_ERROR; 8681 } 8682 8683 re->memctl = ccontext->memctl; 8684 re->tables = tables; 8685 re->executable_jit = NULL; 8686 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t)); 8687 re->blocksize = re_blocksize; 8688 re->magic_number = MAGIC_NUMBER; 8689 re->compile_options = options; 8690 re->overall_options = cb.external_options; 8691 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags; 8692 re->limit_match = limit_match; 8693 re->limit_recursion = limit_recursion; 8694 re->first_codeunit = 0; 8695 re->last_codeunit = 0; 8696 re->bsr_convention = bsr; 8697 re->newline_convention = newline; 8698 re->max_lookbehind = 0; 8699 re->minlength = 0; 8700 re->top_bracket = 0; 8701 re->top_backref = 0; 8702 re->name_entry_size = cb.name_entry_size; 8703 re->name_count = cb.names_found; 8704 8705 /* The basic block is immediately followed by the name table, and the compiled 8706 code follows after that. 
*/ 8707 8708 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + 8709 re->name_entry_size * re->name_count; 8710 8711 /* Workspace is needed to remember information about numbered groups: whether a 8712 group can match an empty string and what its fixed length is. This is done to 8713 avoid the possibility of recursive references causing very long compile times 8714 when checking these features. Unnumbered groups do not have this exposure since 8715 they cannot be referenced. We use an indexed vector for this purpose. If there 8716 are sufficiently few groups, it can be the c32workspace vector, as set up 8717 above. Otherwise we have to get/free a special vector. The vector must be 8718 initialized to zero. */ 8719 8720 if (cb.final_bracount >= C32_WORK_SIZE) 8721 { 8722 cb.groupinfo = ccontext->memctl.malloc( 8723 (cb.final_bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data); 8724 if (cb.groupinfo == NULL) 8725 { 8726 errorcode = ERR21; 8727 goto HAD_ERROR; 8728 } 8729 } 8730 memset(cb.groupinfo, 0, (cb.final_bracount + 1) * sizeof(uint32_t)); 8731 8732 /* Update the compile data block for the actual compile. The starting points of 8733 the name/number translation table and of the code are passed around in the 8734 compile data block. The start/end pattern and initial options are already set 8735 from the pre-compile phase, as is the name_entry_size field. Reset the bracket 8736 count and the names_found field. */ 8737 8738 cb.parens_depth = 0; 8739 cb.assert_depth = 0; 8740 cb.bracount = 0; 8741 cb.max_lookbehind = 0; 8742 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); 8743 cb.start_code = codestart; 8744 cb.iscondassert = FALSE; 8745 cb.req_varyopt = 0; 8746 cb.had_accept = FALSE; 8747 cb.had_pruneorskip = FALSE; 8748 cb.check_lookbehind = FALSE; 8749 cb.open_caps = NULL; 8750 8751 /* If any named groups were found, create the name/number table from the list 8752 created in the pre-pass. 
*/ 8753 8754 if (cb.names_found > 0) 8755 { 8756 int i = cb.names_found; 8757 named_group *ng = cb.named_groups; 8758 cb.names_found = 0; 8759 for (; i > 0; i--, ng++) 8760 add_name_to_table(&cb, ng->name, ng->length, ng->number); 8761 } 8762 8763 /* Set up a starting, non-extracting bracket, then compile the expression. On 8764 error, errorcode will be set non-zero, so we don't need to look at the result 8765 of the function here. */ 8766 8767 ptr = pattern + skipatstart; 8768 code = (PCRE2_UCHAR *)codestart; 8769 *code = OP_BRA; 8770 (void)compile_regex(re->overall_options, &code, &ptr, &errorcode, FALSE, FALSE, 8771 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL); 8772 8773 re->top_bracket = cb.bracount; 8774 re->top_backref = cb.top_backref; 8775 re->max_lookbehind = cb.max_lookbehind; 8776 8777 if (cb.had_accept) 8778 { 8779 reqcu = 0; /* Must disable after (*ACCEPT) */ 8780 reqcuflags = REQ_NONE; 8781 } 8782 8783 /* Fill in the final opcode and check for disastrous overflow. If no overflow, 8784 but the estimated length exceeds the really used length, adjust the value of 8785 re->blocksize, and if valgrind support is configured, mark the extra allocated 8786 memory as unaddressable, so that any out-of-bound reads can be detected. */ 8787 8788 *code++ = OP_END; 8789 usedlength = code - codestart; 8790 if (usedlength > length) errorcode = ERR23; else 8791 { 8792 re->blocksize -= CU2BYTES(length - usedlength); 8793 #ifdef SUPPORT_VALGRIND 8794 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength)); 8795 #endif 8796 } 8797 8798 /* Scan the pattern for recursion/subroutine calls and convert the group 8799 numbers into offsets. Maintain a small cache so that repeated groups containing 8800 recursions are efficiently handled. 
*/ 8801 8802 #define RSCAN_CACHE_SIZE 8 8803 8804 if (errorcode == 0 && cb.had_recurse) 8805 { 8806 PCRE2_UCHAR *rcode; 8807 PCRE2_SPTR rgroup; 8808 int ccount = 0; 8809 int start = RSCAN_CACHE_SIZE; 8810 recurse_cache rc[RSCAN_CACHE_SIZE]; 8811 8812 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf); 8813 rcode != NULL; 8814 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf)) 8815 { 8816 int i, p, recno; 8817 8818 recno = (int)GET(rcode, 1); 8819 if (recno == 0) rgroup = codestart; else 8820 { 8821 PCRE2_SPTR search_from = codestart; 8822 rgroup = NULL; 8823 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) 8824 { 8825 if (recno == rc[p].recno) 8826 { 8827 rgroup = rc[p].group; 8828 break; 8829 } 8830 8831 /* Group n+1 must always start to the right of group n, so we can save 8832 search time below when the new group number is greater than any of the 8833 previously found groups. */ 8834 8835 if (recno > rc[p].recno) search_from = rc[p].group; 8836 } 8837 8838 if (rgroup == NULL) 8839 { 8840 rgroup = PRIV(find_bracket)(search_from, utf, recno); 8841 if (rgroup == NULL) 8842 { 8843 errorcode = ERR53; 8844 break; 8845 } 8846 if (--start < 0) start = RSCAN_CACHE_SIZE - 1; 8847 rc[start].recno = recno; 8848 rc[start].group = rgroup; 8849 if (ccount < RSCAN_CACHE_SIZE) ccount++; 8850 } 8851 } 8852 8853 PUT(rcode, 1, rgroup - codestart); 8854 } 8855 } 8856 8857 /* In rare debugging situations we sometimes need to look at the compiled code 8858 at this stage. */ 8859 8860 #ifdef CALL_PRINTINT 8861 pcre2_printint(re, stderr, TRUE); 8862 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength); 8863 #endif 8864 8865 /* After a successful compile, give an error if there's back reference to a 8866 non-existent capturing subpattern. Then, unless disabled, check whether any 8867 single character iterators can be auto-possessified. The function overwrites 8868 the appropriate opcode values, so the type of the pointer must be cast. 
NOTE: 8869 the intermediate variable "temp" is used in this code because at least one 8870 compiler gives a warning about loss of "const" attribute if the cast 8871 (PCRE2_UCHAR *)codestart is used directly in the function call. */ 8872 8873 if (errorcode == 0) 8874 { 8875 if (re->top_backref > re->top_bracket) errorcode = ERR15; 8876 else if ((re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) 8877 { 8878 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; 8879 if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; 8880 } 8881 } 8882 8883 /* If there were any lookbehind assertions that contained OP_RECURSE 8884 (recursions or subroutine calls), a flag is set for them to be checked here, 8885 because they may contain forward references. Actual recursions cannot be fixed 8886 length, but subroutine calls can. It is done like this so that those without 8887 OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The 8888 exceptional ones forgo this. We scan the pattern to check that they are fixed 8889 length, and set their lengths. */ 8890 8891 if (errorcode == 0 && cb.check_lookbehind) 8892 { 8893 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart; 8894 8895 /* Loop, searching for OP_REVERSE items, and process those that do not have 8896 their length set. (Actually, it will also re-process any that have a length 8897 of zero, but that is a pathological case, and it does no harm.) When we find 8898 one, we temporarily terminate the branch it is in while we scan it. Note that 8899 calling find_bracket() with a negative group number returns a pointer to the 8900 OP_REVERSE item, not the actual lookbehind. 
*/ 8901 8902 for (cc = (PCRE2_UCHAR *)PRIV(find_bracket)(codestart, utf, -1); 8903 cc != NULL; 8904 cc = (PCRE2_UCHAR *)PRIV(find_bracket)(cc, utf, -1)) 8905 { 8906 if (GET(cc, 1) == 0) 8907 { 8908 int fixed_length; 8909 int count = 0; 8910 PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE); 8911 int end_op = *be; 8912 *be = OP_END; 8913 fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL, &count); 8914 *be = end_op; 8915 if (fixed_length < 0) 8916 { 8917 errorcode = fixed_length_errors[-fixed_length]; 8918 break; 8919 } 8920 if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length; 8921 PUT(cc, 1, fixed_length); 8922 } 8923 cc += 1 + LINK_SIZE; 8924 } 8925 8926 /* The previous value of the maximum lookbehind was transferred to the 8927 compiled regex block above. We could have updated this value in the loop 8928 above, but keep the two values in step, just in case some later code below 8929 uses the cb value. */ 8930 8931 re->max_lookbehind = cb.max_lookbehind; 8932 } 8933 8934 /* Failed to compile, or error while post-processing. Earlier errors get here 8935 via the dreaded goto. */ 8936 8937 if (errorcode != 0) 8938 { 8939 HAD_ERROR: 8940 *erroroffset = (int)(ptr - pattern); 8941 HAD_UTF_ERROR: 8942 *errorptr = errorcode; 8943 pcre2_code_free(re); 8944 re = NULL; 8945 goto EXIT; 8946 } 8947 8948 /* Successful compile. If the anchored option was not passed, set it if 8949 we can determine that the pattern is anchored by virtue of ^ characters or \A 8950 or anything else, such as starting with non-atomic .* when DOTALL is set and 8951 there are no occurrences of *PRUNE or *SKIP (though there is an option to 8952 disable this case). 
*/ 8953 8954 if ((re->overall_options & PCRE2_ANCHORED) == 0 && 8955 is_anchored(codestart, 0, &cb, 0)) 8956 re->overall_options |= PCRE2_ANCHORED; 8957 8958 /* If the pattern is still not anchored and we do not have a first code unit, 8959 see if there is one that is asserted (these are not saved during the compile 8960 because they can cause conflicts with actual literals that follow). This code 8961 need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would 8962 create will not be used. */ 8963 8964 if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0) 8965 { 8966 if (firstcuflags < 0) 8967 firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE); 8968 8969 /* Save the data for a first code unit. */ 8970 8971 if (firstcuflags >= 0) 8972 { 8973 re->first_codeunit = firstcu; 8974 re->flags |= PCRE2_FIRSTSET; 8975 8976 /* Handle caseless first code units. */ 8977 8978 if ((firstcuflags & REQ_CASELESS) != 0) 8979 { 8980 if (firstcu < 128 || (!utf && firstcu < 255)) 8981 { 8982 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; 8983 } 8984 8985 /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In 8986 8-bit UTF mode, codepoints in the range 128-255 are introductory code 8987 points and cannot have another case. In 16-bit and 32-bit modes, we can 8988 check wide characters when UTF (and therefore UCP) is supported. */ 8989 8990 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 8991 else if (firstcu <= MAX_UTF_CODE_POINT && 8992 UCD_OTHERCASE(firstcu) != firstcu) 8993 re->flags |= PCRE2_FIRSTCASELESS; 8994 #endif 8995 } 8996 } 8997 8998 /* When there is no first code unit, see if we can set the PCRE2_STARTLINE 8999 flag. This is helpful for multiline matches when all branches start with ^ 9000 and also when all branches start with non-atomic .* for non-DOTALL matches 9001 when *PRUNE and SKIP are not present. (There is an option that disables this 9002 case.) 
*/ 9003 9004 else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE; 9005 } 9006 9007 /* Handle the "required code unit", if one is set. In the case of an anchored 9008 pattern, do this only if it follows a variable length item in the pattern. 9009 Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */ 9010 9011 if (reqcuflags >= 0 && 9012 ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 || 9013 (reqcuflags & REQ_VARY) != 0)) 9014 { 9015 re->last_codeunit = reqcu; 9016 re->flags |= PCRE2_LASTSET; 9017 9018 /* Handle caseless required code units as for first code units (above). */ 9019 9020 if ((reqcuflags & REQ_CASELESS) != 0) 9021 { 9022 if (reqcu < 128 || (!utf && reqcu < 255)) 9023 { 9024 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; 9025 } 9026 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 9027 else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) 9028 re->flags |= PCRE2_LASTCASELESS; 9029 #endif 9030 } 9031 } 9032 9033 /* Check for a pattern than can match an empty string, so that this information 9034 can be provided to applications. */ 9035 9036 do 9037 { 9038 int count = 0; 9039 int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count); 9040 if (rc < 0) 9041 { 9042 errorcode = ERR86; 9043 goto HAD_ERROR; 9044 } 9045 if (rc > 0) 9046 { 9047 re->flags |= PCRE2_MATCH_EMPTY; 9048 break; 9049 } 9050 codestart += GET(codestart, 1); 9051 } 9052 while (*codestart == OP_ALT); 9053 9054 /* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern 9055 to set up information such as a bitmap of starting code units and a minimum 9056 matching length. */ 9057 9058 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && 9059 PRIV(study)(re) != 0) 9060 { 9061 errorcode = ERR31; 9062 goto HAD_ERROR; 9063 } 9064 9065 /* Control ends up here in all cases. 
If memory was obtained for a 9066 zero-terminated copy of the pattern, remember to free it before returning. Also 9067 free the list of named groups if a larger one had to be obtained, and likewise 9068 the group information vector. */ 9069 9070 EXIT: 9071 if (copied_pattern != stack_copied_pattern) 9072 ccontext->memctl.free(copied_pattern, ccontext->memctl.memory_data); 9073 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) 9074 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); 9075 if (cb.groupinfo != c32workspace) 9076 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data); 9077 9078 return re; /* Will be NULL after an error */ 9079 } 9080 9081 /* End of pcre2_compile.c */ 9082