Home | History | Annotate | Download | only in src
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9      Original API code Copyright (c) 1997-2012 University of Cambridge
     10           New API code Copyright (c) 2016-2018 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 
     42 /* This module contains mode-dependent macro and structure definitions. The
     43 file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
     44 These mode-dependent items are kept in a separate file so that they can also be
     45 #included multiple times for different code unit widths by pcre2test in order
     46 to have access to the hidden structures at all supported widths.
     47 
     48 Some of the mode-dependent macros are required at different widths for
     49 different parts of the pcre2test code (in particular, the included
     50 pcre2_printint.c file). We undefine them here so that they can be re-defined for
     51 multiple inclusions. Not all of these are used in pcre2test, but it's easier
     52 just to undefine them all. */
     53 
     54 #undef ACROSSCHAR
     55 #undef BACKCHAR
     56 #undef BYTES2CU
     57 #undef CHMAX_255
     58 #undef CU2BYTES
     59 #undef FORWARDCHAR
     60 #undef FORWARDCHARTEST
     61 #undef GET
     62 #undef GET2
     63 #undef GETCHAR
     64 #undef GETCHARINC
     65 #undef GETCHARINCTEST
     66 #undef GETCHARLEN
     67 #undef GETCHARLENTEST
     68 #undef GETCHARTEST
     69 #undef GET_EXTRALEN
     70 #undef HAS_EXTRALEN
     71 #undef IMM2_SIZE
     72 #undef MAX_255
     73 #undef MAX_MARK
     74 #undef MAX_PATTERN_SIZE
     75 #undef MAX_UTF_SINGLE_CU
     76 #undef NOT_FIRSTCU
     77 #undef PUT
     78 #undef PUT2
     79 #undef PUT2INC
     80 #undef PUTCHAR
     81 #undef PUTINC
     82 #undef TABLE_GET
     83 
     84 
     85 
     86 /* -------------------------- MACROS ----------------------------- */
     87 
     88 /* PCRE keeps offsets in its compiled code as at least 16-bit quantities
     89 (always stored in big-endian order in 8-bit mode) by default. These are used,
     90 for example, to link from the start of a subpattern to its alternatives and its
     91 end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
     92 to around 64K, which is big enough for almost everybody. However, I received a
     93 request for an even bigger limit. For this reason, and also to make the code
     94 easier to maintain, the storing and loading of offsets from the compiled code
     95 unit string is now handled by the macros that are defined here.
     96 
     97 The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
     98 values of 3 or 4 are also supported. */
     99 
    100 /* ------------------- 8-bit support  ------------------ */
    101 
    102 #if PCRE2_CODE_UNIT_WIDTH == 8   /* The multi-unit PUT/GET forms below evaluate a and n more than once */
    103 
    104 #if LINK_SIZE == 2   /* Two-byte offsets, most significant byte first */
    105 #define PUT(a,n,d)   \
    106   (a[n] = (PCRE2_UCHAR)((d) >> 8)), \
    107   (a[(n)+1] = (PCRE2_UCHAR)((d) & 255))
    108 #define GET(a,n) \
    109   (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
    110 #define MAX_PATTERN_SIZE (1 << 16)   /* 65536 */
    111 
    112 #elif LINK_SIZE == 3   /* Three-byte offsets, most significant byte first */
    113 #define PUT(a,n,d)       \
    114   (a[n] = (PCRE2_UCHAR)((d) >> 16)),    \
    115   (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
    116   (a[(n)+2] = (PCRE2_UCHAR)((d) & 255))
    117 #define GET(a,n) \
    118   (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
    119 #define MAX_PATTERN_SIZE (1 << 24)
    120 
    121 #elif LINK_SIZE == 4   /* Four-byte offsets, most significant byte first */
    122 #define PUT(a,n,d)        \
    123   (a[n] = (PCRE2_UCHAR)((d) >> 24)),     \
    124   (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
    125   (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)),  \
    126   (a[(n)+3] = (PCRE2_UCHAR)((d) & 255))
    127 #define GET(a,n) \
    128   (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
    129 #define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
    130 
    131 #else
    132 #error LINK_SIZE must be 2, 3, or 4
    133 #endif  /* LINK_SIZE, 8-bit code units */
    134 
    135 
    136 /* ------------------- 16-bit support  ------------------ */
    137 
    138 #elif PCRE2_CODE_UNIT_WIDTH == 16
    139 
    140 #if LINK_SIZE == 2
    141 #undef LINK_SIZE
    142 #define LINK_SIZE 1   /* Redefined: an offset now occupies one 16-bit code unit */
    143 #define PUT(a,n,d)   \
    144   (a[n] = (PCRE2_UCHAR)(d))
    145 #define GET(a,n) \
    146   (a[n])
    147 #define MAX_PATTERN_SIZE (1 << 16)   /* 65536 */
    148 
    149 #elif LINK_SIZE == 3 || LINK_SIZE == 4   /* Both settings are handled identically here */
    150 #undef LINK_SIZE
    151 #define LINK_SIZE 2   /* Redefined: an offset now occupies two 16-bit code units */
    152 #define PUT(a,n,d)   \
    153   (a[n] = (PCRE2_UCHAR)((d) >> 16)), \
    154   (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535))
    155 #define GET(a,n) \
    156   (unsigned int)(((a)[n] << 16) | (a)[(n)+1])
    157 #define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
    158 
    159 #else
    160 #error LINK_SIZE must be 2, 3, or 4
    161 #endif  /* LINK_SIZE, 16-bit code units */
    162 
    163 
    164 /* ------------------- 32-bit support  ------------------ */
    165 
    166 #elif PCRE2_CODE_UNIT_WIDTH == 32
    167 #undef LINK_SIZE   /* Whatever LINK_SIZE was configured, it is replaced below */
    168 #define LINK_SIZE 1   /* An offset occupies one 32-bit code unit */
    169 #define PUT(a,n,d)   \
    170   (a[n] = (d))
    171 #define GET(a,n) \
    172   (a[n])
    173 #define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
    174 
    175 #else
    176 #error Unsupported compiling mode
    177 #endif  /* PCRE2_CODE_UNIT_WIDTH */
    178 
    179 
    180 /* --------------- Other mode-specific macros ----------------- */
    181 
    182 /* PCRE uses some other (at least) 16-bit quantities that do not change when
    183 the size of offsets changes. These are used for repeat counts and for other
    184 things such as capturing parenthesis numbers in back references.
    185 
    186 Define the number of code units required to hold a 16-bit count/offset, and
    187 macros to load and store such a value. For reasons that I do not understand,
    188 the expression in the 8-bit GET2 macro is treated by gcc as a signed
    189 expression, even when a is declared as unsigned. It seems that any kind of
    190 arithmetic results in a signed value. Hence the cast. */
    191 
    192 #if PCRE2_CODE_UNIT_WIDTH == 8
    193 #define IMM2_SIZE 2   /* A 16-bit value takes two 8-bit code units */
    194 #define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])   /* High byte first; a and n are evaluated twice */
    195 #define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255   /* Expands to an unparenthesized comma expression */
    196 
    197 #else  /* Code units are 16 or 32 bits */
    198 #define IMM2_SIZE 1   /* A 16-bit value fits in one code unit */
    199 #define GET2(a,n) a[n]
    200 #define PUT2(a,n,d) a[n] = d   /* Neither a nor d is parenthesized in the expansion */
    201 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
    202 
    203 /* Other macros that are different for 8-bit mode. The MAX_255 macro checks
    204 whether its argument, which is assumed to be one code unit, is less than 256.
    205 The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK
    206 name must fit in one code unit; currently it is set to 255 or 65535. The
    207 TABLE_GET macro is used to access elements of tables containing exactly 256
    208 items. When code points can be greater than 255, a check is needed before
    209 accessing these tables. */
    210 
    211 #if PCRE2_CODE_UNIT_WIDTH == 8
    212 #define MAX_255(c) TRUE   /* A single 8-bit code unit is always below 256 */
    213 #define MAX_MARK ((1u << 8) - 1)   /* 255 */
    214 #ifdef SUPPORT_UNICODE
    215 #define SUPPORT_WIDE_CHARS
    216 #define CHMAX_255(c) ((c) <= 255u)   /* c need not be a single code unit, so it must be tested */
    217 #else
    218 #define CHMAX_255(c) TRUE
    219 #endif  /* SUPPORT_UNICODE */
    220 #define TABLE_GET(c, table, default) ((table)[c])   /* No range check; default is unused */
    221 
    222 #else  /* Code units are 16 or 32 bits */
    223 #define CHMAX_255(c) ((c) <= 255u)
    224 #define MAX_255(c) ((c) <= 255u)
    225 #define MAX_MARK ((1u << 16) - 1)   /* 65535 */
    226 #define SUPPORT_WIDE_CHARS
    227 #define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))   /* c may be evaluated twice */
    228 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
    229 
    230 
    231 
    232 /* ----------------- Character-handling macros ----------------- */
    233 
    234 /* There is a proposed future special "UTF-21" mode, in which only the lowest
    235 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
    236 high-order bits available to the application for other uses. In preparation for
    237 the future implementation of this mode, there are macros that load a data item
    238 and, if in this special mode, mask it to 21 bits. These macros all have names
    239 starting with UCHAR21. In all other modes, including the normal 32-bit
    240 library, the macros all have the same simple definitions. When the new mode is
    241 implemented, it is expected that these definitions will be varied appropriately
    242 using #ifdef when compiling the library that supports the special mode. */
    243 
    244 #define UCHAR21(eptr)        (*(eptr))      /* Load the code unit at eptr; eptr is not advanced */
    245 #define UCHAR21TEST(eptr)    (*(eptr))      /* Currently the same expansion as UCHAR21 */
    246 #define UCHAR21INC(eptr)     (*(eptr)++)    /* Load the code unit at eptr, then post-increment eptr */
    247 #define UCHAR21INCTEST(eptr) (*(eptr)++)    /* Currently the same expansion as UCHAR21INC */
    248 
    249 /* When UTF encoding is being used, a character is no longer just a single
    250 byte in 8-bit mode or a single short in 16-bit mode. The macros for character
    251 handling generate simple sequences when used in the basic mode, and more
    252 complicated ones for UTF characters. GETCHARLENTEST and other macros are not
    253 used when UTF is not supported. To make sure they can never even appear when
    254 UTF support is omitted, we don't even define them. */
    255 
    256 #ifndef SUPPORT_UNICODE
    257 
    258 /* #define MAX_UTF_SINGLE_CU */
    259 /* #define HAS_EXTRALEN(c) */
    260 /* #define GET_EXTRALEN(c) */
    261 /* #define NOT_FIRSTCU(c) */
    262 #define GETCHAR(c, eptr) c = *eptr;              /* Expansion already ends in ';' */
    263 #define GETCHARTEST(c, eptr) c = *eptr;          /* Same expansion as GETCHAR in this build */
    264 #define GETCHARINC(c, eptr) c = *eptr++;         /* Load, then advance eptr by one code unit */
    265 #define GETCHARINCTEST(c, eptr) c = *eptr++;     /* Same expansion as GETCHARINC in this build */
    266 #define GETCHARLEN(c, eptr, len) c = *eptr;      /* len is left untouched */
    267 #define PUTCHAR(c, p) (*p = c, 1)                /* Stores c at *p; the expression's value is 1 */
    268 /* #define GETCHARLENTEST(c, eptr, len) */
    269 /* #define BACKCHAR(eptr) */
    270 /* #define FORWARDCHAR(eptr) */
    271 /* #define FORWARDCHARTEST(eptr,end) */
    272 /* #define ACROSSCHAR(condition, eptr, action) */
    273 
    274 #else   /* SUPPORT_UNICODE */
    275 
    276 /* ------------------- 8-bit support  ------------------ */
    277 
    278 #if PCRE2_CODE_UNIT_WIDTH == 8
    279 #define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */
    280 
    281 /* The largest UTF code point that can be encoded as a single code unit. */
    282 
    283 #define MAX_UTF_SINGLE_CU 127
    284 
    285 /* Tests whether the code point needs extra characters to decode. */
    286 
    287 #define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)
    288 
    289 /* Returns the number of additional code units if HAS_EXTRALEN(c) is TRUE.
    290 Otherwise the behaviour is undefined. */
    291 
    292 #define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])
    293 
    294 /* Returns TRUE, if the given value is not the first code unit of a UTF
    295 sequence. */
    296 
    297 #define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)
    298 
    299 /* Get the next UTF-8 character, not advancing the pointer. This is called when
    300 we know we are in UTF-8 mode. */
    301 
    302 #define GETCHAR(c, eptr) \
    303   c = *eptr; \
    304   if (c >= 0xc0u) GETUTF8(c, eptr);
    305 
    306 /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
    307 pointer. */
    308 
    309 #define GETCHARTEST(c, eptr) \
    310   c = *eptr; \
    311   if (utf && c >= 0xc0u) GETUTF8(c, eptr);
    312 
    313 /* Get the next UTF-8 character, advancing the pointer. This is called when we
    314 know we are in UTF-8 mode. */
    315 
    316 #define GETCHARINC(c, eptr) \
    317   c = *eptr++; \
    318   if (c >= 0xc0u) GETUTF8INC(c, eptr);
    319 
    320 /* Get the next character, testing for UTF-8 mode, and advancing the pointer.
    321 This is called when we don't know if we are in UTF-8 mode. */
    322 
    323 #define GETCHARINCTEST(c, eptr) \
    324   c = *eptr++; \
    325   if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);
    326 
    327 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
    328 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
    329 
    330 #define GETCHARLEN(c, eptr, len) \
    331   c = *eptr; \
    332   if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);
    333 
    334 /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
    335 pointer, incrementing length if there are extra bytes. This is called when we
    336 do not know if we are in UTF-8 mode. */
    337 
    338 #define GETCHARLENTEST(c, eptr, len) \
    339   c = *eptr; \
    340   if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);
    341 
    342 /* If the pointer is not at the start of a character, move it back until
    343 it is. This is called only in UTF-8 mode - we don't put a test within the macro
    344 because almost all calls are already within a block of UTF-8 only code. */
    345 
    346 #define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr--
    347 
    348 /* Same as above, just in the other direction. */
    349 #define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++
    350 #define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++
    351 
    352 /* Same as above, but it allows a fully customizable form. */
    353 #define ACROSSCHAR(condition, eptr, action) \
    354   while((condition) && ((*eptr) & 0xc0u) == 0x80u) action
    355 
    356 /* Deposit a character into memory, returning the number of code units. */
    357 
    358 #define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
    359   PRIV(ord2utf)(c,p) : (*p = c, 1))
    360 
    361 
    362 /* ------------------- 16-bit support  ------------------ */
    363 
    364 #elif PCRE2_CODE_UNIT_WIDTH == 16
    365 #define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */
    366 
    367 /* The largest UTF code point that can be encoded as a single code unit. */
    368 
    369 #define MAX_UTF_SINGLE_CU 65535
    370 
    371 /* Tests whether the code point needs extra characters to decode. */
    372 
    373 #define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)
    374 
    375 /* Returns the number of additional code units if HAS_EXTRALEN(c) is TRUE.
    376 Otherwise the behaviour is undefined. */
    377 
    378 #define GET_EXTRALEN(c) 1
    379 
    380 /* Returns TRUE, if the given value is not the first code unit of a UTF
    381 sequence. */
    382 
    383 #define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)
    384 
    385 /* Base macro to pick up the low surrogate of a UTF-16 character, not
    386 advancing the pointer. */
    387 
    388 #define GETUTF16(c, eptr) \
    389    { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; }
    390 
    391 /* Get the next UTF-16 character, not advancing the pointer. This is called when
    392 we know we are in UTF-16 mode. */
    393 
    394 #define GETCHAR(c, eptr) \
    395   c = *eptr; \
    396   if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
    397 
    398 /* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
    399 pointer. */
    400 
    401 #define GETCHARTEST(c, eptr) \
    402   c = *eptr; \
    403   if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
    404 
    405 /* Base macro to pick up the low surrogate of a UTF-16 character, advancing
    406 the pointer. */
    407 
    408 #define GETUTF16INC(c, eptr) \
    409    { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; }
    410 
    411 /* Get the next UTF-16 character, advancing the pointer. This is called when we
    412 know we are in UTF-16 mode. */
    413 
    414 #define GETCHARINC(c, eptr) \
    415   c = *eptr++; \
    416   if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
    417 
    418 /* Get the next character, testing for UTF-16 mode, and advancing the pointer.
    419 This is called when we don't know if we are in UTF-16 mode. */
    420 
    421 #define GETCHARINCTEST(c, eptr) \
    422   c = *eptr++; \
    423   if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
    424 
    425 /* Base macro to pick up the low surrogate of a UTF-16 character, not
    426 advancing the pointer, incrementing the length. */
    427 
    428 #define GETUTF16LEN(c, eptr, len) \
    429    { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; }
    430 
    431 /* Get the next UTF-16 character, not advancing the pointer, incrementing
    432 length if there is a low surrogate. This is called when we know we are in
    433 UTF-16 mode. */
    434 
    435 #define GETCHARLEN(c, eptr, len) \
    436   c = *eptr; \
    437   if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
    438 
    439 /* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
    440 pointer, incrementing length if there is a low surrogate. This is called when
    441 we do not know if we are in UTF-16 mode. */
    442 
    443 #define GETCHARLENTEST(c, eptr, len) \
    444   c = *eptr; \
    445   if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
    446 
    447 /* If the pointer is not at the start of a character, move it back until
    448 it is. This is called only in UTF-16 mode - we don't put a test within the
    449 macro because almost all calls are already within a block of UTF-16 only
    450 code. */
    451 
    452 #define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr--
    453 
    454 /* Same as above, just in the other direction. */
    455 #define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++
    456 #define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++
    457 
    458 /* Same as above, but it allows a fully customizable form. */
    459 #define ACROSSCHAR(condition, eptr, action) \
    460   if ((condition) && ((*eptr) & 0xfc00u) == 0xdc00u) action
    461 
    462 /* Deposit a character into memory, returning the number of code units. */
    463 
    464 #define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
    465   PRIV(ord2utf)(c,p) : (*p = c, 1))
    466 
    467 
    468 /* ------------------- 32-bit support  ------------------ */
    469 
    470 #else
    471 
    472 /* These are trivial for the 32-bit library, since all UTF-32 characters fit
    473 into one PCRE2_UCHAR unit. */
    474 
    475 #define MAX_UTF_SINGLE_CU (0x10ffffu)
    476 #define HAS_EXTRALEN(c) (0)
    477 #define GET_EXTRALEN(c) (0)
    478 #define NOT_FIRSTCU(c) (0)
    479 
    480 /* Get the next UTF-32 character, not advancing the pointer. This is called when
    481 we know we are in UTF-32 mode. */
    482 
    483 #define GETCHAR(c, eptr) \
    484   c = *(eptr);
    485 
    486 /* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
    487 pointer. */
    488 
    489 #define GETCHARTEST(c, eptr) \
    490   c = *(eptr);
    491 
    492 /* Get the next UTF-32 character, advancing the pointer. This is called when we
    493 know we are in UTF-32 mode. */
    494 
    495 #define GETCHARINC(c, eptr) \
    496   c = *((eptr)++);
    497 
    498 /* Get the next character, testing for UTF-32 mode, and advancing the pointer.
    499 This is called when we don't know if we are in UTF-32 mode. */
    500 
    501 #define GETCHARINCTEST(c, eptr) \
    502   c = *((eptr)++);
    503 
    504 /* Get the next UTF-32 character, not advancing the pointer, not incrementing
    505 length (since all UTF-32 is of length 1). This is called when we know we are in
    506 UTF-32 mode. */
    507 
    508 #define GETCHARLEN(c, eptr, len) \
    509   GETCHAR(c, eptr)
    510 
    511 /* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
    512 pointer, not incrementing the length (since all UTF-32 is of length 1).
    513 This is called when we do not know if we are in UTF-32 mode. */
    514 
    515 #define GETCHARLENTEST(c, eptr, len) \
    516   GETCHARTEST(c, eptr)
    517 
    518 /* If the pointer is not at the start of a character, move it back until
    519 it is. This is called only in UTF-32 mode - we don't put a test within the
    520 macro because almost all calls are already within a block of UTF-32 only
    521 code.
    522 
    523 These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */
    524 
    525 #define BACKCHAR(eptr) do { } while (0)
    526 
    527 /* Same as above, just in the other direction. */
    528 
    529 #define FORWARDCHAR(eptr) do { } while (0)
    530 #define FORWARDCHARTEST(eptr,end) do { } while (0)
    531 
    532 /* Same as above, but it allows a fully customizable form. */
    533 
    534 #define ACROSSCHAR(condition, eptr, action) do { } while (0)
    535 
    536 /* Deposit a character into memory, returning the number of code units. */
    537 
    538 #define PUTCHAR(c, p) (*p = c, 1)
    539 
    540 #endif  /* UTF-32 character handling */
    541 #endif  /* SUPPORT_UNICODE */
    542 
    543 
    544 /* Mode-dependent macros that have the same definition in all modes. */
    545 
    546 #define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))   /* Code-unit count -> byte count */
    547 #define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))   /* Byte count -> code-unit count (integer division) */
    548 #define PUTINC(a,n,d)   PUT(a,n,d), a += LINK_SIZE          /* Store a link value, then advance a by LINK_SIZE */
    549 #define PUT2INC(a,n,d)  PUT2(a,n,d), a += IMM2_SIZE         /* Store a 16-bit value, then advance a by IMM2_SIZE */
    550 
    551 
    552 /* ----------------------- HIDDEN STRUCTURES ----------------------------- */
    553 
    554 /* NOTE: All these structures *must* start with a pcre2_memctl structure. The
    555 code that uses them is simpler because it assumes this. */
    556 
    557 /* The real general context structure. At present it holds only data for custom
    558 memory control. */
    559 
    560 typedef struct pcre2_real_general_context {
    561   pcre2_memctl memctl;   /* Custom memory control data; must be the first member */
    562 } pcre2_real_general_context;
    563 
    564 /* The real compile context structure */
    565 
    566 typedef struct pcre2_real_compile_context {
    567   pcre2_memctl memctl;                   /* Must be the first member */
    568   int (*stack_guard)(uint32_t, void *);  /* Callback pointer; NOTE(review): meaning of its arguments and result is not visible here */
    569   void *stack_guard_data;                /* Presumably the void * handed to stack_guard -- confirm */
    570   const uint8_t *tables;                 /* Presumably the character tables, as in pcre2_real_code -- confirm */
    571   PCRE2_SIZE max_pattern_length;         /* NOTE(review): units (code units?) not visible here -- confirm */
    572   uint16_t bsr_convention;               /* Presumably what \R matches, as in pcre2_real_code -- confirm */
    573   uint16_t newline_convention;           /* Presumably the newline convention, as in pcre2_real_code -- confirm */
    574   uint32_t parens_nest_limit;            /* Presumably a cap on parenthesis nesting depth -- confirm */
    575   uint32_t extra_options;                /* pcre2_real_code.extra_options is documented as taken from here */
    576 } pcre2_real_compile_context;
    577 
    578 /* The real match context structure. */
    579 
    580 typedef struct pcre2_real_match_context {
    581   pcre2_memctl memctl;                 /* Must be the first member */
    582 #ifdef SUPPORT_JIT
    583   pcre2_jit_callback jit_callback;     /* Member exists only when SUPPORT_JIT is defined */
    584   void *jit_callback_data;             /* Member exists only when SUPPORT_JIT is defined */
    585 #endif  /* SUPPORT_JIT */
    586   int    (*callout)(pcre2_callout_block *, void *);  /* Callout function pointer */
    587   void    *callout_data;               /* Presumably the void * handed to callout -- confirm */
    588   PCRE2_SIZE offset_limit;             /* NOTE(review): semantics not visible here */
    589   uint32_t heap_limit;                 /* Context-level limit; cf. limit_heap in pcre2_real_code (set in the pattern) */
    590   uint32_t match_limit;                /* Context-level limit; cf. limit_match in pcre2_real_code */
    591   uint32_t depth_limit;                /* Context-level limit; cf. limit_depth in pcre2_real_code */
    592 } pcre2_real_match_context;
    593 
    594 /* The real convert context structure. */
    595 
    596 typedef struct pcre2_real_convert_context {
    597   pcre2_memctl memctl;        /* Must be the first member */
    598   uint32_t glob_separator;    /* NOTE(review): presumably a character value -- confirm */
    599   uint32_t glob_escape;       /* NOTE(review): presumably a character value -- confirm */
    600 } pcre2_real_convert_context;
    601 
    602 /* The real compiled code structure. The type for the blocksize field is
    603 defined specially because it is required in pcre2_serialize_decode() when
    604 copying the size from possibly unaligned memory into a variable of the same
    605 type. Use a macro rather than a typedef to avoid compiler warnings when this
    606 file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
    607 largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
    608 argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
    609 here.) */
    610 
    611 #undef  CODE_BLOCKSIZE_TYPE
    612 #define CODE_BLOCKSIZE_TYPE size_t
    613 
    614 #undef  LOOKBEHIND_MAX
    615 #define LOOKBEHIND_MAX UINT16_MAX
    616 
    617 typedef struct pcre2_real_code {
    618   pcre2_memctl memctl;            /* Memory control fields (must be first) */
    619   const uint8_t *tables;          /* The character tables */
    620   void    *executable_jit;        /* Pointer to JIT code */
    621   uint8_t  start_bitmap[32];      /* Bitmap (32 * 8 = 256 bits) for starting code unit < 256 */
    622   CODE_BLOCKSIZE_TYPE blocksize;  /* Total (bytes) that was malloc-ed */
    623   uint32_t magic_number;          /* Paranoid and endianness check */
    624   uint32_t compile_options;       /* Options passed to pcre2_compile() */
    625   uint32_t overall_options;       /* Options after processing the pattern */
    626   uint32_t extra_options;         /* Taken from compile_context */
    627   uint32_t flags;                 /* Various state flags */
    628   uint32_t limit_heap;            /* Limit set in the pattern */
    629   uint32_t limit_match;           /* Limit set in the pattern */
    630   uint32_t limit_depth;           /* Limit set in the pattern */
    631   uint32_t first_codeunit;        /* Starting code unit */
    632   uint32_t last_codeunit;         /* This codeunit must be seen */
    633   uint16_t bsr_convention;        /* What \R matches */
    634   uint16_t newline_convention;    /* What is a newline? */
    635   uint16_t max_lookbehind;        /* Longest lookbehind (characters); see LOOKBEHIND_MAX */
    636   uint16_t minlength;             /* Minimum length of match */
    637   uint16_t top_bracket;           /* Highest numbered group */
    638   uint16_t top_backref;           /* Highest numbered back reference */
    639   uint16_t name_entry_size;       /* Size (code units) of table entries */
    640   uint16_t name_count;            /* Number of name entries in the table */
    641 } pcre2_real_code;
    642 
    643 /* The real match data structure. Define ovector as large as it can ever
    644 actually be so that array bound checkers don't grumble. Memory for this
    645 structure is obtained by calling pcre2_match_data_create(), which sets the size
    646 as the offset of ovector plus a pair of elements for each capturable string, so
    647 the size varies from call to call. As the maximum number of capturing
    648 subpatterns is 65535 we must allow for 65536 strings to include the overall
    649 match. (See also the heapframe structure below.) */
    650 
    651 typedef struct pcre2_real_match_data {
    652   pcre2_memctl     memctl;        /* Memory control fields (must be first) */
    653   const pcre2_real_code *code;    /* The pattern used for the match */
    654   PCRE2_SPTR       subject;       /* The subject that was matched */
    655   PCRE2_SPTR       mark;          /* Pointer to last mark */
    656   PCRE2_SIZE       leftchar;      /* Offset to leftmost code unit */
    657   PCRE2_SIZE       rightchar;     /* Offset to rightmost code unit */
    658   PCRE2_SIZE       startchar;     /* Offset to starting code unit */
    659   uint16_t         matchedby;     /* Type of match (normal, JIT, DFA) */
    660   uint16_t         oveccount;     /* Number of pairs */
    661   int              rc;            /* The return code from the match */
    662   PCRE2_SIZE       ovector[131072]; /* 65536 pairs declared (actual allocation varies); must be last in the structure */
    663 } pcre2_real_match_data;
    664 
    665 
    666 /* ----------------------- PRIVATE STRUCTURES ----------------------------- */
    667 
    668 /* These structures are not needed for pcre2test. */
    669 
    670 #ifndef PCRE2_PCRE2TEST
    671 
    672 /* Structures for checking for mutual recursion when scanning compiled or
    673 parsed code. */
    674 
    675 typedef struct recurse_check {
    676   struct recurse_check *prev;   /* Link to another recurse_check entry, forming a chain */
    677   PCRE2_SPTR group;             /* Presumably points at a group in compiled code -- confirm */
    678 } recurse_check;
    679 
    680 typedef struct parsed_recurse_check {
    681   struct parsed_recurse_check *prev;   /* Link to another parsed_recurse_check entry, forming a chain */
    682   uint32_t *groupptr;                  /* Presumably points at a group in the parsed pattern -- confirm */
    683 } parsed_recurse_check;
    684 
    685 /* Structure for building a cache when filling in recursion offsets. */
    686 
    687 typedef struct recurse_cache {
    688   PCRE2_SPTR group;    /* Presumably the cached location of a group -- confirm */
    689   int groupnumber;     /* Presumably the number of that group -- confirm */
    690 } recurse_cache;
    691 
    692 /* Structure for maintaining a chain of pointers to the currently incomplete
    693 branches, for testing for left recursion while compiling. */
    694 
    695 typedef struct branch_chain {
    696   struct branch_chain *outer;     /* Next entry in the chain (presumably the enclosing branch -- confirm) */
    697   PCRE2_UCHAR *current_branch;    /* A currently incomplete branch */
    698 } branch_chain;
    699 
    700 /* Structure for building a list of named groups during the first pass of
    701 compiling. */
    702 
    703 typedef struct named_group {
    704   PCRE2_SPTR   name;          /* Points to the name in the pattern */
    705   uint32_t     number;        /* Group number */
    706   uint16_t     length;        /* Length of the name (presumably in code units -- confirm) */
    707   uint16_t     isdup;         /* TRUE if a duplicate (a flag held in a uint16_t) */
    708 } named_group;
    709 
    710 /* Structure for passing "static" information around between the functions
    711 doing the compiling, so that they are thread-safe. */
    712 
    713 typedef struct compile_block {
    714   pcre2_real_compile_context *cx;  /* Points to the compile context */
    715   const uint8_t *lcc;              /* Points to lower casing table */
    716   const uint8_t *fcc;              /* Points to case-flipping table */
    717   const uint8_t *cbits;            /* Points to character type table */
    718   const uint8_t *ctypes;           /* Points to table of type maps */
    719   PCRE2_SPTR start_workspace;      /* The start of working space */
    720   PCRE2_SPTR start_code;           /* The start of the compiled code */
    721   PCRE2_SPTR start_pattern;        /* The start of the pattern */
    722   PCRE2_SPTR end_pattern;          /* The end of the pattern */
    723   PCRE2_UCHAR *name_table;         /* The name/number table */
    724   PCRE2_SIZE workspace_size;       /* Size of workspace */
    725   PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9; 10 slots, [0] presumably unused */
    726   PCRE2_SIZE erroroffset;          /* Offset of error in pattern */
    727   uint16_t names_found;            /* Number of names found so far */
    728   uint16_t name_entry_size;        /* Size of each entry in the name table */
    729   uint16_t parens_depth;           /* Depth of nested parentheses */
    730   uint16_t assert_depth;           /* Depth of nested assertions */
    731   open_capitem *open_caps;         /* Chain of open capture items */
    732   named_group *named_groups;       /* Points to vector in pre-compile */
    733   uint32_t named_group_list_size;  /* Number of entries in the named group list */
    734   uint32_t external_options;       /* External (initial) options */
    735   uint32_t external_flags;         /* External flag bits to be set */
    736   uint32_t bracount;               /* Count of capturing parentheses */
    737   uint32_t lastcapture;            /* Last capture encountered */
    738   uint32_t *parsed_pattern;        /* Parsed pattern buffer */
    739   uint32_t *parsed_pattern_end;    /* Limit: the parsed pattern must not reach here */
    740   uint32_t *groupinfo;             /* Group info vector */
    741   uint32_t top_backref;            /* Maximum back reference */
    742   uint32_t backref_map;            /* Bitmap of low back refs */
    743   uint32_t nltype;                 /* Newline type */
    744   uint32_t nllen;                  /* Newline string length */
    745   uint32_t class_range_start;      /* Overall class range start */
    746   uint32_t class_range_end;        /* Overall class range end */
    747   PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
    748   int  max_lookbehind;             /* Maximum lookbehind (characters) */
    749   int  req_varyopt;                /* "After variable item" flag for reqbyte */
    750   BOOL had_accept;                 /* (*ACCEPT) encountered */
    751   BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
    752   BOOL had_recurse;                /* Had a recursion or subroutine call */
    753   BOOL dupnames;                   /* Duplicate names exist */
    754 } compile_block;
    755 
    756 /* Structure for keeping the properties of the in-memory stack used
    757 by the JIT matcher. */
    758 
    759 typedef struct pcre2_real_jit_stack {
    760   pcre2_memctl memctl;   /* Memory control data (pcre2_memctl is declared elsewhere) */
    761   void* stack;           /* Opaque pointer for the in-memory stack */
    762 } pcre2_real_jit_stack;
    763 
    764 /* Structure for items in a linked list that represents an explicit recursive
    765 call within the pattern when running pcre2_dfa_match(). */
    766 
    767 typedef struct dfa_recursion_info {
    768   struct dfa_recursion_info *prevrec;  /* Previous item in the linked list */
    769   PCRE2_SPTR subject_position;         /* Subject position recorded for this call */
    770   uint32_t group_num;                  /* Number of the group for this recursive call */
    771 } dfa_recursion_info;
    772 
    773 /* Structure for "stack" frames that are used for remembering backtracking
    774 positions during matching. As these are used in a vector, with the ovector item
    775 being extended, the size of the structure must be a multiple of the size of PCRE2_SIZE. The
    776 only way to check this at compile time is to force an error by generating an
    777 array with a negative size. By putting this in a typedef (which is never used),
    778 we don't generate any code when all is well. */
    779 
    780 typedef struct heapframe {
    781 
    782   /* The first set of fields are variables that have to be preserved over calls
    783   to RRMATCH(), but which do not need to be copied to new frames. */
    784 
    785   PCRE2_SPTR ecode;          /* The current position in the pattern */
    786   PCRE2_SPTR temp_sptr[2];   /* Used for short-term PCRE2_SPTR values */
    787   PCRE2_SIZE length;         /* Used for character, string, or code lengths */
    788   PCRE2_SIZE back_frame;     /* Amount to subtract on RRETURN */
    789   PCRE2_SIZE temp_size;      /* Used for short-term PCRE2_SIZE values */
    790   uint32_t rdepth;           /* "Recursion" depth */
    791   uint32_t group_frame_type; /* Type information for group frames */
    792   uint32_t temp_32[4];       /* Used for short-term 32-bit or BOOL values */
    793   uint8_t return_id;         /* Where to go on in internal "return" */
    794   uint8_t op;                /* Processing opcode */
    795 
    796   /* At this point, the structure is 16-bit aligned. On most architectures
    797   the alignment requirement for a pointer will ensure that the eptr field below
    798   is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer
    799   that is 16-bit aligned. We must therefore ensure that what comes between here
    800   and eptr is an odd multiple of 16 bits so as to get back into 32-bit
    801   alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs
    802   fudges in the other cases. In the 32-bit case the padding comes first so that
    803   the occu field itself is 32-bit aligned. Without the padding, this structure
    804   is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */
    805 
    806 #if PCRE2_CODE_UNIT_WIDTH == 8
    807   PCRE2_UCHAR occu[6];       /* Used for other case code units */
    808 #elif PCRE2_CODE_UNIT_WIDTH == 16
    809   PCRE2_UCHAR occu[2];       /* Used for other case code units */
    810   uint8_t unused[2];         /* Ensure 32-bit alignment (see above) */
    811 #else
    812   uint8_t unused[2];         /* Ensure 32-bit alignment (see above) */
    813   PCRE2_UCHAR occu[1];       /* Used for other case code units */
    814 #endif
    815 
    816   /* The rest have to be copied from the previous frame whenever a new frame
    817   becomes current. The final field is specified as a large vector so that
    818   runtime array bound checks don't catch references to it. However, for any
    819   specific call to pcre2_match() the memory allocated for each frame structure
    820   allows for exactly the right size ovector for the number of capturing
    821   parentheses. (See also the comment for pcre2_real_match_data above.) */
    822 
    823   PCRE2_SPTR eptr;           /* MUST BE FIRST of the fields copied to new frames */
    824   PCRE2_SPTR start_match;    /* Can be adjusted by \K */
    825   PCRE2_SPTR mark;           /* Most recent mark on the success path */
    826   uint32_t current_recurse;  /* Current (deepest) recursion number */
    827   uint32_t capture_last;     /* Most recent capture */
    828   PCRE2_SIZE last_group_offset;  /* Saved offset to most recent group frame */
    829   PCRE2_SIZE offset_top;     /* Offset after highest capture */
    830   PCRE2_SIZE ovector[131072]; /* Must be last in the structure */
    831 } heapframe;
    832 
    833 /* This typedef is a check that the size of the heapframe structure is a
    834 multiple of sizeof(PCRE2_SIZE). See various comments above. */
    835 
    836 typedef char check_heapframe_size[
    837   ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)];
    838 
    839 /* Structure for passing "static" information around between the functions
    840 doing traditional NFA matching (pcre2_match() and friends). */
    841 
    842 typedef struct match_block {
    843   pcre2_memctl memctl;            /* For general use */
    844   PCRE2_SIZE frame_vector_size;   /* Size of a backtracking frame */
    845   heapframe *match_frames;        /* Points to vector of frames */
    846   heapframe *match_frames_top;    /* Points after the end of the vector */
    847   heapframe *stack_frames;        /* The original vector on the stack */
    848   PCRE2_SIZE heap_limit;          /* Limit on heap use */
    849   uint32_t match_limit;           /* Match limit */
    850   uint32_t match_limit_depth;     /* Match depth limit */
    851   uint32_t match_call_count;      /* Number of times a new frame is created */
    852   BOOL hitend;                    /* Hit the end of the subject at some point */
    853   BOOL hasthen;                   /* Pattern contains (*THEN) */
    854   const uint8_t *lcc;             /* Points to lower casing table */
    855   const uint8_t *fcc;             /* Points to case-flipping table */
    856   const uint8_t *ctypes;          /* Points to table of type maps */
    857   PCRE2_SIZE start_offset;        /* The start offset value */
    858   PCRE2_SIZE end_offset_top;      /* Highwater mark at end of match */
    859   uint16_t partial;               /* PARTIAL options */
    860   uint16_t bsr_convention;        /* \R interpretation */
    861   uint16_t name_count;            /* Number of names in name table */
    862   uint16_t name_entry_size;       /* Size of entry in names table */
    863   PCRE2_SPTR name_table;          /* Table of group names */
    864   PCRE2_SPTR start_code;          /* For use when recursing */
    865   PCRE2_SPTR start_subject;       /* Start of the subject string */
    866   PCRE2_SPTR end_subject;         /* End of the subject string */
    867   PCRE2_SPTR end_match_ptr;       /* Subject position at end of match */
    868   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
    869   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
    870   PCRE2_SPTR mark;                /* Mark pointer to pass back on success */
    871   PCRE2_SPTR nomatch_mark;        /* Mark pointer to pass back on failure */
    872   PCRE2_SPTR verb_ecode_ptr;      /* For passing back info */
    873   PCRE2_SPTR verb_skip_ptr;       /* For passing back a (*SKIP) name */
    874   uint32_t verb_current_recurse;  /* Current recurse when (*VERB) happens */
    875   uint32_t moptions;              /* Match options */
    876   uint32_t poptions;              /* Pattern options */
    877   uint32_t skip_arg_count;        /* For counting SKIP_ARGs */
    878   uint32_t ignore_skip_arg;       /* For re-run when SKIP arg name not found */
    879   uint32_t nltype;                /* Newline type */
    880   uint32_t nllen;                 /* Newline string length */
    881   PCRE2_UCHAR nl[4];              /* Newline string when fixed */
    882   pcre2_callout_block *cb;        /* Points to a callout block */
    883   void  *callout_data;            /* To pass back to callouts */
    884   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
    885 } match_block;
    886 
    887 /* A similar structure is used for the same purpose by the DFA matching
    888 functions. */
    889 
    890 typedef struct dfa_match_block {
    891   pcre2_memctl memctl;            /* For general use */
    892   PCRE2_SPTR start_code;          /* Start of the compiled pattern */
    893   PCRE2_SPTR start_subject ;      /* Start of the subject string */
    894   PCRE2_SPTR end_subject;         /* End of subject string */
    895   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
    896   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
    897   const uint8_t *tables;          /* Character tables */
    898   PCRE2_SIZE start_offset;        /* The start offset value */
    899   PCRE2_SIZE heap_limit;          /* Limit on heap use */
    900   PCRE2_SIZE heap_used;           /* Amount of heap used */
    901   uint32_t match_limit;           /* Match limit */
    902   uint32_t match_limit_depth;     /* Match depth limit */
    903   uint32_t match_call_count;      /* Number of calls of internal function */
    904   uint32_t moptions;              /* Match options */
    905   uint32_t poptions;              /* Pattern options */
    906   uint32_t nltype;                /* Newline type */
    907   uint32_t nllen;                 /* Newline string length */
    908   PCRE2_UCHAR nl[4];              /* Newline string when fixed */
    909   uint16_t bsr_convention;        /* \R interpretation */
    910   pcre2_callout_block *cb;        /* Points to a callout block */
    911   void *callout_data;             /* To pass back to callouts */
    912   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
    913   dfa_recursion_info *recursive;  /* Linked list of recursion data */
    914 } dfa_match_block;
    915 
    916 #endif  /* PCRE2_PCRE2TEST */
    917 
    918 /* End of pcre2_intmodedep.h */
    919