Home | History | Annotate | Download | only in src
      1 /*************************************************
      2 *      Perl-Compatible Regular Expressions       *
      3 *************************************************/
      4 
      5 /* PCRE is a library of functions to support regular expressions whose syntax
      6 and semantics are as close as possible to those of the Perl 5 language.
      7 
      8                        Written by Philip Hazel
      9      Original API code Copyright (c) 1997-2012 University of Cambridge
     10          New API code Copyright (c) 2016 University of Cambridge
     11 
     12 -----------------------------------------------------------------------------
     13 Redistribution and use in source and binary forms, with or without
     14 modification, are permitted provided that the following conditions are met:
     15 
     16     * Redistributions of source code must retain the above copyright notice,
     17       this list of conditions and the following disclaimer.
     18 
     19     * Redistributions in binary form must reproduce the above copyright
     20       notice, this list of conditions and the following disclaimer in the
     21       documentation and/or other materials provided with the distribution.
     22 
     23     * Neither the name of the University of Cambridge nor the names of its
     24       contributors may be used to endorse or promote products derived from
     25       this software without specific prior written permission.
     26 
     27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37 POSSIBILITY OF SUCH DAMAGE.
     38 -----------------------------------------------------------------------------
     39 */
     40 
     41 
     42 /* This module contains mode-dependent macro and structure definitions. The
     43 file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
     44 These mode-dependent items are kept in a separate file so that they can also be
     45 #included multiple times for different code unit widths by pcre2test in order
     46 to have access to the hidden structures at all supported widths.
     47 
Some of the mode-dependent macros are required at different widths for
different parts of the pcre2test code (in particular, the included
pcre2_printint.c file). We undefine them here so that they can be re-defined
for multiple inclusions. Not all of these are used in pcre2test, but it's
easier just to undefine them all. */
     53 
     54 #undef ACROSSCHAR
     55 #undef BACKCHAR
     56 #undef BYTES2CU
     57 #undef CU2BYTES
     58 #undef FORWARDCHAR
     59 #undef FORWARDCHARTEST
     60 #undef GET
     61 #undef GET2
     62 #undef GETCHAR
     63 #undef GETCHARINC
     64 #undef GETCHARINCTEST
     65 #undef GETCHARLEN
     66 #undef GETCHARLENTEST
     67 #undef GETCHARTEST
     68 #undef GET_EXTRALEN
     69 #undef HAS_EXTRALEN
     70 #undef IMM2_SIZE
     71 #undef MAX_255
     72 #undef MAX_MARK
     73 #undef MAX_PATTERN_SIZE
     74 #undef MAX_UTF_SINGLE_CU
     75 #undef NOT_FIRSTCU
     76 #undef PUT
     77 #undef PUT2
     78 #undef PUT2INC
     79 #undef PUTCHAR
     80 #undef PUTINC
     81 #undef TABLE_GET
     82 
     83 
     84 
     85 /* -------------------------- MACROS ----------------------------- */
     86 
     87 /* PCRE keeps offsets in its compiled code as at least 16-bit quantities
     88 (always stored in big-endian order in 8-bit mode) by default. These are used,
     89 for example, to link from the start of a subpattern to its alternatives and its
     90 end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
     91 to around 64K, which is big enough for almost everybody. However, I received a
     92 request for an even bigger limit. For this reason, and also to make the code
     93 easier to maintain, the storing and loading of offsets from the compiled code
     94 unit string is now handled by the macros that are defined here.
     95 
     96 The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
     97 values of 3 or 4 are also supported. */
     98 
     99 /* ------------------- 8-bit support  ------------------ */
    100 
    101 #if PCRE2_CODE_UNIT_WIDTH == 8
    102 
    103 #if LINK_SIZE == 2
    104 #define PUT(a,n,d)   \
    105   (a[n] = (PCRE2_UCHAR)((d) >> 8)), \
    106   (a[(n)+1] = (PCRE2_UCHAR)((d) & 255))
    107 #define GET(a,n) \
    108   (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
    109 #define MAX_PATTERN_SIZE (1 << 16)
    110 
    111 #elif LINK_SIZE == 3
    112 #define PUT(a,n,d)       \
    113   (a[n] = (PCRE2_UCHAR)((d) >> 16)),    \
    114   (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
    115   (a[(n)+2] = (PCRE2_UCHAR)((d) & 255))
    116 #define GET(a,n) \
    117   (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
    118 #define MAX_PATTERN_SIZE (1 << 24)
    119 
    120 #elif LINK_SIZE == 4
    121 #define PUT(a,n,d)        \
    122   (a[n] = (PCRE2_UCHAR)((d) >> 24)),     \
    123   (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
    124   (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)),  \
    125   (a[(n)+3] = (PCRE2_UCHAR)((d) & 255))
    126 #define GET(a,n) \
    127   (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
    128 #define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
    129 
    130 #else
    131 #error LINK_SIZE must be 2, 3, or 4
    132 #endif
    133 
    134 
    135 /* ------------------- 16-bit support  ------------------ */
    136 
    137 #elif PCRE2_CODE_UNIT_WIDTH == 16
    138 
    139 #if LINK_SIZE == 2
    140 #undef LINK_SIZE
    141 #define LINK_SIZE 1
    142 #define PUT(a,n,d)   \
    143   (a[n] = (d))
    144 #define GET(a,n) \
    145   (a[n])
    146 #define MAX_PATTERN_SIZE (1 << 16)
    147 
    148 #elif LINK_SIZE == 3 || LINK_SIZE == 4
    149 #undef LINK_SIZE
    150 #define LINK_SIZE 2
    151 #define PUT(a,n,d)   \
    152   (a[n] = (PCRE2_UCHAR)((d) >> 16)), \
    153   (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535))
    154 #define GET(a,n) \
    155   (unsigned int)(((a)[n] << 16) | (a)[(n)+1])
    156 #define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
    157 
    158 #else
    159 #error LINK_SIZE must be 2, 3, or 4
    160 #endif
    161 
    162 
    163 /* ------------------- 32-bit support  ------------------ */
    164 
    165 #elif PCRE2_CODE_UNIT_WIDTH == 32
    166 #undef LINK_SIZE
    167 #define LINK_SIZE 1
    168 #define PUT(a,n,d)   \
    169   (a[n] = (d))
    170 #define GET(a,n) \
    171   (a[n])
    172 #define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
    173 
    174 #else
    175 #error Unsupported compiling mode
    176 #endif
    177 
    178 
    179 /* --------------- Other mode-specific macros ----------------- */
    180 
/* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
things such as capturing parenthesis numbers in back references.
    184 
    185 Define the number of code units required to hold a 16-bit count/offset, and
    186 macros to load and store such a value. For reasons that I do not understand,
    187 the expression in the 8-bit GET2 macro is treated by gcc as a signed
    188 expression, even when a is declared as unsigned. It seems that any kind of
    189 arithmetic results in a signed value. Hence the cast. */
    190 
    191 #if PCRE2_CODE_UNIT_WIDTH == 8
    192 #define IMM2_SIZE 2
    193 #define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
    194 #define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255
    195 
    196 #else  /* Code units are 16 or 32 bits */
    197 #define IMM2_SIZE 1
    198 #define GET2(a,n) a[n]
    199 #define PUT2(a,n,d) a[n] = d
    200 #endif
    201 
    202 /* Other macros that are different for 8-bit mode. The MAX_255 macro checks
    203 whether its argument is less than 256. The maximum length of a MARK name must
    204 fit in one code unit; currently it is set to 255 or 65535. The TABLE_GET macro
    205 is used to access elements of tables containing exactly 256 items. When code
    206 points can be greater than 255, a check is needed before accessing these
    207 tables. */
    208 
    209 #if PCRE2_CODE_UNIT_WIDTH == 8
    210 #define MAX_255(c) TRUE
    211 #define MAX_MARK ((1u << 8) - 1)
    212 #ifdef SUPPORT_UNICODE
    213 #define SUPPORT_WIDE_CHARS
    214 #endif  /* SUPPORT_UNICODE */
    215 #define TABLE_GET(c, table, default) ((table)[c])
    216 
    217 #else  /* Code units are 16 or 32 bits */
    218 #define MAX_255(c) ((c) <= 255u)
    219 #define MAX_MARK ((1u << 16) - 1)
    220 #define SUPPORT_WIDE_CHARS
    221 #define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
    222 #endif
    223 
    224 
    225 
    226 /* ----------------- Character-handling macros ----------------- */
    227 
    228 /* There is a proposed future special "UTF-21" mode, in which only the lowest
    229 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
    230 high-order bits available to the application for other uses. In preparation for
    231 the future implementation of this mode, there are macros that load a data item
    232 and, if in this special mode, mask it to 21 bits. These macros all have names
    233 starting with UCHAR21. In all other modes, including the normal 32-bit
    234 library, the macros all have the same simple definitions. When the new mode is
    235 implemented, it is expected that these definitions will be varied appropriately
    236 using #ifdef when compiling the library that supports the special mode. */
    237 
    238 #define UCHAR21(eptr)        (*(eptr))
    239 #define UCHAR21TEST(eptr)    (*(eptr))
    240 #define UCHAR21INC(eptr)     (*(eptr)++)
    241 #define UCHAR21INCTEST(eptr) (*(eptr)++)
    242 
    243 /* When UTF encoding is being used, a character is no longer just a single
    244 byte in 8-bit mode or a single short in 16-bit mode. The macros for character
    245 handling generate simple sequences when used in the basic mode, and more
    246 complicated ones for UTF characters. GETCHARLENTEST and other macros are not
    247 used when UTF is not supported. To make sure they can never even appear when
    248 UTF support is omitted, we don't even define them. */
    249 
    250 #ifndef SUPPORT_UNICODE
    251 
    252 /* #define MAX_UTF_SINGLE_CU */
    253 /* #define HAS_EXTRALEN(c) */
    254 /* #define GET_EXTRALEN(c) */
    255 /* #define NOT_FIRSTCU(c) */
    256 #define GETCHAR(c, eptr) c = *eptr;
    257 #define GETCHARTEST(c, eptr) c = *eptr;
    258 #define GETCHARINC(c, eptr) c = *eptr++;
    259 #define GETCHARINCTEST(c, eptr) c = *eptr++;
    260 #define GETCHARLEN(c, eptr, len) c = *eptr;
    261 #define PUTCHAR(c, p) (*p = c, 1)
    262 /* #define GETCHARLENTEST(c, eptr, len) */
    263 /* #define BACKCHAR(eptr) */
    264 /* #define FORWARDCHAR(eptr) */
/* #define FORWARDCHARTEST(eptr,end) */
    266 /* #define ACROSSCHAR(condition, eptr, action) */
    267 
    268 #else   /* SUPPORT_UNICODE */
    269 
    270 /* ------------------- 8-bit support  ------------------ */
    271 
    272 #if PCRE2_CODE_UNIT_WIDTH == 8
    273 #define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */
    274 
    275 /* The largest UTF code point that can be encoded as a single code unit. */
    276 
    277 #define MAX_UTF_SINGLE_CU 127
    278 
    279 /* Tests whether the code point needs extra characters to decode. */
    280 
    281 #define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)
    282 
    283 /* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
    284 Otherwise it has an undefined behaviour. */
    285 
    286 #define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])
    287 
    288 /* Returns TRUE, if the given value is not the first code unit of a UTF
    289 sequence. */
    290 
    291 #define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)
    292 
    293 /* Get the next UTF-8 character, not advancing the pointer. This is called when
    294 we know we are in UTF-8 mode. */
    295 
    296 #define GETCHAR(c, eptr) \
    297   c = *eptr; \
    298   if (c >= 0xc0u) GETUTF8(c, eptr);
    299 
    300 /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
    301 pointer. */
    302 
    303 #define GETCHARTEST(c, eptr) \
    304   c = *eptr; \
    305   if (utf && c >= 0xc0u) GETUTF8(c, eptr);
    306 
    307 /* Get the next UTF-8 character, advancing the pointer. This is called when we
    308 know we are in UTF-8 mode. */
    309 
    310 #define GETCHARINC(c, eptr) \
    311   c = *eptr++; \
    312   if (c >= 0xc0u) GETUTF8INC(c, eptr);
    313 
    314 /* Get the next character, testing for UTF-8 mode, and advancing the pointer.
    315 This is called when we don't know if we are in UTF-8 mode. */
    316 
    317 #define GETCHARINCTEST(c, eptr) \
    318   c = *eptr++; \
    319   if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);
    320 
    321 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
    322 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
    323 
    324 #define GETCHARLEN(c, eptr, len) \
    325   c = *eptr; \
    326   if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);
    327 
    328 /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
    329 pointer, incrementing length if there are extra bytes. This is called when we
    330 do not know if we are in UTF-8 mode. */
    331 
    332 #define GETCHARLENTEST(c, eptr, len) \
    333   c = *eptr; \
    334   if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);
    335 
    336 /* If the pointer is not at the start of a character, move it back until
    337 it is. This is called only in UTF-8 mode - we don't put a test within the macro
    338 because almost all calls are already within a block of UTF-8 only code. */
    339 
    340 #define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr--
    341 
    342 /* Same as above, just in the other direction. */
    343 #define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++
    344 #define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++
    345 
    346 /* Same as above, but it allows a fully customizable form. */
    347 #define ACROSSCHAR(condition, eptr, action) \
    348   while((condition) && ((eptr) & 0xc0u) == 0x80u) action
    349 
    350 /* Deposit a character into memory, returning the number of code units. */
    351 
    352 #define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
    353   PRIV(ord2utf)(c,p) : (*p = c, 1))
    354 
    355 
    356 /* ------------------- 16-bit support  ------------------ */
    357 
    358 #elif PCRE2_CODE_UNIT_WIDTH == 16
    359 #define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */
    360 
    361 /* The largest UTF code point that can be encoded as a single code unit. */
    362 
    363 #define MAX_UTF_SINGLE_CU 65535
    364 
    365 /* Tests whether the code point needs extra characters to decode. */
    366 
    367 #define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)
    368 
    369 /* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
    370 Otherwise it has an undefined behaviour. */
    371 
    372 #define GET_EXTRALEN(c) 1
    373 
    374 /* Returns TRUE, if the given value is not the first code unit of a UTF
    375 sequence. */
    376 
    377 #define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)
    378 
    379 /* Base macro to pick up the low surrogate of a UTF-16 character, not
    380 advancing the pointer. */
    381 
    382 #define GETUTF16(c, eptr) \
    383    { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; }
    384 
    385 /* Get the next UTF-16 character, not advancing the pointer. This is called when
    386 we know we are in UTF-16 mode. */
    387 
    388 #define GETCHAR(c, eptr) \
    389   c = *eptr; \
    390   if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
    391 
    392 /* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
    393 pointer. */
    394 
    395 #define GETCHARTEST(c, eptr) \
    396   c = *eptr; \
    397   if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
    398 
    399 /* Base macro to pick up the low surrogate of a UTF-16 character, advancing
    400 the pointer. */
    401 
    402 #define GETUTF16INC(c, eptr) \
    403    { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; }
    404 
    405 /* Get the next UTF-16 character, advancing the pointer. This is called when we
    406 know we are in UTF-16 mode. */
    407 
    408 #define GETCHARINC(c, eptr) \
    409   c = *eptr++; \
    410   if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
    411 
    412 /* Get the next character, testing for UTF-16 mode, and advancing the pointer.
    413 This is called when we don't know if we are in UTF-16 mode. */
    414 
    415 #define GETCHARINCTEST(c, eptr) \
    416   c = *eptr++; \
    417   if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
    418 
    419 /* Base macro to pick up the low surrogate of a UTF-16 character, not
    420 advancing the pointer, incrementing the length. */
    421 
    422 #define GETUTF16LEN(c, eptr, len) \
    423    { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; }
    424 
    425 /* Get the next UTF-16 character, not advancing the pointer, incrementing
    426 length if there is a low surrogate. This is called when we know we are in
    427 UTF-16 mode. */
    428 
    429 #define GETCHARLEN(c, eptr, len) \
    430   c = *eptr; \
    431   if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
    432 
    433 /* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
    434 pointer, incrementing length if there is a low surrogate. This is called when
    435 we do not know if we are in UTF-16 mode. */
    436 
    437 #define GETCHARLENTEST(c, eptr, len) \
    438   c = *eptr; \
    439   if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
    440 
    441 /* If the pointer is not at the start of a character, move it back until
    442 it is. This is called only in UTF-16 mode - we don't put a test within the
    443 macro because almost all calls are already within a block of UTF-16 only
    444 code. */
    445 
    446 #define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr--
    447 
    448 /* Same as above, just in the other direction. */
    449 #define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++
    450 #define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++
    451 
    452 /* Same as above, but it allows a fully customizable form. */
    453 #define ACROSSCHAR(condition, eptr, action) \
    454   if ((condition) && ((eptr) & 0xfc00u) == 0xdc00u) action
    455 
    456 /* Deposit a character into memory, returning the number of code units. */
    457 
    458 #define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
    459   PRIV(ord2utf)(c,p) : (*p = c, 1))
    460 
    461 
    462 /* ------------------- 32-bit support  ------------------ */
    463 
    464 #else
    465 
    466 /* These are trivial for the 32-bit library, since all UTF-32 characters fit
    467 into one PCRE2_UCHAR unit. */
    468 
    469 #define MAX_UTF_SINGLE_CU (0x10ffffu)
    470 #define HAS_EXTRALEN(c) (0)
    471 #define GET_EXTRALEN(c) (0)
    472 #define NOT_FIRSTCU(c) (0)
    473 
    474 /* Get the next UTF-32 character, not advancing the pointer. This is called when
    475 we know we are in UTF-32 mode. */
    476 
    477 #define GETCHAR(c, eptr) \
    478   c = *(eptr);
    479 
    480 /* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
    481 pointer. */
    482 
    483 #define GETCHARTEST(c, eptr) \
    484   c = *(eptr);
    485 
    486 /* Get the next UTF-32 character, advancing the pointer. This is called when we
    487 know we are in UTF-32 mode. */
    488 
    489 #define GETCHARINC(c, eptr) \
    490   c = *((eptr)++);
    491 
    492 /* Get the next character, testing for UTF-32 mode, and advancing the pointer.
    493 This is called when we don't know if we are in UTF-32 mode. */
    494 
    495 #define GETCHARINCTEST(c, eptr) \
    496   c = *((eptr)++);
    497 
    498 /* Get the next UTF-32 character, not advancing the pointer, not incrementing
    499 length (since all UTF-32 is of length 1). This is called when we know we are in
    500 UTF-32 mode. */
    501 
    502 #define GETCHARLEN(c, eptr, len) \
    503   GETCHAR(c, eptr)
    504 
    505 /* Get the next UTF-32character, testing for UTF-32 mode, not advancing the
    506 pointer, not incrementing the length (since all UTF-32 is of length 1).
    507 This is called when we do not know if we are in UTF-32 mode. */
    508 
    509 #define GETCHARLENTEST(c, eptr, len) \
    510   GETCHARTEST(c, eptr)
    511 
    512 /* If the pointer is not at the start of a character, move it back until
    513 it is. This is called only in UTF-32 mode - we don't put a test within the
    514 macro because almost all calls are already within a block of UTF-32 only
    515 code.
    516 
    517 These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */
    518 
    519 #define BACKCHAR(eptr) do { } while (0)
    520 
    521 /* Same as above, just in the other direction. */
    522 
    523 #define FORWARDCHAR(eptr) do { } while (0)
    524 #define FORWARDCHARTEST(eptr,end) do { } while (0)
    525 
    526 /* Same as above, but it allows a fully customizable form. */
    527 
    528 #define ACROSSCHAR(condition, eptr, action) do { } while (0)
    529 
    530 /* Deposit a character into memory, returning the number of code units. */
    531 
    532 #define PUTCHAR(c, p) (*p = c, 1)
    533 
    534 #endif  /* UTF-32 character handling */
    535 #endif  /* SUPPORT_UNICODE */
    536 
    537 
    538 /* Mode-dependent macros that have the same definition in all modes. */
    539 
    540 #define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
    541 #define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
    542 #define PUTINC(a,n,d)   PUT(a,n,d), a += LINK_SIZE
    543 #define PUT2INC(a,n,d)  PUT2(a,n,d), a += IMM2_SIZE
    544 
    545 
    546 /* ----------------------- HIDDEN STRUCTURES ----------------------------- */
    547 
    548 /* NOTE: All these structures *must* start with a pcre2_memctl structure. The
    549 code that uses them is simpler because it assumes this. */
    550 
    551 /* The real general context structure. At present it holds only data for custom
    552 memory control. */
    553 
    554 typedef struct pcre2_real_general_context {
    555   pcre2_memctl memctl;
    556 } pcre2_real_general_context;
    557 
    558 /* The real compile context structure */
    559 
    560 typedef struct pcre2_real_compile_context {
    561   pcre2_memctl memctl;
    562   int (*stack_guard)(uint32_t, void *);
    563   void *stack_guard_data;
    564   const uint8_t *tables;
    565   PCRE2_SIZE max_pattern_length;
    566   uint16_t bsr_convention;
    567   uint16_t newline_convention;
    568   uint32_t parens_nest_limit;
    569 } pcre2_real_compile_context;
    570 
    571 /* The real match context structure. */
    572 
    573 typedef struct pcre2_real_match_context {
    574   pcre2_memctl memctl;
    575 #ifdef HEAP_MATCH_RECURSE
    576   pcre2_memctl stack_memctl;
    577 #endif
    578 #ifdef SUPPORT_JIT
    579   pcre2_jit_callback jit_callback;
    580   void *jit_callback_data;
    581 #endif
    582   int    (*callout)(pcre2_callout_block *, void *);
    583   void    *callout_data;
    584   PCRE2_SIZE offset_limit;
    585   uint32_t match_limit;
    586   uint32_t recursion_limit;
    587 } pcre2_real_match_context;
    588 
    589 /* The real compiled code structure. The type for the blocksize field is
    590 defined specially because it is required in pcre2_serialize_decode() when
    591 copying the size from possibly unaligned memory into a variable of the same
    592 type. Use a macro rather than a typedef to avoid compiler warnings when this
    593 file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
    594 largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
    595 argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
    596 here.) */
    597 
    598 #undef  CODE_BLOCKSIZE_TYPE
    599 #define CODE_BLOCKSIZE_TYPE size_t
    600 
    601 #undef  LOOKBEHIND_MAX
    602 #define LOOKBEHIND_MAX UINT16_MAX
    603 
    604 typedef struct pcre2_real_code {
    605   pcre2_memctl memctl;            /* Memory control fields */
    606   const uint8_t *tables;          /* The character tables */
    607   void    *executable_jit;        /* Pointer to JIT code */
    608   uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
    609   CODE_BLOCKSIZE_TYPE blocksize;  /* Total (bytes) that was malloc-ed */
    610   uint32_t magic_number;          /* Paranoid and endianness check */
    611   uint32_t compile_options;       /* Options passed to pcre2_compile() */
    612   uint32_t overall_options;       /* Options after processing the pattern */
    613   uint32_t flags;                 /* Various state flags */
    614   uint32_t limit_match;           /* Limit set in the pattern */
    615   uint32_t limit_recursion;       /* Limit set in the pattern */
    616   uint32_t first_codeunit;        /* Starting code unit */
    617   uint32_t last_codeunit;         /* This codeunit must be seen */
    618   uint16_t bsr_convention;        /* What \R matches */
    619   uint16_t newline_convention;    /* What is a newline? */
    620   uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
    621   uint16_t minlength;             /* Minimum length of match */
    622   uint16_t top_bracket;           /* Highest numbered group */
    623   uint16_t top_backref;           /* Highest numbered back reference */
    624   uint16_t name_entry_size;       /* Size (code units) of table entries */
    625   uint16_t name_count;            /* Number of name entries in the table */
    626 } pcre2_real_code;
    627 
    628 /* The real match data structure. */
    629 
    630 typedef struct pcre2_real_match_data {
    631   pcre2_memctl     memctl;
    632   const pcre2_real_code *code;    /* The pattern used for the match */
    633   PCRE2_SPTR       subject;       /* The subject that was matched */
    634   PCRE2_SPTR       mark;          /* Pointer to last mark */
    635   PCRE2_SIZE       leftchar;      /* Offset to leftmost code unit */
    636   PCRE2_SIZE       rightchar;     /* Offset to rightmost code unit */
    637   PCRE2_SIZE       startchar;     /* Offset to starting code unit */
    638   uint16_t         matchedby;     /* Type of match (normal, JIT, DFA) */
    639   uint16_t         oveccount;     /* Number of pairs */
    640   int              rc;            /* The return code from the match */
    641   PCRE2_SIZE       ovector[1];    /* The first field */
    642 } pcre2_real_match_data;
    643 
    644 
    645 /* ----------------------- PRIVATE STRUCTURES ----------------------------- */
    646 
    647 /* These structures are not needed for pcre2test. */
    648 
    649 #ifndef PCRE2_PCRE2TEST
    650 
    651 /* Structure for checking for mutual recursion when scanning compiled code. */
    652 
    653 typedef struct recurse_check {
    654   struct recurse_check *prev;
    655   PCRE2_SPTR group;
    656 } recurse_check;
    657 
    658 /* Structure for building a cache when filling in recursion offsets. */
    659 
    660 typedef struct recurse_cache {
    661   PCRE2_SPTR group;
    662   int recno;
    663 } recurse_cache;
    664 
    665 /* Structure for maintaining a chain of pointers to the currently incomplete
    666 branches, for testing for left recursion while compiling. */
    667 
    668 typedef struct branch_chain {
    669   struct branch_chain *outer;
    670   PCRE2_UCHAR *current_branch;
    671 } branch_chain;
    672 
    673 /* Structure for building a list of named groups during the first pass of
    674 compiling. */
    675 
    676 typedef struct named_group {
    677   PCRE2_SPTR   name;          /* Points to the name in the pattern */
    678   uint32_t     number;        /* Group number */
    679   uint16_t     length;        /* Length of the name */
    680   uint16_t     isdup;         /* TRUE if a duplicate */
    681 } named_group;
    682 
    683 /* Structure for passing "static" information around between the functions
    684 doing the compiling, so that they are thread-safe. */
    685 
    686 typedef struct compile_block {
    687   pcre2_real_compile_context *cx;  /* Points to the compile context */
    688   const uint8_t *lcc;              /* Points to lower casing table */
    689   const uint8_t *fcc;              /* Points to case-flipping table */
    690   const uint8_t *cbits;            /* Points to character type table */
    691   const uint8_t *ctypes;           /* Points to table of type maps */
    692   PCRE2_SPTR start_workspace;      /* The start of working space */
    693   PCRE2_SPTR start_code;           /* The start of the compiled code */
    694   PCRE2_SPTR start_pattern;        /* The start of the pattern */
    695   PCRE2_SPTR end_pattern;          /* The end of the pattern */
    696   PCRE2_SPTR nestptr[2];           /* Pointer(s) saved for string substitution */
    697   PCRE2_UCHAR *name_table;         /* The name/number table */
    698   size_t workspace_size;           /* Size of workspace */
    699   uint16_t names_found;            /* Number of entries so far */
    700   uint16_t name_entry_size;        /* Size of each entry */
    701   open_capitem *open_caps;         /* Chain of open capture items */
    702   named_group *named_groups;       /* Points to vector in pre-compile */
    703   uint32_t named_group_list_size;  /* Number of entries in the list */
    704   uint32_t external_options;       /* External (initial) options */
    705   uint32_t external_flags;         /* External flag bits to be set */
    706   uint32_t bracount;               /* Count of capturing parens as we compile */
    707   uint32_t final_bracount;         /* Saved value after first pass */
    708   uint32_t *groupinfo;             /* Group info vector */
    709   uint32_t top_backref;            /* Maximum back reference */
    710   uint32_t backref_map;            /* Bitmap of low back refs */
    711   uint32_t nltype;                 /* Newline type */
    712   uint32_t nllen;                  /* Newline string length */
    713   PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
    714   int  max_lookbehind;             /* Maximum lookbehind (characters) */
    715   int  parens_depth;               /* Depth of nested parentheses */
    716   int  assert_depth;               /* Depth of nested assertions */
    717   int  req_varyopt;                /* "After variable item" flag for reqbyte */
    718   BOOL had_accept;                 /* (*ACCEPT) encountered */
    719   BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
    720   BOOL had_recurse;                /* Had a recursion or subroutine call */
    721   BOOL check_lookbehind;           /* Lookbehinds need later checking */
    722   BOOL dupnames;                   /* Duplicate names exist */
    723   BOOL iscondassert;               /* Next assert is a condition */
    724 } compile_block;
    725 
    726 /* Structure for keeping the properties of the in-memory stack used
    727 by the JIT matcher. */
    728 
    729 typedef struct pcre2_real_jit_stack {
    730   pcre2_memctl memctl;             /* Memory control data; NOTE(review): presumably the allocator used for this object - confirm */
    731   void* stack;                     /* Untyped pointer to the stack itself; its layout is not visible here */
    732 } pcre2_real_jit_stack;
    733 
    734 /* Structure for keeping a chain of heap blocks used for saving ovectors
    735 during pattern recursion when the ovector is larger than can be saved on
    736 the system stack. */
    737 
    738 typedef struct ovecsave_frame {
    739   struct ovecsave_frame *next;     /* Next frame on free chain */
    740   PCRE2_SIZE saved_ovec[1];        /* First vector element; NOTE(review): presumably more elements follow in the same heap block - confirm at the allocation site */
    741 } ovecsave_frame;
    742 
    743 /* Structure for items in a linked list that represents an explicit recursive
    744 call within the pattern; used by pcre2_match(). */
    745 
    746 typedef struct recursion_info {    /* Records are chained backwards through prevrec */
    747   struct recursion_info *prevrec;  /* Previous recursion record (or NULL) */
    748   unsigned int group_num;          /* Number of group that was called */
    749   PCRE2_SIZE *ovec_save;           /* Pointer to saved ovector frame */
    750   uint32_t saved_capture_last;     /* Last capture number */
    751   PCRE2_SPTR subject_position;     /* Position at start of recursion */
    752 } recursion_info;
    753 
    754 /* A similar structure for pcre2_dfa_match(). */
    755 
    756 typedef struct dfa_recursion_info {
    757   struct dfa_recursion_info *prevrec;  /* Previous record in the chain (cf. recursion_info.prevrec) */
    758   PCRE2_SPTR subject_position;         /* Subject position (cf. recursion_info.subject_position) */
    759   uint32_t group_num;                  /* Group number (cf. recursion_info.group_num) */
    760 } dfa_recursion_info;
    761 
    762 /* Structure for building a chain of data for holding the values of the subject
    763 pointer at the start of each subpattern, so as to detect when an empty string
    764 has been matched by a subpattern - to break infinite loops; used by
    765 pcre2_match(). */
    766 
    767 typedef struct eptrblock {
    768   struct eptrblock *epb_prev;      /* Previous block in the chain */
    769   PCRE2_SPTR epb_saved_eptr;       /* Subject pointer value saved at the start of a subpattern */
    770 } eptrblock;
    771 
    772 /* Structure for passing "static" information around between the functions
    773 doing traditional NFA matching (pcre2_match() and friends). */
    774 
    775 typedef struct match_block {
    776   pcre2_memctl memctl;            /* For general use */
    777 #ifdef HEAP_MATCH_RECURSE
    778   pcre2_memctl stack_memctl;      /* For "stack" frames */
    779 #endif  /* HEAP_MATCH_RECURSE */
    780   uint32_t match_call_count;      /* Running count of match() calls (presumably checked against match_limit) */
    781   uint32_t match_limit;           /* Limit paired with match_call_count (presumably its ceiling) */
    782   uint32_t match_limit_recursion; /* Limit on recursion depth (presumably of match() - confirm) */
    783   BOOL hitend;                    /* Hit the end of the subject at some point */
    784   BOOL hasthen;                   /* Pattern contains (*THEN) */
    785   const uint8_t *lcc;             /* Points to lower casing table */
    786   const uint8_t *fcc;             /* Points to case-flipping table */
    787   const uint8_t *ctypes;          /* Points to table of type maps */
    788   PCRE2_SIZE *ovector;            /* Pointer to the offset vector */
    789   PCRE2_SIZE offset_end;          /* One past the end */
    790   PCRE2_SIZE offset_max;          /* The maximum usable for return data */
    791   PCRE2_SIZE start_offset;        /* The start offset value */
    792   PCRE2_SIZE end_offset_top;      /* Highwater mark at end of match */
    793   uint16_t partial;               /* PARTIAL options */
    794   uint16_t bsr_convention;        /* \R interpretation */
    795   uint16_t name_count;            /* Number of names in name table */
    796   uint16_t name_entry_size;       /* Size of entry in names table */
    797   PCRE2_SPTR name_table;          /* Table of group names */
    798   PCRE2_SPTR start_code;          /* For use when recursing */
    799   PCRE2_SPTR start_subject;       /* Start of the subject string */
    800   PCRE2_SPTR end_subject;         /* End of the subject string */
    801   PCRE2_SPTR start_match_ptr;     /* Start of matched string */
    802   PCRE2_SPTR end_match_ptr;       /* Subject position at end match */
    803   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
    804   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
    805   PCRE2_SPTR mark;                /* Mark pointer to pass back on success */
    806   PCRE2_SPTR nomatch_mark;        /* Mark pointer to pass back on failure */
    807   PCRE2_SPTR once_target;         /* Where to back up to for atomic groups */
    808   uint32_t moptions;              /* Match options */
    809   uint32_t poptions;              /* Pattern options */
    810   uint32_t capture_last;          /* Most recent capture number + overflow flag */
    811   uint32_t skip_arg_count;        /* For counting SKIP_ARGs */
    812   uint32_t ignore_skip_arg;       /* For re-run when SKIP arg name not found */
    813   uint32_t match_function_type;   /* Set for certain special calls of match() */
    814   uint32_t nltype;                /* Newline type */
    815   uint32_t nllen;                 /* Newline string length */
    816   PCRE2_UCHAR nl[4];              /* Newline string when fixed length */
    817   eptrblock *eptrchain;           /* Chain of eptrblocks for tail recursions */
    818   recursion_info *recursive;      /* Linked list of recursion data */
    819   ovecsave_frame *ovecsave_chain; /* Linked list of free ovecsave blocks */
    820   void  *callout_data;            /* To pass back to callouts */
    821   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
    822 #ifdef HEAP_MATCH_RECURSE
    823   void  *match_frames_base;       /* For remembering malloc'd frames */
    824 #endif  /* HEAP_MATCH_RECURSE */
    825 } match_block;
    826 
    827 /* A similar structure is used for the same purpose by the DFA matching
    828 functions. */
    829 
    830 typedef struct dfa_match_block {
    831   pcre2_memctl memctl;            /* For general use */
    832   PCRE2_SPTR start_code;          /* Start of the compiled pattern */
    833   PCRE2_SPTR start_subject ;      /* Start of the subject string */
    834   PCRE2_SPTR end_subject;         /* End of subject string */
    835   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
    836   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
    837   const uint8_t *tables;          /* Character tables */
    838   PCRE2_SIZE start_offset;        /* The start offset value */
    839   uint32_t moptions;              /* Match options */
    840   uint32_t poptions;              /* Pattern options */
    841   uint32_t nltype;                /* Newline type */
    842   uint32_t nllen;                 /* Newline string length */
    843   PCRE2_UCHAR nl[4];              /* Newline string when fixed length */
    844   uint16_t bsr_convention;        /* \R interpretation */
    845   void *callout_data;             /* To pass back to callouts */
    846   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
    847   dfa_recursion_info *recursive;  /* Linked list of recursion data (dfa_recursion_info records) */
    848 } dfa_match_block;
    849 
    850 #endif  /* PCRE2_PCRE2TEST */
    851 
    852 /* End of pcre2_intmodedep.h */
    853