Lines Matching refs:utf
299 /* We need to be able to check input text for UTF-8 validity, whatever code
301 8-bit code units. So we include the UTF validity checking function for 8-bit
623 { "utf", MOD_PATP, MOD_OPT, PCRE2_UTF, PO(options) },
926 #define PCHARS(lv, p, offset, len, utf, f) \
928 lv = pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f); \
930 lv = pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f); \
932 lv = pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
934 #define PCHARSV(p, offset, len, utf, f) \
936 (void)pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f); \
938 (void)pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f); \
940 (void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
1400 #define PCHARS(lv, p, offset, len, utf, f) \
1402 lv = G(pchars,BITONE)((G(PCRE2_SPTR,BITONE))(p)+offset, len, utf, f); \
1404 lv = G(pchars,BITTWO)((G(PCRE2_SPTR,BITTWO))(p)+offset, len, utf, f)
1406 #define PCHARSV(p, offset, len, utf, f) \
1408 (void)G(pchars,BITONE)((G(PCRE2_SPTR,BITONE))(p)+offset, len, utf, f); \
1410 (void)G(pchars,BITTWO)((G(PCRE2_SPTR,BITTWO))(p)+offset, len, utf, f)
1760 #define PCHARS(lv, p, offset, len, utf, f) \
1761 lv = pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
1762 #define PCHARSV(p, offset, len, utf, f) \
1763 (void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
1855 #define PCHARS(lv, p, offset, len, utf, f) \
1856 lv = pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
1857 #define PCHARSV(p, offset, len, utf, f) \
1858 (void)pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
1950 #define PCHARS(lv, p, offset, len, utf, f) \
1951 lv = pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
1952 #define PCHARSV(p, offset, len, utf, f) \
1953 (void)pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
2447 * Convert UTF-8 character to code point *
2450 /* This function reads one or more bytes that represent a UTF-8 character,
2452 the original UTF-8 definition of RFC 2279, allowing for values in the range 0
2455 checking, and also for generating 32-bit non-UTF data values above the UTF
2463 -6 to 0 => malformed UTF-8 character at offset = (-return)
2480 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
2518 utf TRUE in UTF mode
2525 pchar(uint32_t c, BOOL utf, FILE *f)
2536 if (utf)
2588 /* Must handle UTF-8 strings in utf8 mode. Yields number of characters printed.
2592 static int pchars8(PCRE2_SPTR8 p, int length, BOOL utf, FILE *f)
2600 if (utf)
2607 yield += pchar(c, utf, f);
2612 yield += pchar(c, utf, f);
2625 /* Must handle UTF-16 strings in utf mode. Yields number of characters printed.
2629 static int pchars16(PCRE2_SPTR16 p, int length, BOOL utf, FILE *f)
2636 if (utf && c >= 0xD800 && c < 0xDC00 && length > 0)
2646 yield += pchar(c, utf, f);
2659 /* Must handle UTF-32 strings in utf mode. Yields number of characters printed.
2663 static int pchars32(PCRE2_SPTR32 p, int length, BOOL utf, FILE *f)
2666 (void)(utf); /* Avoid compiler warning */
2671 yield += pchar(c, utf, f);
2682 * Convert character value to UTF-8 *
2686 and encodes it as a UTF-8 character in 0 to 6 bytes.
2721 /* In UTF mode the input is always interpreted as a string of UTF-8 bytes. If
2724 more than double, because up to 0xffff uses no more than 3 bytes in UTF-8 but
2725 possibly 4 in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in
2726 UTF-16. The result is always left in pbuffer16. Impose a minimum size to save
2730 deliberate; it makes it possible to construct UTF-16 strings that are invalid,
2735 utf non-zero if converting to UTF-16
2740 OR -1 if a UTF-8 string is malformed
2741 OR -2 if a value > 0x10ffff is encountered in UTF mode
2742 OR -3 if a value > 0xffff is encountered when not in UTF mode
2746 to16(uint8_t *p, int utf, PCRE2_SIZE *lenptr)
2766 if (!utf)
2780 if (!utf) return -3;
2800 /* In UTF mode the input is always interpreted as a string of UTF-8 bytes. If
2808 deliberate; it makes it possible to construct UTF-32 strings that are invalid,
2813 utf true if UTF-8 (to be converted to UTF-32)
2818 OR -1 if a UTF-8 string is malformed
2819 OR -2 if a value > 0x10ffff is encountered in UTF mode
2823 to32(uint8_t *p, int utf, PCRE2_SIZE *lenptr)
2843 if (!utf)
2852 if (utf && c > 0x10ffff) return -2;
2877 utf TRUE if in UTF mode
2883 backchars(uint8_t *subject, PCRE2_SIZE offset, uint32_t count, BOOL utf)
2885 if (!utf || test_mode == PCRE32_MODE)
3715 ((options & PCRE2_UTF) != 0)? " utf" : "",
3805 BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
3815 cb->callout_string_length, utf, outfile);
3970 /* Remove UTF/UCP if they were there only because of forbid_utf. This saves
3971 cluttering up the verification output of non-UTF test files. */
4459 BOOL utf;
4508 utf = (pat_patctl.options & PCRE2_UTF) != 0;
4777 if (utf) cflags |= REG_UTF;
4877 if (test_mode == PCRE16_MODE) errorcode = to16(pbuffer8, utf, &patlen);
4881 if (test_mode == PCRE32_MODE) errorcode = to32(pbuffer8, utf, &patlen);
4887 fprintf(outfile, "** Failed: invalid UTF-8 string cannot be "
4893 "cannot be converted to UTF\n");
4898 "cannot be converted to 16-bit in non-UTF mode\n");
4966 /* If forbid_utf is non-zero, we are running a non-UTF test. UTF and UCP are
5144 BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
5162 cb->callout_string_length, utf, outfile);
5188 cb->offset_vector[i+1] - cb->offset_vector[i], utf, f);
5203 PCHARS(pre_start, cb->subject, 0, cb->start_match, utf, f);
5214 current_position - cb->start_match, utf, f);
5219 utf, f);
5223 PCHARS(subject_length, cb->subject, 0, cb->subject_length, utf, NULL);
5270 PCHARSV(cb->mark, 0, -1, utf, outfile);
5300 utf TRUE for utf
5307 copy_and_get(BOOL utf, int capcount)
5345 PCHARSV(copybuffer, 0, length, utf, outfile);
5369 if (test_mode == PCRE16_MODE)(void)to16(nptr, utf, &cnl);
5372 if (test_mode == PCRE32_MODE)(void)to32(nptr, utf, &cnl);
5404 PCHARSV(copybuffer, 0, length, utf, outfile);
5431 PCHARSV(gotbuffer, 0, length, utf, outfile);
5456 if (test_mode == PCRE16_MODE)(void)to16(nptr, utf, &cnl);
5459 if (test_mode == PCRE32_MODE)(void)to32(nptr, utf, &cnl);
5477 PCHARSV(gotbuffer, 0, length, utf, outfile);
5506 PCHARSV(stringlist[i], 0, lengths[i], utf, outfile);
5541 BOOL utf;
5568 utf = ((((pat_patctl.control & CTL_POSIX) != 0)?
5572 utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
5582 /* Check that the data is well-formed UTF-8 if we're in UTF mode. To create
5585 if (utf)
5593 fprintf(outfile, "** Failed: invalid UTF-8 string cannot be used as input "
5594 "in UTF mode\n");
5625 buffer of the appropriate width. In UTF mode, input can be UTF-8. */
5698 if (utf && HASUTF8EXTRALEN(c)) { GETUTF8INC(c, p); }
5766 allows UTF-8 characters to be constructed byte by byte, and also allows
5767 invalid UTF-8 sequences to be made. Just copy the byte in UTF-8 mode.
5777 if (utf && (test_mode == PCRE8_MODE))
5810 In 8-bit mode we convert to UTF-8 if we are in UTF mode. Values greater
5811 than 127 in UTF mode must have come from \x{...} or octal constructs
5812 because values from \x.. get this far only in non-UTF mode. */
5817 if (utf)
5822 "and so cannot be converted to UTF-8\n", c);
5832 "and UTF-8 mode is not enabled.\n", c);
5843 if (utf)
5848 "0x10ffff and so cannot be converted to UTF-16\n", c);
5865 "and UTF-16 mode is not enabled.\n", c);
5989 pmatch[i].rm_eo - pmatch[i].rm_so, utf, outfile);
5996 utf, outfile);
6164 escape processing is done for replacements. In UTF mode, check for an invalid
6165 UTF-8 input string, and if it is invalid, just copy its code units without
6166 UTF interpretation. This provides a means of checking that an invalid string
6167 is detected. Otherwise, UTF-8 can be used to include wide characters in a
6170 if (utf) badutf = valid_utf(pr, strlen((const char *)pr), &erroroffset);
6172 /* Not UTF or invalid UTF-8: just copy the code units. */
6174 if (!utf || badutf)
6190 /* Valid UTF-8 replacement string */
6243 PCHARSV(nbuffer, 0, nsize, utf, outfile);
6518 PCHARS(lleft, pp, leftchar, start - leftchar, utf, outfile);
6519 PCHARS(lmiddle, pp, start, end - start, utf, outfile);
6520 PCHARS(lright, pp, end, rightchar - end, utf, outfile);
6537 PCHARS(lleft, pp, startchar, start - startchar, utf, outfile);
6538 PCHARSV(pp, start, end - start, utf, outfile);
6552 PCHARSV(pp, start, end - start, utf, outfile);
6562 PCHARSV(pp, start, end - start, utf, outfile);
6574 PCHARSV(pp, ovector[i+1], ulen - ovector[i+1], utf, outfile);
6585 PCHARSV(CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
6591 copy_and_get(utf, capcount);
6609 PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf,
6616 poffset = backchars(pp, ovector[0], maxlookbehind, utf);
6617 PCHARS(backlength, pp, poffset, ovector[0] - poffset, utf, outfile);
6618 PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
6634 copy_and_get(utf, 1);
6651 Otherwise, in the case of UTF-8 or UTF-16 matching, the advance must be one
6667 else if (utf && test_mode != PCRE32_MODE)
6702 PCHARSV(CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
6711 fprintf(outfile, "Error %d (bad UTF-%d offset)\n", capcount, test_mode);
6774 if (utf && test_mode != PCRE32_MODE)
6918 printf(" unicode Unicode and UTF support enabled [0, 1]\n");
7038 printf(" UTF and UCP support (");