1 /* gutf8.c - Operations on UTF-8 strings. 2 * 3 * Copyright (C) 1999 Tom Tromey 4 * Copyright (C) 2000 Red Hat, Inc. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, write to the 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 19 * Boston, MA 02111-1307, USA. 20 */ 21 22 #include "config.h" 23 24 #include <stdlib.h> 25 #ifndef ANDROID_STUB 26 #ifdef HAVE_CODESET 27 #include <langinfo.h> 28 #endif 29 #endif 30 #include <string.h> 31 32 #include "glib.h" 33 34 #ifdef G_PLATFORM_WIN32 35 #include <stdio.h> 36 #define STRICT 37 #include <windows.h> 38 #undef STRICT 39 #endif 40 41 #ifndef ANDROID_STUB 42 #include "libcharset/libcharset.h" 43 #endif 44 45 #include "glibintl.h" 46 #include "galias.h" 47 48 #define UTF8_COMPUTE(Char, Mask, Len) \ 49 if (Char < 128) \ 50 { \ 51 Len = 1; \ 52 Mask = 0x7f; \ 53 } \ 54 else if ((Char & 0xe0) == 0xc0) \ 55 { \ 56 Len = 2; \ 57 Mask = 0x1f; \ 58 } \ 59 else if ((Char & 0xf0) == 0xe0) \ 60 { \ 61 Len = 3; \ 62 Mask = 0x0f; \ 63 } \ 64 else if ((Char & 0xf8) == 0xf0) \ 65 { \ 66 Len = 4; \ 67 Mask = 0x07; \ 68 } \ 69 else if ((Char & 0xfc) == 0xf8) \ 70 { \ 71 Len = 5; \ 72 Mask = 0x03; \ 73 } \ 74 else if ((Char & 0xfe) == 0xfc) \ 75 { \ 76 Len = 6; \ 77 Mask = 0x01; \ 78 } \ 79 else \ 80 Len = -1; 81 82 #define UTF8_LENGTH(Char) \ 83 ((Char) < 0x80 ? 1 : \ 84 ((Char) < 0x800 ? 2 : \ 85 ((Char) < 0x10000 ? 3 : \ 86 ((Char) < 0x200000 ? 4 : \ 87 ((Char) < 0x4000000 ? 5 : 6))))) 88 89 90 #define UTF8_GET(Result, Chars, Count, Mask, Len) \ 91 (Result) = (Chars)[0] & (Mask); \ 92 for ((Count) = 1; (Count) < (Len); ++(Count)) \ 93 { \ 94 if (((Chars)[(Count)] & 0xc0) != 0x80) \ 95 { \ 96 (Result) = -1; \ 97 break; \ 98 } \ 99 (Result) <<= 6; \ 100 (Result) |= ((Chars)[(Count)] & 0x3f); \ 101 } 102 103 #define UNICODE_VALID(Char) \ 104 ((Char) < 0x110000 && \ 105 (((Char) & 0xFFFFF800) != 0xD800) && \ 106 ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \ 107 ((Char) & 0xFFFE) != 0xFFFE) 108 109 110 static const gchar utf8_skip_data[256] = { 111 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 112 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 113 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 114 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 115 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 116 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 117 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 118 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 119 }; 120 121 const gchar * const g_utf8_skip = utf8_skip_data; 122 123 /** 124 * g_utf8_find_prev_char: 125 * @str: pointer to the beginning of a UTF-8 encoded string 126 * @p: pointer to some position within @str 127 * 128 * Given a position @p with a UTF-8 encoded string @str, find the start 129 * of the previous UTF-8 character starting before @p. Returns %NULL if no 130 * UTF-8 characters are present in @str before @p. 131 * 132 * @p does not have to be at the beginning of a UTF-8 character. No check 133 * is made to see if the character found is actually valid other than 134 * it starts with an appropriate byte. 135 * 136 * Return value: a pointer to the found character or %NULL. 137 **/ 138 gchar * 139 g_utf8_find_prev_char (const char *str, 140 const char *p) 141 { 142 for (--p; p >= str; --p) 143 { 144 if ((*p & 0xc0) != 0x80) 145 return (gchar *)p; 146 } 147 return NULL; 148 } 149 150 /** 151 * g_utf8_find_next_char: 152 * @p: a pointer to a position within a UTF-8 encoded string 153 * @end: a pointer to the byte following the end of the string, 154 * or %NULL to indicate that the string is nul-terminated. 155 * 156 * Finds the start of the next UTF-8 character in the string after @p. 157 * 158 * @p does not have to be at the beginning of a UTF-8 character. No check 159 * is made to see if the character found is actually valid other than 160 * it starts with an appropriate byte. 161 * 162 * Return value: a pointer to the found character or %NULL 163 **/ 164 gchar * 165 g_utf8_find_next_char (const gchar *p, 166 const gchar *end) 167 { 168 if (*p) 169 { 170 if (end) 171 for (++p; p < end && (*p & 0xc0) == 0x80; ++p) 172 ; 173 else 174 for (++p; (*p & 0xc0) == 0x80; ++p) 175 ; 176 } 177 return (p == end) ? NULL : (gchar *)p; 178 } 179 180 /** 181 * g_utf8_prev_char: 182 * @p: a pointer to a position within a UTF-8 encoded string 183 * 184 * Finds the previous UTF-8 character in the string before @p. 185 * 186 * @p does not have to be at the beginning of a UTF-8 character. No check 187 * is made to see if the character found is actually valid other than 188 * it starts with an appropriate byte. If @p might be the first 189 * character of the string, you must use g_utf8_find_prev_char() instead. 190 * 191 * Return value: a pointer to the found character. 192 **/ 193 gchar * 194 g_utf8_prev_char (const gchar *p) 195 { 196 while (TRUE) 197 { 198 p--; 199 if ((*p & 0xc0) != 0x80) 200 return (gchar *)p; 201 } 202 } 203 204 /** 205 * g_utf8_strlen: 206 * @p: pointer to the start of a UTF-8 encoded string. 207 * @max: the maximum number of bytes to examine. If @max 208 * is less than 0, then the string is assumed to be 209 * nul-terminated. If @max is 0, @p will not be examined and 210 * may be %NULL. 211 * 212 * Returns the length of the string in characters. 213 * 214 * Return value: the length of the string in characters 215 **/ 216 glong 217 g_utf8_strlen (const gchar *p, 218 gssize max) 219 { 220 glong len = 0; 221 const gchar *start = p; 222 g_return_val_if_fail (p != NULL || max == 0, 0); 223 224 if (max < 0) 225 { 226 while (*p) 227 { 228 p = g_utf8_next_char (p); 229 ++len; 230 } 231 } 232 else 233 { 234 if (max == 0 || !*p) 235 return 0; 236 237 p = g_utf8_next_char (p); 238 239 while (p - start < max && *p) 240 { 241 ++len; 242 p = g_utf8_next_char (p); 243 } 244 245 /* only do the last len increment if we got a complete 246 * char (don't count partial chars) 247 */ 248 if (p - start <= max) 249 ++len; 250 } 251 252 return len; 253 } 254 255 /** 256 * g_utf8_get_char: 257 * @p: a pointer to Unicode character encoded as UTF-8 258 * 259 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. 260 * If @p does not point to a valid UTF-8 encoded character, results are 261 * undefined. If you are not sure that the bytes are complete 262 * valid Unicode characters, you should use g_utf8_get_char_validated() 263 * instead. 264 * 265 * Return value: the resulting character 266 **/ 267 gunichar 268 g_utf8_get_char (const gchar *p) 269 { 270 int i, mask = 0, len; 271 gunichar result; 272 unsigned char c = (unsigned char) *p; 273 274 UTF8_COMPUTE (c, mask, len); 275 if (len == -1) 276 return (gunichar)-1; 277 UTF8_GET (result, p, i, mask, len); 278 279 return result; 280 } 281 282 /** 283 * g_utf8_offset_to_pointer: 284 * @str: a UTF-8 encoded string 285 * @offset: a character offset within @str 286 * 287 * Converts from an integer character offset to a pointer to a position 288 * within the string. 289 * 290 * Since 2.10, this function allows to pass a negative @offset to 291 * step backwards. It is usually worth stepping backwards from the end 292 * instead of forwards if @offset is in the last fourth of the string, 293 * since moving forward is about 3 times faster than moving backward. 294 * 295 * <note><para> 296 * This function doesn't abort when reaching the end of @str. Therefore 297 * you should be sure that @offset is within string boundaries before 298 * calling that function. Call g_utf8_strlen() when unsure. 299 * 300 * This limitation exists as this function is called frequently during 301 * text rendering and therefore has to be as fast as possible. 302 * </para></note> 303 * 304 * Return value: the resulting pointer 305 **/ 306 gchar * 307 g_utf8_offset_to_pointer (const gchar *str, 308 glong offset) 309 { 310 const gchar *s = str; 311 312 if (offset > 0) 313 while (offset--) 314 s = g_utf8_next_char (s); 315 else 316 { 317 const char *s1; 318 319 /* This nice technique for fast backwards stepping 320 * through a UTF-8 string was dubbed "stutter stepping" 321 * by its inventor, Larry Ewing. 322 */ 323 while (offset) 324 { 325 s1 = s; 326 s += offset; 327 while ((*s & 0xc0) == 0x80) 328 s--; 329 330 offset += g_utf8_pointer_to_offset (s, s1); 331 } 332 } 333 334 return (gchar *)s; 335 } 336 337 /** 338 * g_utf8_pointer_to_offset: 339 * @str: a UTF-8 encoded string 340 * @pos: a pointer to a position within @str 341 * 342 * Converts from a pointer to position within a string to a integer 343 * character offset. 344 * 345 * Since 2.10, this function allows @pos to be before @str, and returns 346 * a negative offset in this case. 347 * 348 * Return value: the resulting character offset 349 **/ 350 glong 351 g_utf8_pointer_to_offset (const gchar *str, 352 const gchar *pos) 353 { 354 const gchar *s = str; 355 glong offset = 0; 356 357 if (pos < str) 358 offset = - g_utf8_pointer_to_offset (pos, str); 359 else 360 while (s < pos) 361 { 362 s = g_utf8_next_char (s); 363 offset++; 364 } 365 366 return offset; 367 } 368 369 370 /** 371 * g_utf8_strncpy: 372 * @dest: buffer to fill with characters from @src 373 * @src: UTF-8 encoded string 374 * @n: character count 375 * 376 * Like the standard C strncpy() function, but 377 * copies a given number of characters instead of a given number of 378 * bytes. The @src string must be valid UTF-8 encoded text. 379 * (Use g_utf8_validate() on all text before trying to use UTF-8 380 * utility functions with it.) 381 * 382 * Return value: @dest 383 **/ 384 gchar * 385 g_utf8_strncpy (gchar *dest, 386 const gchar *src, 387 gsize n) 388 { 389 const gchar *s = src; 390 while (n && *s) 391 { 392 s = g_utf8_next_char(s); 393 n--; 394 } 395 strncpy(dest, src, s - src); 396 dest[s - src] = 0; 397 return dest; 398 } 399 400 G_LOCK_DEFINE_STATIC (aliases); 401 402 static GHashTable * 403 get_alias_hash (void) 404 { 405 static GHashTable *alias_hash = NULL; 406 const char *aliases; 407 408 G_LOCK (aliases); 409 410 if (!alias_hash) 411 { 412 alias_hash = g_hash_table_new (g_str_hash, g_str_equal); 413 414 aliases = _g_locale_get_charset_aliases (); 415 while (*aliases != '\0') 416 { 417 const char *canonical; 418 const char *alias; 419 const char **alias_array; 420 int count = 0; 421 422 alias = aliases; 423 aliases += strlen (aliases) + 1; 424 canonical = aliases; 425 aliases += strlen (aliases) + 1; 426 427 alias_array = g_hash_table_lookup (alias_hash, canonical); 428 if (alias_array) 429 { 430 while (alias_array[count]) 431 count++; 432 } 433 434 alias_array = g_renew (const char *, alias_array, count + 2); 435 alias_array[count] = alias; 436 alias_array[count + 1] = NULL; 437 438 g_hash_table_insert (alias_hash, (char *)canonical, alias_array); 439 } 440 } 441 442 G_UNLOCK (aliases); 443 444 return alias_hash; 445 } 446 447 /* As an abuse of the alias table, the following routines gets 448 * the charsets that are aliases for the canonical name. 449 */ 450 #ifndef ANDROID_STUB 451 G_GNUC_INTERNAL const char ** 452 _g_charset_get_aliases (const char *canonical_name) 453 { 454 GHashTable *alias_hash = get_alias_hash (); 455 456 return g_hash_table_lookup (alias_hash, canonical_name); 457 } 458 #endif 459 460 static gboolean 461 g_utf8_get_charset_internal (const char *raw_data, 462 const char **a) 463 { 464 const char *charset = getenv("CHARSET"); 465 466 if (charset && *charset) 467 { 468 *a = charset; 469 470 if (charset && strstr (charset, "UTF-8")) 471 return TRUE; 472 else 473 return FALSE; 474 } 475 476 /* The libcharset code tries to be thread-safe without 477 * a lock, but has a memory leak and a missing memory 478 * barrier, so we lock for it 479 */ 480 #ifndef ANDROID_STUB 481 G_LOCK (aliases); 482 charset = _g_locale_charset_unalias (raw_data); 483 G_UNLOCK (aliases); 484 485 if (charset && *charset) 486 { 487 *a = charset; 488 489 if (charset && strstr (charset, "UTF-8")) 490 return TRUE; 491 else 492 return FALSE; 493 } 494 #endif 495 496 /* Assume this for compatibility at present. */ 497 *a = "US-ASCII"; 498 499 return FALSE; 500 } 501 502 typedef struct _GCharsetCache GCharsetCache; 503 504 struct _GCharsetCache { 505 gboolean is_utf8; 506 gchar *raw; 507 gchar *charset; 508 }; 509 510 static void 511 charset_cache_free (gpointer data) 512 { 513 GCharsetCache *cache = data; 514 g_free (cache->raw); 515 g_free (cache->charset); 516 g_free (cache); 517 } 518 519 /** 520 * g_get_charset: 521 * @charset: return location for character set name 522 * 523 * Obtains the character set for the <link linkend="setlocale">current 524 * locale</link>; you might use this character set as an argument to 525 * g_convert(), to convert from the current locale's encoding to some 526 * other encoding. (Frequently g_locale_to_utf8() and g_locale_from_utf8() 527 * are nice shortcuts, though.) 528 * 529 * On Windows the character set returned by this function is the 530 * so-called system default ANSI code-page. That is the character set 531 * used by the "narrow" versions of C library and Win32 functions that 532 * handle file names. It might be different from the character set 533 * used by the C library's current locale. 534 * 535 * The return value is %TRUE if the locale's encoding is UTF-8, in that 536 * case you can perhaps avoid calling g_convert(). 537 * 538 * The string returned in @charset is not allocated, and should not be 539 * freed. 540 * 541 * Return value: %TRUE if the returned charset is UTF-8 542 **/ 543 gboolean 544 g_get_charset (G_CONST_RETURN char **charset) 545 { 546 static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; 547 GCharsetCache *cache = g_static_private_get (&cache_private); 548 const gchar *raw; 549 550 if (!cache) 551 { 552 cache = g_new0 (GCharsetCache, 1); 553 g_static_private_set (&cache_private, cache, charset_cache_free); 554 } 555 556 #ifndef ANDROID_STUB 557 raw = _g_locale_charset_raw (); 558 559 if (!(cache->raw && strcmp (cache->raw, raw) == 0)) 560 { 561 const gchar *new_charset; 562 563 g_free (cache->raw); 564 g_free (cache->charset); 565 cache->raw = g_strdup (raw); 566 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset); 567 cache->charset = g_strdup (new_charset); 568 } 569 #else 570 cache->charset = g_strdup("UTF-8"); 571 cache->is_utf8 = TRUE; 572 #endif 573 if (charset) 574 *charset = cache->charset; 575 576 return cache->is_utf8; 577 } 578 579 /* unicode_strchr */ 580 581 /** 582 * g_unichar_to_utf8: 583 * @c: a Unicode character code 584 * @outbuf: output buffer, must have at least 6 bytes of space. 585 * If %NULL, the length will be computed and returned 586 * and nothing will be written to @outbuf. 587 * 588 * Converts a single character to UTF-8. 589 * 590 * Return value: number of bytes written 591 **/ 592 int 593 g_unichar_to_utf8 (gunichar c, 594 gchar *outbuf) 595 { 596 /* If this gets modified, also update the copy in g_string_insert_unichar() */ 597 guint len = 0; 598 int first; 599 int i; 600 601 if (c < 0x80) 602 { 603 first = 0; 604 len = 1; 605 } 606 else if (c < 0x800) 607 { 608 first = 0xc0; 609 len = 2; 610 } 611 else if (c < 0x10000) 612 { 613 first = 0xe0; 614 len = 3; 615 } 616 else if (c < 0x200000) 617 { 618 first = 0xf0; 619 len = 4; 620 } 621 else if (c < 0x4000000) 622 { 623 first = 0xf8; 624 len = 5; 625 } 626 else 627 { 628 first = 0xfc; 629 len = 6; 630 } 631 632 if (outbuf) 633 { 634 for (i = len - 1; i > 0; --i) 635 { 636 outbuf[i] = (c & 0x3f) | 0x80; 637 c >>= 6; 638 } 639 outbuf[0] = c | first; 640 } 641 642 return len; 643 } 644 645 /** 646 * g_utf8_strchr: 647 * @p: a nul-terminated UTF-8 encoded string 648 * @len: the maximum length of @p 649 * @c: a Unicode character 650 * 651 * Finds the leftmost occurrence of the given Unicode character 652 * in a UTF-8 encoded string, while limiting the search to @len bytes. 653 * If @len is -1, allow unbounded search. 654 * 655 * Return value: %NULL if the string does not contain the character, 656 * otherwise, a pointer to the start of the leftmost occurrence of 657 * the character in the string. 658 **/ 659 gchar * 660 g_utf8_strchr (const char *p, 661 gssize len, 662 gunichar c) 663 { 664 gchar ch[10]; 665 666 gint charlen = g_unichar_to_utf8 (c, ch); 667 ch[charlen] = '\0'; 668 669 return g_strstr_len (p, len, ch); 670 } 671 672 673 /** 674 * g_utf8_strrchr: 675 * @p: a nul-terminated UTF-8 encoded string 676 * @len: the maximum length of @p 677 * @c: a Unicode character 678 * 679 * Find the rightmost occurrence of the given Unicode character 680 * in a UTF-8 encoded string, while limiting the search to @len bytes. 681 * If @len is -1, allow unbounded search. 682 * 683 * Return value: %NULL if the string does not contain the character, 684 * otherwise, a pointer to the start of the rightmost occurrence of the 685 * character in the string. 686 **/ 687 gchar * 688 g_utf8_strrchr (const char *p, 689 gssize len, 690 gunichar c) 691 { 692 gchar ch[10]; 693 694 gint charlen = g_unichar_to_utf8 (c, ch); 695 ch[charlen] = '\0'; 696 697 return g_strrstr_len (p, len, ch); 698 } 699 700 701 /* Like g_utf8_get_char, but take a maximum length 702 * and return (gunichar)-2 on incomplete trailing character 703 */ 704 static inline gunichar 705 g_utf8_get_char_extended (const gchar *p, 706 gssize max_len) 707 { 708 guint i, len; 709 gunichar wc = (guchar) *p; 710 711 if (wc < 0x80) 712 { 713 return wc; 714 } 715 else if (wc < 0xc0) 716 { 717 return (gunichar)-1; 718 } 719 else if (wc < 0xe0) 720 { 721 len = 2; 722 wc &= 0x1f; 723 } 724 else if (wc < 0xf0) 725 { 726 len = 3; 727 wc &= 0x0f; 728 } 729 else if (wc < 0xf8) 730 { 731 len = 4; 732 wc &= 0x07; 733 } 734 else if (wc < 0xfc) 735 { 736 len = 5; 737 wc &= 0x03; 738 } 739 else if (wc < 0xfe) 740 { 741 len = 6; 742 wc &= 0x01; 743 } 744 else 745 { 746 return (gunichar)-1; 747 } 748 749 if (max_len >= 0 && len > max_len) 750 { 751 for (i = 1; i < max_len; i++) 752 { 753 if ((((guchar *)p)[i] & 0xc0) != 0x80) 754 return (gunichar)-1; 755 } 756 return (gunichar)-2; 757 } 758 759 for (i = 1; i < len; ++i) 760 { 761 gunichar ch = ((guchar *)p)[i]; 762 763 if ((ch & 0xc0) != 0x80) 764 { 765 if (ch) 766 return (gunichar)-1; 767 else 768 return (gunichar)-2; 769 } 770 771 wc <<= 6; 772 wc |= (ch & 0x3f); 773 } 774 775 if (UTF8_LENGTH(wc) != len) 776 return (gunichar)-1; 777 778 return wc; 779 } 780 781 /** 782 * g_utf8_get_char_validated: 783 * @p: a pointer to Unicode character encoded as UTF-8 784 * @max_len: the maximum number of bytes to read, or -1, for no maximum or 785 * if @p is nul-terminated 786 * 787 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character. 788 * This function checks for incomplete characters, for invalid characters 789 * such as characters that are out of the range of Unicode, and for 790 * overlong encodings of valid characters. 791 * 792 * Return value: the resulting character. If @p points to a partial 793 * sequence at the end of a string that could begin a valid 794 * character (or if @max_len is zero), returns (gunichar)-2; 795 * otherwise, if @p does not point to a valid UTF-8 encoded 796 * Unicode character, returns (gunichar)-1. 797 **/ 798 gunichar 799 g_utf8_get_char_validated (const gchar *p, 800 gssize max_len) 801 { 802 gunichar result; 803 804 if (max_len == 0) 805 return (gunichar)-2; 806 807 result = g_utf8_get_char_extended (p, max_len); 808 809 if (result & 0x80000000) 810 return result; 811 else if (!UNICODE_VALID (result)) 812 return (gunichar)-1; 813 else 814 return result; 815 } 816 817 /** 818 * g_utf8_to_ucs4_fast: 819 * @str: a UTF-8 encoded string 820 * @len: the maximum length of @str to use, in bytes. If @len < 0, 821 * then the string is nul-terminated. 822 * @items_written: location to store the number of characters in the 823 * result, or %NULL. 824 * 825 * Convert a string from UTF-8 to a 32-bit fixed width 826 * representation as UCS-4, assuming valid UTF-8 input. 827 * This function is roughly twice as fast as g_utf8_to_ucs4() 828 * but does no error checking on the input. 829 * 830 * Return value: a pointer to a newly allocated UCS-4 string. 831 * This value must be freed with g_free(). 832 **/ 833 gunichar * 834 g_utf8_to_ucs4_fast (const gchar *str, 835 glong len, 836 glong *items_written) 837 { 838 gint j, charlen; 839 gunichar *result; 840 gint n_chars, i; 841 const gchar *p; 842 843 g_return_val_if_fail (str != NULL, NULL); 844 845 p = str; 846 n_chars = 0; 847 if (len < 0) 848 { 849 while (*p) 850 { 851 p = g_utf8_next_char (p); 852 ++n_chars; 853 } 854 } 855 else 856 { 857 while (p < str + len && *p) 858 { 859 p = g_utf8_next_char (p); 860 ++n_chars; 861 } 862 } 863 864 result = g_new (gunichar, n_chars + 1); 865 866 p = str; 867 for (i=0; i < n_chars; i++) 868 { 869 gunichar wc = ((unsigned char *)p)[0]; 870 871 if (wc < 0x80) 872 { 873 result[i] = wc; 874 p++; 875 } 876 else 877 { 878 if (wc < 0xe0) 879 { 880 charlen = 2; 881 wc &= 0x1f; 882 } 883 else if (wc < 0xf0) 884 { 885 charlen = 3; 886 wc &= 0x0f; 887 } 888 else if (wc < 0xf8) 889 { 890 charlen = 4; 891 wc &= 0x07; 892 } 893 else if (wc < 0xfc) 894 { 895 charlen = 5; 896 wc &= 0x03; 897 } 898 else 899 { 900 charlen = 6; 901 wc &= 0x01; 902 } 903 904 for (j = 1; j < charlen; j++) 905 { 906 wc <<= 6; 907 wc |= ((unsigned char *)p)[j] & 0x3f; 908 } 909 910 result[i] = wc; 911 p += charlen; 912 } 913 } 914 result[i] = 0; 915 916 if (items_written) 917 *items_written = i; 918 919 return result; 920 } 921 922 /** 923 * g_utf8_to_ucs4: 924 * @str: a UTF-8 encoded string 925 * @len: the maximum length of @str to use, in bytes. If @len < 0, 926 * then the string is nul-terminated. 927 * @items_read: location to store number of bytes read, or %NULL. 928 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 929 * returned in case @str contains a trailing partial 930 * character. If an error occurs then the index of the 931 * invalid input is stored here. 932 * @items_written: location to store number of characters written or %NULL. 933 * The value here stored does not include the trailing 0 934 * character. 935 * @error: location to store the error occuring, or %NULL to ignore 936 * errors. Any of the errors in #GConvertError other than 937 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 938 * 939 * Convert a string from UTF-8 to a 32-bit fixed width 940 * representation as UCS-4. A trailing 0 will be added to the 941 * string after the converted text. 942 * 943 * Return value: a pointer to a newly allocated UCS-4 string. 944 * This value must be freed with g_free(). If an 945 * error occurs, %NULL will be returned and 946 * @error set. 947 **/ 948 gunichar * 949 g_utf8_to_ucs4 (const gchar *str, 950 glong len, 951 glong *items_read, 952 glong *items_written, 953 GError **error) 954 { 955 gunichar *result = NULL; 956 gint n_chars, i; 957 const gchar *in; 958 959 in = str; 960 n_chars = 0; 961 while ((len < 0 || str + len - in > 0) && *in) 962 { 963 gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); 964 if (wc & 0x80000000) 965 { 966 if (wc == (gunichar)-2) 967 { 968 if (items_read) 969 break; 970 else 971 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 972 _("Partial character sequence at end of input")); 973 } 974 else 975 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 976 _("Invalid byte sequence in conversion input")); 977 978 goto err_out; 979 } 980 981 n_chars++; 982 983 in = g_utf8_next_char (in); 984 } 985 986 result = g_new (gunichar, n_chars + 1); 987 988 in = str; 989 for (i=0; i < n_chars; i++) 990 { 991 result[i] = g_utf8_get_char (in); 992 in = g_utf8_next_char (in); 993 } 994 result[i] = 0; 995 996 if (items_written) 997 *items_written = n_chars; 998 999 err_out: 1000 if (items_read) 1001 *items_read = in - str; 1002 1003 return result; 1004 } 1005 1006 /** 1007 * g_ucs4_to_utf8: 1008 * @str: a UCS-4 encoded string 1009 * @len: the maximum length (number of characters) of @str to use. 1010 * If @len < 0, then the string is nul-terminated. 1011 * @items_read: location to store number of characters read, or %NULL. 1012 * @items_written: location to store number of bytes written or %NULL. 1013 * The value here stored does not include the trailing 0 1014 * byte. 1015 * @error: location to store the error occuring, or %NULL to ignore 1016 * errors. Any of the errors in #GConvertError other than 1017 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1018 * 1019 * Convert a string from a 32-bit fixed width representation as UCS-4. 1020 * to UTF-8. The result will be terminated with a 0 byte. 1021 * 1022 * Return value: a pointer to a newly allocated UTF-8 string. 1023 * This value must be freed with g_free(). If an 1024 * error occurs, %NULL will be returned and 1025 * @error set. In that case, @items_read will be 1026 * set to the position of the first invalid input 1027 * character. 1028 **/ 1029 gchar * 1030 g_ucs4_to_utf8 (const gunichar *str, 1031 glong len, 1032 glong *items_read, 1033 glong *items_written, 1034 GError **error) 1035 { 1036 gint result_length; 1037 gchar *result = NULL; 1038 gchar *p; 1039 gint i; 1040 1041 result_length = 0; 1042 for (i = 0; len < 0 || i < len ; i++) 1043 { 1044 if (!str[i]) 1045 break; 1046 1047 if (str[i] >= 0x80000000) 1048 { 1049 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1050 _("Character out of range for UTF-8")); 1051 goto err_out; 1052 } 1053 1054 result_length += UTF8_LENGTH (str[i]); 1055 } 1056 1057 result = g_malloc (result_length + 1); 1058 p = result; 1059 1060 i = 0; 1061 while (p < result + result_length) 1062 p += g_unichar_to_utf8 (str[i++], p); 1063 1064 *p = '\0'; 1065 1066 if (items_written) 1067 *items_written = p - result; 1068 1069 err_out: 1070 if (items_read) 1071 *items_read = i; 1072 1073 return result; 1074 } 1075 1076 #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000) 1077 1078 /** 1079 * g_utf16_to_utf8: 1080 * @str: a UTF-16 encoded string 1081 * @len: the maximum length (number of <type>gunichar2</type>) of @str to use. 1082 * If @len < 0, then the string is nul-terminated. 1083 * @items_read: location to store number of words read, or %NULL. 1084 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1085 * returned in case @str contains a trailing partial 1086 * character. If an error occurs then the index of the 1087 * invalid input is stored here. 1088 * @items_written: location to store number of bytes written, or %NULL. 1089 * The value stored here does not include the trailing 1090 * 0 byte. 1091 * @error: location to store the error occuring, or %NULL to ignore 1092 * errors. Any of the errors in #GConvertError other than 1093 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1094 * 1095 * Convert a string from UTF-16 to UTF-8. The result will be 1096 * terminated with a 0 byte. 1097 * 1098 * Note that the input is expected to be already in native endianness, 1099 * an initial byte-order-mark character is not handled specially. 1100 * g_convert() can be used to convert a byte buffer of UTF-16 data of 1101 * ambiguous endianess. 1102 * 1103 * Return value: a pointer to a newly allocated UTF-8 string. 1104 * This value must be freed with g_free(). If an 1105 * error occurs, %NULL will be returned and 1106 * @error set. 1107 **/ 1108 gchar * 1109 g_utf16_to_utf8 (const gunichar2 *str, 1110 glong len, 1111 glong *items_read, 1112 glong *items_written, 1113 GError **error) 1114 { 1115 /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ 1116 * are marked. 1117 */ 1118 const gunichar2 *in; 1119 gchar *out; 1120 gchar *result = NULL; 1121 gint n_bytes; 1122 gunichar high_surrogate; 1123 1124 g_return_val_if_fail (str != NULL, NULL); 1125 1126 n_bytes = 0; 1127 in = str; 1128 high_surrogate = 0; 1129 while ((len < 0 || in - str < len) && *in) 1130 { 1131 gunichar2 c = *in; 1132 gunichar wc; 1133 1134 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1135 { 1136 if (high_surrogate) 1137 { 1138 wc = SURROGATE_VALUE (high_surrogate, c); 1139 high_surrogate = 0; 1140 } 1141 else 1142 { 1143 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1144 _("Invalid sequence in conversion input")); 1145 goto err_out; 1146 } 1147 } 1148 else 1149 { 1150 if (high_surrogate) 1151 { 1152 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1153 _("Invalid sequence in conversion input")); 1154 goto err_out; 1155 } 1156 1157 if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1158 { 1159 high_surrogate = c; 1160 goto next1; 1161 } 1162 else 1163 wc = c; 1164 } 1165 1166 /********** DIFFERENT for UTF8/UCS4 **********/ 1167 n_bytes += UTF8_LENGTH (wc); 1168 1169 next1: 1170 in++; 1171 } 1172 1173 if (high_surrogate && !items_read) 1174 { 1175 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 1176 _("Partial character sequence at end of input")); 1177 goto err_out; 1178 } 1179 1180 /* At this point, everything is valid, and we just need to convert 1181 */ 1182 /********** DIFFERENT for UTF8/UCS4 **********/ 1183 result = g_malloc (n_bytes + 1); 1184 1185 high_surrogate = 0; 1186 out = result; 1187 in = str; 1188 while (out < result + n_bytes) 1189 { 1190 gunichar2 c = *in; 1191 gunichar wc; 1192 1193 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1194 { 1195 wc = SURROGATE_VALUE (high_surrogate, c); 1196 high_surrogate = 0; 1197 } 1198 else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1199 { 1200 high_surrogate = c; 1201 goto next2; 1202 } 1203 else 1204 wc = c; 1205 1206 /********** DIFFERENT for UTF8/UCS4 **********/ 1207 out += g_unichar_to_utf8 (wc, out); 1208 1209 next2: 1210 in++; 1211 } 1212 1213 /********** DIFFERENT for UTF8/UCS4 **********/ 1214 *out = '\0'; 1215 1216 if (items_written) 1217 /********** DIFFERENT for UTF8/UCS4 **********/ 1218 *items_written = out - result; 1219 1220 err_out: 1221 if (items_read) 1222 *items_read = in - str; 1223 1224 return result; 1225 } 1226 1227 /** 1228 * g_utf16_to_ucs4: 1229 * @str: a UTF-16 encoded string 1230 * @len: the maximum length (number of <type>gunichar2</type>) of @str to use. 1231 * If @len < 0, then the string is nul-terminated. 1232 * @items_read: location to store number of words read, or %NULL. 1233 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1234 * returned in case @str contains a trailing partial 1235 * character. If an error occurs then the index of the 1236 * invalid input is stored here. 1237 * @items_written: location to store number of characters written, or %NULL. 1238 * The value stored here does not include the trailing 1239 * 0 character. 1240 * @error: location to store the error occuring, or %NULL to ignore 1241 * errors. Any of the errors in #GConvertError other than 1242 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1243 * 1244 * Convert a string from UTF-16 to UCS-4. The result will be 1245 * nul-terminated. 1246 * 1247 * Return value: a pointer to a newly allocated UCS-4 string. 1248 * This value must be freed with g_free(). If an 1249 * error occurs, %NULL will be returned and 1250 * @error set. 1251 **/ 1252 gunichar * 1253 g_utf16_to_ucs4 (const gunichar2 *str, 1254 glong len, 1255 glong *items_read, 1256 glong *items_written, 1257 GError **error) 1258 { 1259 const gunichar2 *in; 1260 gchar *out; 1261 gchar *result = NULL; 1262 gint n_bytes; 1263 gunichar high_surrogate; 1264 1265 g_return_val_if_fail (str != NULL, NULL); 1266 1267 n_bytes = 0; 1268 in = str; 1269 high_surrogate = 0; 1270 while ((len < 0 || in - str < len) && *in) 1271 { 1272 gunichar2 c = *in; 1273 gunichar wc; 1274 1275 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1276 { 1277 if (high_surrogate) 1278 { 1279 wc = SURROGATE_VALUE (high_surrogate, c); 1280 high_surrogate = 0; 1281 } 1282 else 1283 { 1284 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1285 _("Invalid sequence in conversion input")); 1286 goto err_out; 1287 } 1288 } 1289 else 1290 { 1291 if (high_surrogate) 1292 { 1293 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1294 _("Invalid sequence in conversion input")); 1295 goto err_out; 1296 } 1297 1298 if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1299 { 1300 high_surrogate = c; 1301 goto next1; 1302 } 1303 else 1304 wc = c; 1305 } 1306 1307 /********** DIFFERENT for UTF8/UCS4 **********/ 1308 n_bytes += sizeof (gunichar); 1309 1310 next1: 1311 in++; 1312 } 1313 1314 if (high_surrogate && !items_read) 1315 { 1316 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 1317 _("Partial character sequence at end of input")); 1318 goto err_out; 1319 } 1320 1321 /* At this point, everything is valid, and we just need to convert 1322 */ 1323 /********** DIFFERENT for UTF8/UCS4 **********/ 1324 result = g_malloc (n_bytes + 4); 1325 1326 high_surrogate = 0; 1327 out = result; 1328 in = str; 1329 while (out < result + n_bytes) 1330 { 1331 gunichar2 c = *in; 1332 gunichar wc; 1333 1334 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ 1335 { 1336 wc = SURROGATE_VALUE (high_surrogate, c); 1337 high_surrogate = 0; 1338 } 1339 else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ 1340 { 1341 high_surrogate = c; 1342 goto next2; 1343 } 1344 else 1345 wc = c; 1346 1347 /********** DIFFERENT for UTF8/UCS4 **********/ 1348 *(gunichar *)out = wc; 1349 out += sizeof (gunichar); 1350 1351 next2: 1352 in++; 1353 } 1354 1355 /********** DIFFERENT for UTF8/UCS4 **********/ 1356 *(gunichar *)out = 0; 1357 1358 if (items_written) 1359 /********** DIFFERENT for UTF8/UCS4 **********/ 1360 *items_written = (out - result) / sizeof (gunichar); 1361 1362 err_out: 1363 if (items_read) 1364 *items_read = in - str; 1365 1366 return (gunichar *)result; 1367 } 1368 1369 /** 1370 * g_utf8_to_utf16: 1371 * @str: a UTF-8 encoded string 1372 * @len: the maximum length (number of characters) of @str to use. 1373 * If @len < 0, then the string is nul-terminated. 1374 * @items_read: location to store number of bytes read, or %NULL. 1375 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be 1376 * returned in case @str contains a trailing partial 1377 * character. If an error occurs then the index of the 1378 * invalid input is stored here. 1379 * @items_written: location to store number of <type>gunichar2</type> written, 1380 * or %NULL. 1381 * The value stored here does not include the trailing 0. 1382 * @error: location to store the error occuring, or %NULL to ignore 1383 * errors. Any of the errors in #GConvertError other than 1384 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1385 * 1386 * Convert a string from UTF-8 to UTF-16. A 0 character will be 1387 * added to the result after the converted text. 1388 * 1389 * Return value: a pointer to a newly allocated UTF-16 string. 1390 * This value must be freed with g_free(). If an 1391 * error occurs, %NULL will be returned and 1392 * @error set. 1393 **/ 1394 gunichar2 * 1395 g_utf8_to_utf16 (const gchar *str, 1396 glong len, 1397 glong *items_read, 1398 glong *items_written, 1399 GError **error) 1400 { 1401 gunichar2 *result = NULL; 1402 gint n16; 1403 const gchar *in; 1404 gint i; 1405 1406 g_return_val_if_fail (str != NULL, NULL); 1407 1408 in = str; 1409 n16 = 0; 1410 while ((len < 0 || str + len - in > 0) && *in) 1411 { 1412 gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); 1413 if (wc & 0x80000000) 1414 { 1415 if (wc == (gunichar)-2) 1416 { 1417 if (items_read) 1418 break; 1419 else 1420 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 1421 _("Partial character sequence at end of input")); 1422 } 1423 else 1424 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1425 _("Invalid byte sequence in conversion input")); 1426 1427 goto err_out; 1428 } 1429 1430 if (wc < 0xd800) 1431 n16 += 1; 1432 else if (wc < 0xe000) 1433 { 1434 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1435 _("Invalid sequence in conversion input")); 1436 1437 goto err_out; 1438 } 1439 else if (wc < 0x10000) 1440 n16 += 1; 1441 else if (wc < 0x110000) 1442 n16 += 2; 1443 else 1444 { 1445 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1446 _("Character out of range for UTF-16")); 1447 1448 goto err_out; 1449 } 1450 1451 in = g_utf8_next_char (in); 1452 } 1453 1454 result = g_new (gunichar2, n16 + 1); 1455 1456 in = str; 1457 for (i = 0; i < n16;) 1458 { 1459 gunichar wc = g_utf8_get_char (in); 1460 1461 if (wc < 0x10000) 1462 { 1463 result[i++] = wc; 1464 } 1465 else 1466 { 1467 result[i++] = (wc - 0x10000) / 0x400 + 0xd800; 1468 result[i++] = (wc - 0x10000) % 0x400 + 0xdc00; 1469 } 1470 1471 in = g_utf8_next_char (in); 1472 } 1473 1474 result[i] = 0; 1475 1476 if (items_written) 1477 *items_written = n16; 1478 1479 err_out: 1480 if (items_read) 1481 *items_read = in - str; 1482 1483 return result; 1484 } 1485 1486 /** 1487 * g_ucs4_to_utf16: 1488 * @str: a UCS-4 encoded string 1489 * @len: the maximum length (number of characters) of @str to use. 1490 * If @len < 0, then the string is nul-terminated. 1491 * @items_read: location to store number of bytes read, or %NULL. 1492 * If an error occurs then the index of the invalid input 1493 * is stored here. 1494 * @items_written: location to store number of <type>gunichar2</type> 1495 * written, or %NULL. The value stored here does not 1496 * include the trailing 0. 1497 * @error: location to store the error occuring, or %NULL to ignore 1498 * errors. Any of the errors in #GConvertError other than 1499 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 1500 * 1501 * Convert a string from UCS-4 to UTF-16. A 0 character will be 1502 * added to the result after the converted text. 1503 * 1504 * Return value: a pointer to a newly allocated UTF-16 string. 1505 * This value must be freed with g_free(). If an 1506 * error occurs, %NULL will be returned and 1507 * @error set. 1508 **/ 1509 gunichar2 * 1510 g_ucs4_to_utf16 (const gunichar *str, 1511 glong len, 1512 glong *items_read, 1513 glong *items_written, 1514 GError **error) 1515 { 1516 gunichar2 *result = NULL; 1517 gint n16; 1518 gint i, j; 1519 1520 n16 = 0; 1521 i = 0; 1522 while ((len < 0 || i < len) && str[i]) 1523 { 1524 gunichar wc = str[i]; 1525 1526 if (wc < 0xd800) 1527 n16 += 1; 1528 else if (wc < 0xe000) 1529 { 1530 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1531 _("Invalid sequence in conversion input")); 1532 1533 goto err_out; 1534 } 1535 else if (wc < 0x10000) 1536 n16 += 1; 1537 else if (wc < 0x110000) 1538 n16 += 2; 1539 else 1540 { 1541 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1542 _("Character out of range for UTF-16")); 1543 1544 goto err_out; 1545 } 1546 1547 i++; 1548 } 1549 1550 result = g_new (gunichar2, n16 + 1); 1551 1552 for (i = 0, j = 0; j < n16; i++) 1553 { 1554 gunichar wc = str[i]; 1555 1556 if (wc < 0x10000) 1557 { 1558 result[j++] = wc; 1559 } 1560 else 1561 { 1562 result[j++] = (wc - 0x10000) / 0x400 + 0xd800; 1563 result[j++] = (wc - 0x10000) % 0x400 + 0xdc00; 1564 } 1565 } 1566 result[j] = 0; 1567 1568 if (items_written) 1569 *items_written = n16; 1570 1571 err_out: 1572 if (items_read) 1573 *items_read = i; 1574 1575 return result; 1576 } 1577 1578 #define CONTINUATION_CHAR \ 1579 G_STMT_START { \ 1580 if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \ 1581 goto error; \ 1582 val <<= 6; \ 1583 val |= (*(guchar *)p) & 0x3f; \ 1584 } G_STMT_END 1585 1586 static const gchar * 1587 fast_validate (const char *str) 1588 1589 { 1590 gunichar val = 0; 1591 gunichar min = 0; 1592 const gchar *p; 1593 1594 for (p = str; *p; p++) 1595 { 1596 if (*(guchar *)p < 128) 1597 /* done */; 1598 else 1599 { 1600 const gchar *last; 1601 1602 last = p; 1603 if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ 1604 { 1605 if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) 1606 goto error; 1607 p++; 1608 if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ 1609 goto error; 1610 } 1611 else 1612 { 1613 if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ 1614 { 1615 min = (1 << 11); 1616 val = *(guchar *)p & 0x0f; 1617 goto TWO_REMAINING; 1618 } 1619 else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ 1620 { 1621 min = (1 << 16); 1622 val = *(guchar *)p & 0x07; 1623 } 1624 else 1625 goto error; 1626 1627 p++; 1628 CONTINUATION_CHAR; 1629 TWO_REMAINING: 1630 p++; 1631 CONTINUATION_CHAR; 1632 p++; 1633 CONTINUATION_CHAR; 1634 1635 if (G_UNLIKELY (val < min)) 1636 goto error; 1637 1638 if (G_UNLIKELY (!UNICODE_VALID(val))) 1639 goto error; 1640 } 1641 1642 continue; 1643 1644 error: 1645 return last; 1646 } 1647 } 1648 1649 return p; 1650 } 1651 1652 static const gchar * 1653 fast_validate_len (const char *str, 1654 gssize max_len) 1655 1656 { 1657 gunichar val = 0; 1658 gunichar min = 0; 1659 const gchar *p; 1660 1661 g_assert (max_len >= 0); 1662 1663 for (p = str; ((p - str) < max_len) && *p; p++) 1664 { 1665 if (*(guchar *)p < 128) 1666 /* done */; 1667 else 1668 { 1669 const gchar *last; 1670 1671 last = p; 1672 if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ 1673 { 1674 if (G_UNLIKELY (max_len - (p - str) < 2)) 1675 goto error; 1676 1677 if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) 1678 goto error; 1679 p++; 1680 if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ 1681 goto error; 1682 } 1683 else 1684 { 1685 if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ 1686 { 1687 if (G_UNLIKELY (max_len - (p - str) < 3)) 1688 goto error; 1689 1690 min = (1 << 11); 1691 val = *(guchar *)p & 0x0f; 1692 goto TWO_REMAINING; 1693 } 1694 else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ 1695 { 1696 if (G_UNLIKELY (max_len - (p - str) < 4)) 1697 goto error; 1698 1699 min = (1 << 16); 1700 val = *(guchar *)p & 0x07; 1701 } 1702 else 1703 goto error; 1704 1705 p++; 1706 CONTINUATION_CHAR; 1707 TWO_REMAINING: 1708 p++; 1709 CONTINUATION_CHAR; 1710 p++; 1711 CONTINUATION_CHAR; 1712 1713 if (G_UNLIKELY (val < min)) 1714 goto error; 1715 if (G_UNLIKELY (!UNICODE_VALID(val))) 1716 goto error; 1717 } 1718 1719 continue; 1720 1721 error: 1722 return last; 1723 } 1724 } 1725 1726 return p; 1727 } 1728 1729 /** 1730 * g_utf8_validate: 1731 * @str: a pointer to character data 1732 * @max_len: max bytes to validate, or -1 to go until NUL 1733 * @end: return location for end of valid data 1734 * 1735 * Validates UTF-8 encoded text. @str is the text to validate; 1736 * if @str is nul-terminated, then @max_len can be -1, otherwise 1737 * @max_len should be the number of bytes to validate. 1738 * If @end is non-%NULL, then the end of the valid range 1739 * will be stored there (i.e. the start of the first invalid 1740 * character if some bytes were invalid, or the end of the text 1741 * being validated otherwise). 1742 * 1743 * Note that g_utf8_validate() returns %FALSE if @max_len is 1744 * positive and NUL is met before @max_len bytes have been read. 1745 * 1746 * Returns %TRUE if all of @str was valid. Many GLib and GTK+ 1747 * routines <emphasis>require</emphasis> valid UTF-8 as input; 1748 * so data read from a file or the network should be checked 1749 * with g_utf8_validate() before doing anything else with it. 1750 * 1751 * Return value: %TRUE if the text was valid UTF-8 1752 **/ 1753 gboolean 1754 g_utf8_validate (const char *str, 1755 gssize max_len, 1756 const gchar **end) 1757 1758 { 1759 const gchar *p; 1760 1761 if (max_len < 0) 1762 p = fast_validate (str); 1763 else 1764 p = fast_validate_len (str, max_len); 1765 1766 if (end) 1767 *end = p; 1768 1769 if ((max_len >= 0 && p != str + max_len) || 1770 (max_len < 0 && *p != '\0')) 1771 return FALSE; 1772 else 1773 return TRUE; 1774 } 1775 1776 /** 1777 * g_unichar_validate: 1778 * @ch: a Unicode character 1779 * 1780 * Checks whether @ch is a valid Unicode character. Some possible 1781 * integer values of @ch will not be valid. 0 is considered a valid 1782 * character, though it's normally a string terminator. 1783 * 1784 * Return value: %TRUE if @ch is a valid Unicode character 1785 **/ 1786 gboolean 1787 g_unichar_validate (gunichar ch) 1788 { 1789 return UNICODE_VALID (ch); 1790 } 1791 1792 /** 1793 * g_utf8_strreverse: 1794 * @str: a UTF-8 encoded string 1795 * @len: the maximum length of @str to use, in bytes. If @len < 0, 1796 * then the string is nul-terminated. 1797 * 1798 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text. 1799 * (Use g_utf8_validate() on all text before trying to use UTF-8 1800 * utility functions with it.) 1801 * 1802 * This function is intended for programmatic uses of reversed strings. 1803 * It pays no attention to decomposed characters, combining marks, byte 1804 * order marks, directional indicators (LRM, LRO, etc) and similar 1805 * characters which might need special handling when reversing a string 1806 * for display purposes. 1807 * 1808 * Note that unlike g_strreverse(), this function returns 1809 * newly-allocated memory, which should be freed with g_free() when 1810 * no longer needed. 1811 * 1812 * Returns: a newly-allocated string which is the reverse of @str. 1813 * 1814 * Since: 2.2 1815 */ 1816 gchar * 1817 g_utf8_strreverse (const gchar *str, 1818 gssize len) 1819 { 1820 gchar *r, *result; 1821 const gchar *p; 1822 1823 if (len < 0) 1824 len = strlen (str); 1825 1826 result = g_new (gchar, len + 1); 1827 r = result + len; 1828 p = str; 1829 while (r > result) 1830 { 1831 gchar *m, skip = g_utf8_skip[*(guchar*) p]; 1832 r -= skip; 1833 for (m = r; skip; skip--) 1834 *m++ = *p++; 1835 } 1836 result[len] = 0; 1837 1838 return result; 1839 } 1840 1841 1842 gchar * 1843 _g_utf8_make_valid (const gchar *name) 1844 { 1845 GString *string; 1846 const gchar *remainder, *invalid; 1847 gint remaining_bytes, valid_bytes; 1848 1849 g_return_val_if_fail (name != NULL, NULL); 1850 1851 string = NULL; 1852 remainder = name; 1853 remaining_bytes = strlen (name); 1854 1855 while (remaining_bytes != 0) 1856 { 1857 if (g_utf8_validate (remainder, remaining_bytes, &invalid)) 1858 break; 1859 valid_bytes = invalid - remainder; 1860 1861 if (string == NULL) 1862 string = g_string_sized_new (remaining_bytes); 1863 1864 g_string_append_len (string, remainder, valid_bytes); 1865 /* append U+FFFD REPLACEMENT CHARACTER */ 1866 g_string_append (string, "\357\277\275"); 1867 1868 remaining_bytes -= valid_bytes + 1; 1869 remainder = invalid + 1; 1870 } 1871 1872 if (string == NULL) 1873 return g_strdup (name); 1874 1875 g_string_append (string, remainder); 1876 1877 g_assert (g_utf8_validate (string->str, -1, NULL)); 1878 1879 return g_string_free (string, FALSE); 1880 } 1881 1882 1883 #define __G_UTF8_C__ 1884 #include "galiasdef.c" 1885