1 /* GLIB - Library of useful routines for C programming 2 * 3 * gconvert.c: Convert between character sets using iconv 4 * Copyright Red Hat Inc., 2000 5 * Authors: Havoc Pennington <hp (at) redhat.com>, Owen Taylor <otaylor (at) redhat.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, write to the 19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 20 * Boston, MA 02111-1307, USA. 
21 */ 22 23 #include "config.h" 24 25 #include "glib.h" 26 27 28 #ifndef ANDROID_STUB 29 #ifndef G_OS_WIN32 30 #include <iconv.h> 31 #endif 32 #endif 33 34 #include <errno.h> 35 #include <stdio.h> 36 #include <string.h> 37 #include <stdlib.h> 38 39 #include "gprintfint.h" 40 #include "gthreadprivate.h" 41 #include "gunicode.h" 42 43 #ifdef G_OS_WIN32 44 #include "win_iconv.c" 45 #endif 46 47 #ifdef G_PLATFORM_WIN32 48 #define STRICT 49 #include <windows.h> 50 #undef STRICT 51 #endif 52 53 #include "glibintl.h" 54 55 #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H) 56 #error GNU libiconv in use but included iconv.h not from libiconv 57 #endif 58 #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H) 59 #error GNU libiconv not in use but included iconv.h is from libiconv 60 #endif 61 62 #include "galias.h" 63 64 typedef void iconv_t; 65 66 GQuark 67 g_convert_error_quark (void) 68 { 69 return g_quark_from_static_string ("g_convert_error"); 70 } 71 72 static gboolean 73 try_conversion (const char *to_codeset, 74 const char *from_codeset, 75 iconv_t *cd) 76 { 77 #ifndef ANDROID_STUB 78 *cd = iconv_open (to_codeset, from_codeset); 79 80 if (*cd == (iconv_t)-1 && errno == EINVAL) 81 return FALSE; 82 else 83 return TRUE; 84 #else 85 return FALSE; 86 #endif 87 } 88 89 static gboolean 90 try_to_aliases (const char **to_aliases, 91 const char *from_codeset, 92 iconv_t *cd) 93 { 94 if (to_aliases) 95 { 96 const char **p = to_aliases; 97 while (*p) 98 { 99 if (try_conversion (*p, from_codeset, cd)) 100 return TRUE; 101 102 p++; 103 } 104 } 105 106 return FALSE; 107 } 108 109 #ifndef ANDROID_STUB 110 G_GNUC_INTERNAL extern const char ** 111 _g_charset_get_aliases (const char *canonical_name); 112 #endif 113 114 /** 115 * g_iconv_open: 116 * @to_codeset: destination codeset 117 * @from_codeset: source codeset 118 * 119 * Same as the standard UNIX routine iconv_open(), but 120 * may be implemented via libiconv on UNIX flavors that lack 121 * a native implementation. 
122 * 123 * GLib provides g_convert() and g_locale_to_utf8() which are likely 124 * more convenient than the raw iconv wrappers. 125 * 126 * Return value: a "conversion descriptor", or (GIConv)-1 if 127 * opening the converter failed. 128 **/ 129 GIConv 130 g_iconv_open (const gchar *to_codeset, 131 const gchar *from_codeset) 132 { 133 #ifndef ANDROID_STUB 134 iconv_t cd; 135 136 if (!try_conversion (to_codeset, from_codeset, &cd)) 137 { 138 const char **to_aliases = _g_charset_get_aliases (to_codeset); 139 const char **from_aliases = _g_charset_get_aliases (from_codeset); 140 141 if (from_aliases) 142 { 143 const char **p = from_aliases; 144 while (*p) 145 { 146 if (try_conversion (to_codeset, *p, &cd)) 147 goto out; 148 149 if (try_to_aliases (to_aliases, *p, &cd)) 150 goto out; 151 152 p++; 153 } 154 } 155 156 if (try_to_aliases (to_aliases, from_codeset, &cd)) 157 goto out; 158 } 159 160 out: 161 return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd; 162 #else 163 return (GIConv) -1; 164 #endif 165 } 166 167 /** 168 * g_iconv: 169 * @converter: conversion descriptor from g_iconv_open() 170 * @inbuf: bytes to convert 171 * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf 172 * @outbuf: converted output bytes 173 * @outbytes_left: inout parameter, bytes available to fill in @outbuf 174 * 175 * Same as the standard UNIX routine iconv(), but 176 * may be implemented via libiconv on UNIX flavors that lack 177 * a native implementation. 178 * 179 * GLib provides g_convert() and g_locale_to_utf8() which are likely 180 * more convenient than the raw iconv wrappers. 
181 * 182 * Return value: count of non-reversible conversions, or -1 on error 183 **/ 184 gsize 185 g_iconv (GIConv converter, 186 gchar **inbuf, 187 gsize *inbytes_left, 188 gchar **outbuf, 189 gsize *outbytes_left) 190 { 191 #ifndef ANDROID_STUB 192 iconv_t cd = (iconv_t)converter; 193 194 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left); 195 #else 196 return -1; 197 #endif 198 } 199 200 /** 201 * g_iconv_close: 202 * @converter: a conversion descriptor from g_iconv_open() 203 * 204 * Same as the standard UNIX routine iconv_close(), but 205 * may be implemented via libiconv on UNIX flavors that lack 206 * a native implementation. Should be called to clean up 207 * the conversion descriptor from g_iconv_open() when 208 * you are done converting things. 209 * 210 * GLib provides g_convert() and g_locale_to_utf8() which are likely 211 * more convenient than the raw iconv wrappers. 212 * 213 * Return value: -1 on error, 0 on success 214 **/ 215 gint 216 g_iconv_close (GIConv converter) 217 { 218 #ifndef ANDROID_STUB 219 iconv_t cd = (iconv_t)converter; 220 221 return iconv_close (cd); 222 #else 223 return -1; 224 #endif 225 } 226 227 228 #ifdef NEED_ICONV_CACHE 229 230 #define ICONV_CACHE_SIZE (16) 231 232 struct _iconv_cache_bucket { 233 gchar *key; 234 guint32 refcount; 235 gboolean used; 236 GIConv cd; 237 }; 238 239 static GList *iconv_cache_list; 240 static GHashTable *iconv_cache; 241 static GHashTable *iconv_open_hash; 242 static guint iconv_cache_size = 0; 243 G_LOCK_DEFINE_STATIC (iconv_cache_lock); 244 245 /* caller *must* hold the iconv_cache_lock */ 246 static void 247 iconv_cache_init (void) 248 { 249 static gboolean initialized = FALSE; 250 251 if (initialized) 252 return; 253 254 iconv_cache_list = NULL; 255 iconv_cache = g_hash_table_new (g_str_hash, g_str_equal); 256 iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal); 257 258 initialized = TRUE; 259 } 260 261 262 /* 263 * iconv_cache_bucket_new: 264 * @key: cache key 265 
* @cd: iconv descriptor
 *
 * Creates a new cache bucket, inserts it into the cache and
 * increments the cache size.
 *
 * This assumes ownership of @key.
 *
 * Returns a pointer to the newly allocated cache bucket.
 **/
static struct _iconv_cache_bucket *
iconv_cache_bucket_new (gchar *key, GIConv cd)
{
  struct _iconv_cache_bucket *entry = g_new (struct _iconv_cache_bucket, 1);

  /* A fresh bucket starts out with its descriptor already handed to
   * the caller: one reference, marked as in use.
   */
  entry->key = key;
  entry->cd = cd;
  entry->refcount = 1;
  entry->used = TRUE;

  /* FIXME: if we sorted the list so items with few refcounts were
     first, then we could expire them faster in iconv_cache_expire_unused () */
  iconv_cache_list = g_list_prepend (iconv_cache_list, entry);

  g_hash_table_insert (iconv_cache, entry->key, entry);

  iconv_cache_size++;

  return entry;
}


/*
 * iconv_cache_bucket_expire:
 * @node: cache bucket's node
 * @bucket: cache bucket
 *
 * Expires a single cache bucket @bucket. This should only ever be
 * called on a bucket that currently has no used iconv descriptors
 * open.
 *
 * @node is not a required argument. If @node is not supplied, we
 * search for it ourselves.
308 **/ 309 static void 310 iconv_cache_bucket_expire (GList *node, struct _iconv_cache_bucket *bucket) 311 { 312 g_hash_table_remove (iconv_cache, bucket->key); 313 314 if (node == NULL) 315 node = g_list_find (iconv_cache_list, bucket); 316 317 g_assert (node != NULL); 318 319 if (node->prev) 320 { 321 node->prev->next = node->next; 322 if (node->next) 323 node->next->prev = node->prev; 324 } 325 else 326 { 327 iconv_cache_list = node->next; 328 if (node->next) 329 node->next->prev = NULL; 330 } 331 332 g_list_free_1 (node); 333 334 g_free (bucket->key); 335 g_iconv_close (bucket->cd); 336 g_free (bucket); 337 338 iconv_cache_size--; 339 } 340 341 342 /* 343 * iconv_cache_expire_unused: 344 * 345 * Expires as many unused cache buckets as it needs to in order to get 346 * the total number of buckets < ICONV_CACHE_SIZE. 347 **/ 348 static void 349 iconv_cache_expire_unused (void) 350 { 351 struct _iconv_cache_bucket *bucket; 352 GList *node, *next; 353 354 node = iconv_cache_list; 355 while (node && iconv_cache_size >= ICONV_CACHE_SIZE) 356 { 357 next = node->next; 358 359 bucket = node->data; 360 if (bucket->refcount == 0) 361 iconv_cache_bucket_expire (node, bucket); 362 363 node = next; 364 } 365 } 366 367 static GIConv 368 open_converter (const gchar *to_codeset, 369 const gchar *from_codeset, 370 GError **error) 371 { 372 struct _iconv_cache_bucket *bucket; 373 gchar *key, *dyn_key, auto_key[80]; 374 GIConv cd; 375 gsize len_from_codeset, len_to_codeset; 376 377 /* create our key */ 378 len_from_codeset = strlen (from_codeset); 379 len_to_codeset = strlen (to_codeset); 380 if (len_from_codeset + len_to_codeset + 2 < sizeof (auto_key)) 381 { 382 key = auto_key; 383 dyn_key = NULL; 384 } 385 else 386 key = dyn_key = g_malloc (len_from_codeset + len_to_codeset + 2); 387 memcpy (key, from_codeset, len_from_codeset); 388 key[len_from_codeset] = ':'; 389 strcpy (key + len_from_codeset + 1, to_codeset); 390 391 G_LOCK (iconv_cache_lock); 392 393 /* make sure the 
cache has been initialized */ 394 iconv_cache_init (); 395 396 bucket = g_hash_table_lookup (iconv_cache, key); 397 if (bucket) 398 { 399 g_free (dyn_key); 400 401 if (bucket->used) 402 { 403 cd = g_iconv_open (to_codeset, from_codeset); 404 if (cd == (GIConv) -1) 405 goto error; 406 } 407 else 408 { 409 /* Apparently iconv on Solaris <= 7 segfaults if you pass in 410 * NULL for anything but inbuf; work around that. (NULL outbuf 411 * or NULL *outbuf is allowed by Unix98.) 412 */ 413 gsize inbytes_left = 0; 414 gchar *outbuf = NULL; 415 gsize outbytes_left = 0; 416 417 cd = bucket->cd; 418 bucket->used = TRUE; 419 420 /* reset the descriptor */ 421 g_iconv (cd, NULL, &inbytes_left, &outbuf, &outbytes_left); 422 } 423 424 bucket->refcount++; 425 } 426 else 427 { 428 cd = g_iconv_open (to_codeset, from_codeset); 429 if (cd == (GIConv) -1) 430 { 431 g_free (dyn_key); 432 goto error; 433 } 434 435 iconv_cache_expire_unused (); 436 437 bucket = iconv_cache_bucket_new (dyn_key ? dyn_key : g_strdup (key), cd); 438 } 439 440 g_hash_table_insert (iconv_open_hash, cd, bucket->key); 441 442 G_UNLOCK (iconv_cache_lock); 443 444 return cd; 445 446 error: 447 448 G_UNLOCK (iconv_cache_lock); 449 450 /* Something went wrong. 
*/ 451 if (error) 452 { 453 if (errno == EINVAL) 454 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, 455 _("Conversion from character set '%s' to '%s' is not supported"), 456 from_codeset, to_codeset); 457 else 458 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, 459 _("Could not open converter from '%s' to '%s'"), 460 from_codeset, to_codeset); 461 } 462 463 return cd; 464 } 465 466 static int 467 close_converter (GIConv converter) 468 { 469 struct _iconv_cache_bucket *bucket; 470 const gchar *key; 471 GIConv cd; 472 473 cd = converter; 474 475 if (cd == (GIConv) -1) 476 return 0; 477 478 G_LOCK (iconv_cache_lock); 479 480 key = g_hash_table_lookup (iconv_open_hash, cd); 481 if (key) 482 { 483 g_hash_table_remove (iconv_open_hash, cd); 484 485 bucket = g_hash_table_lookup (iconv_cache, key); 486 g_assert (bucket); 487 488 bucket->refcount--; 489 490 if (cd == bucket->cd) 491 bucket->used = FALSE; 492 else 493 g_iconv_close (cd); 494 495 if (!bucket->refcount && iconv_cache_size > ICONV_CACHE_SIZE) 496 { 497 /* expire this cache bucket */ 498 iconv_cache_bucket_expire (NULL, bucket); 499 } 500 } 501 else 502 { 503 G_UNLOCK (iconv_cache_lock); 504 505 g_warning ("This iconv context wasn't opened using open_converter"); 506 507 return g_iconv_close (converter); 508 } 509 510 G_UNLOCK (iconv_cache_lock); 511 512 return 0; 513 } 514 515 #else /* !NEED_ICONV_CACHE */ 516 517 static GIConv 518 open_converter (const gchar *to_codeset, 519 const gchar *from_codeset, 520 GError **error) 521 { 522 GIConv cd; 523 524 cd = g_iconv_open (to_codeset, from_codeset); 525 526 if (cd == (GIConv) -1) 527 { 528 /* Something went wrong. 
*/ 529 if (error) 530 { 531 if (errno == EINVAL) 532 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, 533 _("Conversion from character set '%s' to '%s' is not supported"), 534 from_codeset, to_codeset); 535 else 536 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, 537 _("Could not open converter from '%s' to '%s'"), 538 from_codeset, to_codeset); 539 } 540 } 541 542 return cd; 543 } 544 545 static int 546 close_converter (GIConv cd) 547 { 548 if (cd == (GIConv) -1) 549 return 0; 550 551 return g_iconv_close (cd); 552 } 553 554 #endif /* NEED_ICONV_CACHE */ 555 556 /** 557 * g_convert_with_iconv: 558 * @str: the string to convert 559 * @len: the length of the string, or -1 if the string is 560 * nul-terminated<footnoteref linkend="nul-unsafe"/>. 561 * @converter: conversion descriptor from g_iconv_open() 562 * @bytes_read: location to store the number of bytes in the 563 * input string that were successfully converted, or %NULL. 564 * Even if the conversion was successful, this may be 565 * less than @len if there were partial characters 566 * at the end of the input. If the error 567 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value 568 * stored will the byte offset after the last valid 569 * input sequence. 570 * @bytes_written: the number of bytes stored in the output buffer (not 571 * including the terminating nul). 572 * @error: location to store the error occuring, or %NULL to ignore 573 * errors. Any of the errors in #GConvertError may occur. 574 * 575 * Converts a string from one character set to another. 576 * 577 * Note that you should use g_iconv() for streaming 578 * conversions<footnote id="streaming-state"> 579 * <para> 580 * Despite the fact that @byes_read can return information about partial 581 * characters, the <literal>g_convert_...</literal> functions 582 * are not generally suitable for streaming. 
If the underlying converter 583 * being used maintains internal state, then this won't be preserved 584 * across successive calls to g_convert(), g_convert_with_iconv() or 585 * g_convert_with_fallback(). (An example of this is the GNU C converter 586 * for CP1255 which does not emit a base character until it knows that 587 * the next character is not a mark that could combine with the base 588 * character.) 589 * </para> 590 * </footnote>. 591 * 592 * Return value: If the conversion was successful, a newly allocated 593 * nul-terminated string, which must be freed with 594 * g_free(). Otherwise %NULL and @error will be set. 595 **/ 596 gchar* 597 g_convert_with_iconv (const gchar *str, 598 gssize len, 599 GIConv converter, 600 gsize *bytes_read, 601 gsize *bytes_written, 602 GError **error) 603 { 604 gchar *dest; 605 gchar *outp; 606 const gchar *p; 607 gsize inbytes_remaining; 608 gsize outbytes_remaining; 609 gsize err; 610 gsize outbuf_size; 611 gboolean have_error = FALSE; 612 gboolean done = FALSE; 613 gboolean reset = FALSE; 614 615 g_return_val_if_fail (converter != (GIConv) -1, NULL); 616 617 if (len < 0) 618 len = strlen (str); 619 620 p = str; 621 inbytes_remaining = len; 622 outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ 623 624 outbytes_remaining = outbuf_size - 1; /* -1 for nul */ 625 outp = dest = g_malloc (outbuf_size); 626 627 while (!done && !have_error) 628 { 629 if (reset) 630 err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining); 631 else 632 err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining); 633 634 if (err == (gsize) -1) 635 { 636 switch (errno) 637 { 638 case EINVAL: 639 /* Incomplete text, do not report an error */ 640 done = TRUE; 641 break; 642 case E2BIG: 643 { 644 gsize used = outp - dest; 645 646 outbuf_size *= 2; 647 dest = g_realloc (dest, outbuf_size); 648 649 outp = dest + used; 650 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ 651 } 652 
break; 653 case EILSEQ: 654 if (error) 655 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 656 _("Invalid byte sequence in conversion input")); 657 have_error = TRUE; 658 break; 659 default: 660 if (error) 661 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, 662 _("Error during conversion: %s"), 663 g_strerror (errno)); 664 have_error = TRUE; 665 break; 666 } 667 } 668 else 669 { 670 if (!reset) 671 { 672 /* call g_iconv with NULL inbuf to cleanup shift state */ 673 reset = TRUE; 674 inbytes_remaining = 0; 675 } 676 else 677 done = TRUE; 678 } 679 } 680 681 *outp = '\0'; 682 683 if (bytes_read) 684 *bytes_read = p - str; 685 else 686 { 687 if ((p - str) != len) 688 { 689 if (!have_error) 690 { 691 if (error) 692 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, 693 _("Partial character sequence at end of input")); 694 have_error = TRUE; 695 } 696 } 697 } 698 699 if (bytes_written) 700 *bytes_written = outp - dest; /* Doesn't include '\0' */ 701 702 if (have_error) 703 { 704 g_free (dest); 705 return NULL; 706 } 707 else 708 return dest; 709 } 710 711 /** 712 * g_convert: 713 * @str: the string to convert 714 * @len: the length of the string, or -1 if the string is 715 * nul-terminated<footnote id="nul-unsafe"> 716 <para> 717 Note that some encodings may allow nul bytes to 718 occur inside strings. In that case, using -1 for 719 the @len parameter is unsafe. 720 </para> 721 </footnote>. 722 * @to_codeset: name of character set into which to convert @str 723 * @from_codeset: character set of @str. 724 * @bytes_read: location to store the number of bytes in the 725 * input string that were successfully converted, or %NULL. 726 * Even if the conversion was successful, this may be 727 * less than @len if there were partial characters 728 * at the end of the input. 
If the error 729 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value 730 * stored will the byte offset after the last valid 731 * input sequence. 732 * @bytes_written: the number of bytes stored in the output buffer (not 733 * including the terminating nul). 734 * @error: location to store the error occuring, or %NULL to ignore 735 * errors. Any of the errors in #GConvertError may occur. 736 * 737 * Converts a string from one character set to another. 738 * 739 * Note that you should use g_iconv() for streaming 740 * conversions<footnoteref linkend="streaming-state"/>. 741 * 742 * Return value: If the conversion was successful, a newly allocated 743 * nul-terminated string, which must be freed with 744 * g_free(). Otherwise %NULL and @error will be set. 745 **/ 746 gchar* 747 g_convert (const gchar *str, 748 gssize len, 749 const gchar *to_codeset, 750 const gchar *from_codeset, 751 gsize *bytes_read, 752 gsize *bytes_written, 753 GError **error) 754 { 755 gchar *res; 756 GIConv cd; 757 758 g_return_val_if_fail (str != NULL, NULL); 759 g_return_val_if_fail (to_codeset != NULL, NULL); 760 g_return_val_if_fail (from_codeset != NULL, NULL); 761 762 cd = open_converter (to_codeset, from_codeset, error); 763 764 if (cd == (GIConv) -1) 765 { 766 if (bytes_read) 767 *bytes_read = 0; 768 769 if (bytes_written) 770 *bytes_written = 0; 771 772 return NULL; 773 } 774 775 res = g_convert_with_iconv (str, len, cd, 776 bytes_read, bytes_written, 777 error); 778 779 close_converter (cd); 780 781 return res; 782 } 783 784 /** 785 * g_convert_with_fallback: 786 * @str: the string to convert 787 * @len: the length of the string, or -1 if the string is 788 * nul-terminated<footnoteref linkend="nul-unsafe"/>. 789 * @to_codeset: name of character set into which to convert @str 790 * @from_codeset: character set of @str. 791 * @fallback: UTF-8 string to use in place of character not 792 * present in the target encoding. (The string must be 793 * representable in the target encoding). 
*                If %NULL, characters not in the target encoding will
 *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
 * @bytes_read:   location to store the number of bytes in the
 *                input string that were successfully converted, or %NULL.
 *                Even if the conversion was successful, this may be
 *                less than @len if there were partial characters
 *                at the end of the input.
 * @bytes_written: the number of bytes stored in the output buffer (not
 *                including the terminating nul).
 * @error:        location to store the error occurring, or %NULL to ignore
 *                errors. Any of the errors in #GConvertError may occur.
 *
 * Converts a string from one character set to another, possibly
 * including fallback sequences for characters not representable
 * in the output. Note that it is not guaranteed that the specification
 * for the fallback sequences in @fallback will be honored. Some
 * systems may do an approximate conversion from @from_codeset
 * to @to_codeset in their iconv() functions,
 * in which case GLib will simply return that approximate conversion.
 *
 * Note that you should use g_iconv() for streaming
 * conversions<footnoteref linkend="streaming-state"/>.
 *
 * Return value: If the conversion was successful, a newly allocated
 *               nul-terminated string, which must be freed with
 *               g_free(). Otherwise %NULL and @error will be set.
820 **/ 821 gchar* 822 g_convert_with_fallback (const gchar *str, 823 gssize len, 824 const gchar *to_codeset, 825 const gchar *from_codeset, 826 gchar *fallback, 827 gsize *bytes_read, 828 gsize *bytes_written, 829 GError **error) 830 { 831 gchar *utf8; 832 gchar *dest; 833 gchar *outp; 834 const gchar *insert_str = NULL; 835 const gchar *p; 836 gsize inbytes_remaining; 837 const gchar *save_p = NULL; 838 gsize save_inbytes = 0; 839 gsize outbytes_remaining; 840 gsize err; 841 GIConv cd; 842 gsize outbuf_size; 843 gboolean have_error = FALSE; 844 gboolean done = FALSE; 845 846 GError *local_error = NULL; 847 848 g_return_val_if_fail (str != NULL, NULL); 849 g_return_val_if_fail (to_codeset != NULL, NULL); 850 g_return_val_if_fail (from_codeset != NULL, NULL); 851 852 if (len < 0) 853 len = strlen (str); 854 855 /* Try an exact conversion; we only proceed if this fails 856 * due to an illegal sequence in the input string. 857 */ 858 dest = g_convert (str, len, to_codeset, from_codeset, 859 bytes_read, bytes_written, &local_error); 860 if (!local_error) 861 return dest; 862 863 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) 864 { 865 g_propagate_error (error, local_error); 866 return NULL; 867 } 868 else 869 g_error_free (local_error); 870 871 local_error = NULL; 872 873 /* No go; to proceed, we need a converter from "UTF-8" to 874 * to_codeset, and the string as UTF-8. 875 */ 876 cd = open_converter (to_codeset, "UTF-8", error); 877 if (cd == (GIConv) -1) 878 { 879 if (bytes_read) 880 *bytes_read = 0; 881 882 if (bytes_written) 883 *bytes_written = 0; 884 885 return NULL; 886 } 887 888 utf8 = g_convert (str, len, "UTF-8", from_codeset, 889 bytes_read, &inbytes_remaining, error); 890 if (!utf8) 891 { 892 close_converter (cd); 893 if (bytes_written) 894 *bytes_written = 0; 895 return NULL; 896 } 897 898 /* Now the heart of the code. 
We loop through the UTF-8 string, and 899 * whenever we hit an offending character, we form fallback, convert 900 * the fallback to the target codeset, and then go back to 901 * converting the original string after finishing with the fallback. 902 * 903 * The variables save_p and save_inbytes store the input state 904 * for the original string while we are converting the fallback 905 */ 906 p = utf8; 907 908 outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ 909 outbytes_remaining = outbuf_size - 1; /* -1 for nul */ 910 outp = dest = g_malloc (outbuf_size); 911 912 while (!done && !have_error) 913 { 914 gsize inbytes_tmp = inbytes_remaining; 915 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining); 916 inbytes_remaining = inbytes_tmp; 917 918 if (err == (gsize) -1) 919 { 920 switch (errno) 921 { 922 case EINVAL: 923 g_assert_not_reached(); 924 break; 925 case E2BIG: 926 { 927 gsize used = outp - dest; 928 929 outbuf_size *= 2; 930 dest = g_realloc (dest, outbuf_size); 931 932 outp = dest + used; 933 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ 934 935 break; 936 } 937 case EILSEQ: 938 if (save_p) 939 { 940 /* Error converting fallback string - fatal 941 */ 942 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 943 _("Cannot convert fallback '%s' to codeset '%s'"), 944 insert_str, to_codeset); 945 have_error = TRUE; 946 break; 947 } 948 else if (p) 949 { 950 if (!fallback) 951 { 952 gunichar ch = g_utf8_get_char (p); 953 insert_str = g_strdup_printf (ch < 0x10000 ? 
"\\u%04x" : "\\U%08x", 954 ch); 955 } 956 else 957 insert_str = fallback; 958 959 save_p = g_utf8_next_char (p); 960 save_inbytes = inbytes_remaining - (save_p - p); 961 p = insert_str; 962 inbytes_remaining = strlen (p); 963 break; 964 } 965 /* fall thru if p is NULL */ 966 default: 967 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, 968 _("Error during conversion: %s"), 969 g_strerror (errno)); 970 have_error = TRUE; 971 break; 972 } 973 } 974 else 975 { 976 if (save_p) 977 { 978 if (!fallback) 979 g_free ((gchar *)insert_str); 980 p = save_p; 981 inbytes_remaining = save_inbytes; 982 save_p = NULL; 983 } 984 else if (p) 985 { 986 /* call g_iconv with NULL inbuf to cleanup shift state */ 987 p = NULL; 988 inbytes_remaining = 0; 989 } 990 else 991 done = TRUE; 992 } 993 } 994 995 /* Cleanup 996 */ 997 *outp = '\0'; 998 999 close_converter (cd); 1000 1001 if (bytes_written) 1002 *bytes_written = outp - dest; /* Doesn't include '\0' */ 1003 1004 g_free (utf8); 1005 1006 if (have_error) 1007 { 1008 if (save_p && !fallback) 1009 g_free ((gchar *)insert_str); 1010 g_free (dest); 1011 return NULL; 1012 } 1013 else 1014 return dest; 1015 } 1016 1017 /* 1018 * g_locale_to_utf8 1019 * 1020 * 1021 */ 1022 1023 static gchar * 1024 strdup_len (const gchar *string, 1025 gssize len, 1026 gsize *bytes_written, 1027 gsize *bytes_read, 1028 GError **error) 1029 1030 { 1031 gsize real_len; 1032 1033 if (!g_utf8_validate (string, len, NULL)) 1034 { 1035 if (bytes_read) 1036 *bytes_read = 0; 1037 if (bytes_written) 1038 *bytes_written = 0; 1039 1040 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1041 _("Invalid byte sequence in conversion input")); 1042 return NULL; 1043 } 1044 1045 if (len < 0) 1046 real_len = strlen (string); 1047 else 1048 { 1049 real_len = 0; 1050 1051 while (real_len < len && string[real_len]) 1052 real_len++; 1053 } 1054 1055 if (bytes_read) 1056 *bytes_read = real_len; 1057 if (bytes_written) 1058 *bytes_written = 
real_len; 1059 1060 return g_strndup (string, real_len); 1061 } 1062 1063 /** 1064 * g_locale_to_utf8: 1065 * @opsysstring: a string in the encoding of the current locale. On Windows 1066 * this means the system codepage. 1067 * @len: the length of the string, or -1 if the string is 1068 * nul-terminated<footnoteref linkend="nul-unsafe"/>. 1069 * @bytes_read: location to store the number of bytes in the 1070 * input string that were successfully converted, or %NULL. 1071 * Even if the conversion was successful, this may be 1072 * less than @len if there were partial characters 1073 * at the end of the input. If the error 1074 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value 1075 * stored will the byte offset after the last valid 1076 * input sequence. 1077 * @bytes_written: the number of bytes stored in the output buffer (not 1078 * including the terminating nul). 1079 * @error: location to store the error occuring, or %NULL to ignore 1080 * errors. Any of the errors in #GConvertError may occur. 1081 * 1082 * Converts a string which is in the encoding used for strings by 1083 * the C runtime (usually the same as that used by the operating 1084 * system) in the <link linkend="setlocale">current locale</link> into a 1085 * UTF-8 string. 1086 * 1087 * Return value: The converted string, or %NULL on an error. 1088 **/ 1089 gchar * 1090 g_locale_to_utf8 (const gchar *opsysstring, 1091 gssize len, 1092 gsize *bytes_read, 1093 gsize *bytes_written, 1094 GError **error) 1095 { 1096 const char *charset; 1097 1098 if (g_get_charset (&charset)) 1099 return strdup_len (opsysstring, len, bytes_read, bytes_written, error); 1100 else 1101 return g_convert (opsysstring, len, 1102 "UTF-8", charset, bytes_read, bytes_written, error); 1103 } 1104 1105 /** 1106 * g_locale_from_utf8: 1107 * @utf8string: a UTF-8 encoded string 1108 * @len: the length of the string, or -1 if the string is 1109 * nul-terminated<footnoteref linkend="nul-unsafe"/>. 
* @bytes_read:    location to store the number of bytes in the
 *                 input string that were successfully converted, or %NULL.
 *                 Even if the conversion was successful, this may be
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
 *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will be the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: the number of bytes stored in the output buffer (not
 *                 including the terminating nul).
 * @error:         location to store the error occurring, or %NULL to ignore
 *                 errors. Any of the errors in #GConvertError may occur.
 *
 * Converts a string from UTF-8 to the encoding used for strings by
 * the C runtime (usually the same as that used by the operating
 * system) in the <link linkend="setlocale">current locale</link>. On
 * Windows this means the system codepage.
 *
 * Return value: The converted string, or %NULL on an error.
1129 **/ 1130 gchar * 1131 g_locale_from_utf8 (const gchar *utf8string, 1132 gssize len, 1133 gsize *bytes_read, 1134 gsize *bytes_written, 1135 GError **error) 1136 { 1137 const gchar *charset; 1138 1139 if (g_get_charset (&charset)) 1140 return strdup_len (utf8string, len, bytes_read, bytes_written, error); 1141 else 1142 return g_convert (utf8string, len, 1143 charset, "UTF-8", bytes_read, bytes_written, error); 1144 } 1145 1146 #ifndef G_PLATFORM_WIN32 1147 1148 typedef struct _GFilenameCharsetCache GFilenameCharsetCache; 1149 1150 struct _GFilenameCharsetCache { 1151 gboolean is_utf8; 1152 gchar *charset; 1153 gchar **filename_charsets; 1154 }; 1155 1156 static void 1157 filename_charset_cache_free (gpointer data) 1158 { 1159 GFilenameCharsetCache *cache = data; 1160 g_free (cache->charset); 1161 g_strfreev (cache->filename_charsets); 1162 g_free (cache); 1163 } 1164 1165 /** 1166 * g_get_filename_charsets: 1167 * @charsets: return location for the %NULL-terminated list of encoding names 1168 * 1169 * Determines the preferred character sets used for filenames. 1170 * The first character set from the @charsets is the filename encoding, the 1171 * subsequent character sets are used when trying to generate a displayable 1172 * representation of a filename, see g_filename_display_name(). 1173 * 1174 * On Unix, the character sets are determined by consulting the 1175 * environment variables <envar>G_FILENAME_ENCODING</envar> and 1176 * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set 1177 * used in the GLib API is always UTF-8 and said environment variables 1178 * have no effect. 1179 * 1180 * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list 1181 * of character set names. The special token "@locale" is taken to 1182 * mean the character set for the <link linkend="setlocale">current 1183 * locale</link>. 
If <envar>G_FILENAME_ENCODING</envar> is not set, but 1184 * <envar>G_BROKEN_FILENAMES</envar> is, the character set of the current 1185 * locale is taken as the filename encoding. If neither environment variable 1186 * is set, UTF-8 is taken as the filename encoding, but the character 1187 * set of the current locale is also put in the list of encodings. 1188 * 1189 * The returned @charsets belong to GLib and must not be freed. 1190 * 1191 * Note that on Unix, regardless of the locale character set or 1192 * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present 1193 * on a system might be in any random encoding or just gibberish. 1194 * 1195 * Return value: %TRUE if the filename encoding is UTF-8. 1196 * 1197 * Since: 2.6 1198 */ 1199 gboolean 1200 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) 1201 { 1202 static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; 1203 GFilenameCharsetCache *cache = g_static_private_get (&cache_private); 1204 const gchar *charset; 1205 1206 if (!cache) 1207 { 1208 cache = g_new0 (GFilenameCharsetCache, 1); 1209 g_static_private_set (&cache_private, cache, filename_charset_cache_free); 1210 } 1211 1212 g_get_charset (&charset); 1213 1214 if (!(cache->charset && strcmp (cache->charset, charset) == 0)) 1215 { 1216 const gchar *new_charset; 1217 gchar *p; 1218 gint i; 1219 1220 g_free (cache->charset); 1221 g_strfreev (cache->filename_charsets); 1222 cache->charset = g_strdup (charset); 1223 1224 p = getenv ("G_FILENAME_ENCODING"); 1225 if (p != NULL && p[0] != '\0') 1226 { 1227 cache->filename_charsets = g_strsplit (p, ",", 0); 1228 cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0); 1229 1230 for (i = 0; cache->filename_charsets[i]; i++) 1231 { 1232 if (strcmp ("@locale", cache->filename_charsets[i]) == 0) 1233 { 1234 g_get_charset (&new_charset); 1235 g_free (cache->filename_charsets[i]); 1236 cache->filename_charsets[i] = g_strdup (new_charset); 1237 } 1238 } 1239 } 1240 
else if (getenv ("G_BROKEN_FILENAMES") != NULL) 1241 { 1242 cache->filename_charsets = g_new0 (gchar *, 2); 1243 cache->is_utf8 = g_get_charset (&new_charset); 1244 cache->filename_charsets[0] = g_strdup (new_charset); 1245 } 1246 else 1247 { 1248 cache->filename_charsets = g_new0 (gchar *, 3); 1249 cache->is_utf8 = TRUE; 1250 cache->filename_charsets[0] = g_strdup ("UTF-8"); 1251 if (!g_get_charset (&new_charset)) 1252 cache->filename_charsets[1] = g_strdup (new_charset); 1253 } 1254 } 1255 1256 if (filename_charsets) 1257 *filename_charsets = (const gchar **)cache->filename_charsets; 1258 1259 return cache->is_utf8; 1260 } 1261 1262 #else /* G_PLATFORM_WIN32 */ 1263 1264 gboolean 1265 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) 1266 { 1267 static const gchar *charsets[] = { 1268 "UTF-8", 1269 NULL 1270 }; 1271 1272 #ifdef G_OS_WIN32 1273 /* On Windows GLib pretends that the filename charset is UTF-8 */ 1274 if (filename_charsets) 1275 *filename_charsets = charsets; 1276 1277 return TRUE; 1278 #else 1279 gboolean result; 1280 1281 /* Cygwin works like before */ 1282 result = g_get_charset (&(charsets[0])); 1283 1284 if (filename_charsets) 1285 *filename_charsets = charsets; 1286 1287 return result; 1288 #endif 1289 } 1290 1291 #endif /* G_PLATFORM_WIN32 */ 1292 1293 static gboolean 1294 get_filename_charset (const gchar **filename_charset) 1295 { 1296 const gchar **charsets; 1297 gboolean is_utf8; 1298 1299 is_utf8 = g_get_filename_charsets (&charsets); 1300 1301 if (filename_charset) 1302 *filename_charset = charsets[0]; 1303 1304 return is_utf8; 1305 } 1306 1307 /* This is called from g_thread_init(). It's used to 1308 * initialize some static data in a threadsafe way. 
1309 */ 1310 void 1311 _g_convert_thread_init (void) 1312 { 1313 const gchar **dummy; 1314 (void) g_get_filename_charsets (&dummy); 1315 } 1316 1317 /** 1318 * g_filename_to_utf8: 1319 * @opsysstring: a string in the encoding for filenames 1320 * @len: the length of the string, or -1 if the string is 1321 * nul-terminated<footnoteref linkend="nul-unsafe"/>. 1322 * @bytes_read: location to store the number of bytes in the 1323 * input string that were successfully converted, or %NULL. 1324 * Even if the conversion was successful, this may be 1325 * less than @len if there were partial characters 1326 * at the end of the input. If the error 1327 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value 1328 * stored will the byte offset after the last valid 1329 * input sequence. 1330 * @bytes_written: the number of bytes stored in the output buffer (not 1331 * including the terminating nul). 1332 * @error: location to store the error occuring, or %NULL to ignore 1333 * errors. Any of the errors in #GConvertError may occur. 1334 * 1335 * Converts a string which is in the encoding used by GLib for 1336 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8 1337 * for filenames; on other platforms, this function indirectly depends on 1338 * the <link linkend="setlocale">current locale</link>. 1339 * 1340 * Return value: The converted string, or %NULL on an error. 1341 **/ 1342 gchar* 1343 g_filename_to_utf8 (const gchar *opsysstring, 1344 gssize len, 1345 gsize *bytes_read, 1346 gsize *bytes_written, 1347 GError **error) 1348 { 1349 const gchar *charset; 1350 1351 if (get_filename_charset (&charset)) 1352 return strdup_len (opsysstring, len, bytes_read, bytes_written, error); 1353 else 1354 return g_convert (opsysstring, len, 1355 "UTF-8", charset, bytes_read, bytes_written, error); 1356 } 1357 1358 #if defined (G_OS_WIN32) && !defined (_WIN64) 1359 1360 #undef g_filename_to_utf8 1361 1362 /* Binary compatibility version. Not for newly compiled code. 
Also not needed for 1363 * 64-bit versions as there should be no old deployed binaries that would use 1364 * the old versions. 1365 */ 1366 1367 gchar* 1368 g_filename_to_utf8 (const gchar *opsysstring, 1369 gssize len, 1370 gsize *bytes_read, 1371 gsize *bytes_written, 1372 GError **error) 1373 { 1374 const gchar *charset; 1375 1376 if (g_get_charset (&charset)) 1377 return strdup_len (opsysstring, len, bytes_read, bytes_written, error); 1378 else 1379 return g_convert (opsysstring, len, 1380 "UTF-8", charset, bytes_read, bytes_written, error); 1381 } 1382 1383 #endif 1384 1385 /** 1386 * g_filename_from_utf8: 1387 * @utf8string: a UTF-8 encoded string. 1388 * @len: the length of the string, or -1 if the string is 1389 * nul-terminated. 1390 * @bytes_read: location to store the number of bytes in the 1391 * input string that were successfully converted, or %NULL. 1392 * Even if the conversion was successful, this may be 1393 * less than @len if there were partial characters 1394 * at the end of the input. If the error 1395 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value 1396 * stored will the byte offset after the last valid 1397 * input sequence. 1398 * @bytes_written: the number of bytes stored in the output buffer (not 1399 * including the terminating nul). 1400 * @error: location to store the error occuring, or %NULL to ignore 1401 * errors. Any of the errors in #GConvertError may occur. 1402 * 1403 * Converts a string from UTF-8 to the encoding GLib uses for 1404 * filenames. Note that on Windows GLib uses UTF-8 for filenames; 1405 * on other platforms, this function indirectly depends on the 1406 * <link linkend="setlocale">current locale</link>. 1407 * 1408 * Return value: The converted string, or %NULL on an error. 
1409 **/ 1410 gchar* 1411 g_filename_from_utf8 (const gchar *utf8string, 1412 gssize len, 1413 gsize *bytes_read, 1414 gsize *bytes_written, 1415 GError **error) 1416 { 1417 const gchar *charset; 1418 1419 if (get_filename_charset (&charset)) 1420 return strdup_len (utf8string, len, bytes_read, bytes_written, error); 1421 else 1422 return g_convert (utf8string, len, 1423 charset, "UTF-8", bytes_read, bytes_written, error); 1424 } 1425 1426 #if defined (G_OS_WIN32) && !defined (_WIN64) 1427 1428 #undef g_filename_from_utf8 1429 1430 /* Binary compatibility version. Not for newly compiled code. */ 1431 1432 gchar* 1433 g_filename_from_utf8 (const gchar *utf8string, 1434 gssize len, 1435 gsize *bytes_read, 1436 gsize *bytes_written, 1437 GError **error) 1438 { 1439 const gchar *charset; 1440 1441 if (g_get_charset (&charset)) 1442 return strdup_len (utf8string, len, bytes_read, bytes_written, error); 1443 else 1444 return g_convert (utf8string, len, 1445 charset, "UTF-8", bytes_read, bytes_written, error); 1446 } 1447 1448 #endif 1449 1450 /* Test of haystack has the needle prefix, comparing case 1451 * insensitive. haystack may be UTF-8, but needle must 1452 * contain only ascii. */ 1453 static gboolean 1454 has_case_prefix (const gchar *haystack, const gchar *needle) 1455 { 1456 const gchar *h, *n; 1457 1458 /* Eat one character at a time. 
*/ 1459 h = haystack; 1460 n = needle; 1461 1462 while (*n && *h && 1463 g_ascii_tolower (*n) == g_ascii_tolower (*h)) 1464 { 1465 n++; 1466 h++; 1467 } 1468 1469 return *n == '\0'; 1470 } 1471 1472 typedef enum { 1473 UNSAFE_ALL = 0x1, /* Escape all unsafe characters */ 1474 UNSAFE_ALLOW_PLUS = 0x2, /* Allows '+' */ 1475 UNSAFE_PATH = 0x8, /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */ 1476 UNSAFE_HOST = 0x10, /* Allows '/' and ':' and '@' */ 1477 UNSAFE_SLASHES = 0x20 /* Allows all characters except for '/' and '%' */ 1478 } UnsafeCharacterSet; 1479 1480 static const guchar acceptable[96] = { 1481 /* A table of the ASCII chars from space (32) to DEL (127) */ 1482 /* ! " # $ % & ' ( ) * + , - . / */ 1483 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C, 1484 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 1485 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20, 1486 /* @ A B C D E F G H I J K L M N O */ 1487 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, 1488 /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 1489 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, 1490 /* ` a b c d e f g h i j k l m n o */ 1491 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, 1492 /* p q r s t u v w x y z { | } ~ DEL */ 1493 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20 1494 }; 1495 1496 static const gchar hex[16] = "0123456789ABCDEF"; 1497 1498 /* Note: This escape function works on file: URIs, but if you want to 1499 * escape something else, please read RFC-2396 */ 1500 static gchar * 1501 g_escape_uri_string (const gchar *string, 1502 UnsafeCharacterSet mask) 1503 { 1504 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask)) 1505 1506 const gchar *p; 1507 gchar *q; 1508 gchar *result; 1509 int c; 1510 gint unacceptable; 1511 UnsafeCharacterSet use_mask; 1512 1513 g_return_val_if_fail 
(mask == UNSAFE_ALL 1514 || mask == UNSAFE_ALLOW_PLUS 1515 || mask == UNSAFE_PATH 1516 || mask == UNSAFE_HOST 1517 || mask == UNSAFE_SLASHES, NULL); 1518 1519 unacceptable = 0; 1520 use_mask = mask; 1521 for (p = string; *p != '\0'; p++) 1522 { 1523 c = (guchar) *p; 1524 if (!ACCEPTABLE (c)) 1525 unacceptable++; 1526 } 1527 1528 result = g_malloc (p - string + unacceptable * 2 + 1); 1529 1530 use_mask = mask; 1531 for (q = result, p = string; *p != '\0'; p++) 1532 { 1533 c = (guchar) *p; 1534 1535 if (!ACCEPTABLE (c)) 1536 { 1537 *q++ = '%'; /* means hex coming */ 1538 *q++ = hex[c >> 4]; 1539 *q++ = hex[c & 15]; 1540 } 1541 else 1542 *q++ = *p; 1543 } 1544 1545 *q = '\0'; 1546 1547 return result; 1548 } 1549 1550 1551 static gchar * 1552 g_escape_file_uri (const gchar *hostname, 1553 const gchar *pathname) 1554 { 1555 char *escaped_hostname = NULL; 1556 char *escaped_path; 1557 char *res; 1558 1559 #ifdef G_OS_WIN32 1560 char *p, *backslash; 1561 1562 /* Turn backslashes into forward slashes. That's what Netscape 1563 * does, and they are actually more or less equivalent in Windows. 1564 */ 1565 1566 pathname = g_strdup (pathname); 1567 p = (char *) pathname; 1568 1569 while ((backslash = strchr (p, '\\')) != NULL) 1570 { 1571 *backslash = '/'; 1572 p = backslash + 1; 1573 } 1574 #endif 1575 1576 if (hostname && *hostname != '\0') 1577 { 1578 escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST); 1579 } 1580 1581 escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH); 1582 1583 res = g_strconcat ("file://", 1584 (escaped_hostname) ? escaped_hostname : "", 1585 (*escaped_path != '/') ? 
"/" : "", 1586 escaped_path, 1587 NULL); 1588 1589 #ifdef G_OS_WIN32 1590 g_free ((char *) pathname); 1591 #endif 1592 1593 g_free (escaped_hostname); 1594 g_free (escaped_path); 1595 1596 return res; 1597 } 1598 1599 static int 1600 unescape_character (const char *scanner) 1601 { 1602 int first_digit; 1603 int second_digit; 1604 1605 first_digit = g_ascii_xdigit_value (scanner[0]); 1606 if (first_digit < 0) 1607 return -1; 1608 1609 second_digit = g_ascii_xdigit_value (scanner[1]); 1610 if (second_digit < 0) 1611 return -1; 1612 1613 return (first_digit << 4) | second_digit; 1614 } 1615 1616 static gchar * 1617 g_unescape_uri_string (const char *escaped, 1618 int len, 1619 const char *illegal_escaped_characters, 1620 gboolean ascii_must_not_be_escaped) 1621 { 1622 const gchar *in, *in_end; 1623 gchar *out, *result; 1624 int c; 1625 1626 if (escaped == NULL) 1627 return NULL; 1628 1629 if (len < 0) 1630 len = strlen (escaped); 1631 1632 result = g_malloc (len + 1); 1633 1634 out = result; 1635 for (in = escaped, in_end = escaped + len; in < in_end; in++) 1636 { 1637 c = *in; 1638 1639 if (c == '%') 1640 { 1641 /* catch partial escape sequences past the end of the substring */ 1642 if (in + 3 > in_end) 1643 break; 1644 1645 c = unescape_character (in + 1); 1646 1647 /* catch bad escape sequences and NUL characters */ 1648 if (c <= 0) 1649 break; 1650 1651 /* catch escaped ASCII */ 1652 if (ascii_must_not_be_escaped && c <= 0x7F) 1653 break; 1654 1655 /* catch other illegal escaped characters */ 1656 if (strchr (illegal_escaped_characters, c) != NULL) 1657 break; 1658 1659 in += 2; 1660 } 1661 1662 *out++ = c; 1663 } 1664 1665 g_assert (out - result <= len); 1666 *out = '\0'; 1667 1668 if (in != in_end) 1669 { 1670 g_free (result); 1671 return NULL; 1672 } 1673 1674 return result; 1675 } 1676 1677 static gboolean 1678 is_asciialphanum (gunichar c) 1679 { 1680 return c <= 0x7F && g_ascii_isalnum (c); 1681 } 1682 1683 static gboolean 1684 is_asciialpha (gunichar c) 
1685 { 1686 return c <= 0x7F && g_ascii_isalpha (c); 1687 } 1688 1689 /* allows an empty string */ 1690 static gboolean 1691 hostname_validate (const char *hostname) 1692 { 1693 const char *p; 1694 gunichar c, first_char, last_char; 1695 1696 p = hostname; 1697 if (*p == '\0') 1698 return TRUE; 1699 do 1700 { 1701 /* read in a label */ 1702 c = g_utf8_get_char (p); 1703 p = g_utf8_next_char (p); 1704 if (!is_asciialphanum (c)) 1705 return FALSE; 1706 first_char = c; 1707 do 1708 { 1709 last_char = c; 1710 c = g_utf8_get_char (p); 1711 p = g_utf8_next_char (p); 1712 } 1713 while (is_asciialphanum (c) || c == '-'); 1714 if (last_char == '-') 1715 return FALSE; 1716 1717 /* if that was the last label, check that it was a toplabel */ 1718 if (c == '\0' || (c == '.' && *p == '\0')) 1719 return is_asciialpha (first_char); 1720 } 1721 while (c == '.'); 1722 return FALSE; 1723 } 1724 1725 /** 1726 * g_filename_from_uri: 1727 * @uri: a uri describing a filename (escaped, encoded in ASCII). 1728 * @hostname: Location to store hostname for the URI, or %NULL. 1729 * If there is no hostname in the URI, %NULL will be 1730 * stored in this location. 1731 * @error: location to store the error occuring, or %NULL to ignore 1732 * errors. Any of the errors in #GConvertError may occur. 1733 * 1734 * Converts an escaped ASCII-encoded URI to a local filename in the 1735 * encoding used for filenames. 1736 * 1737 * Return value: a newly-allocated string holding the resulting 1738 * filename, or %NULL on an error. 
1739 **/ 1740 gchar * 1741 g_filename_from_uri (const gchar *uri, 1742 gchar **hostname, 1743 GError **error) 1744 { 1745 const char *path_part; 1746 const char *host_part; 1747 char *unescaped_hostname; 1748 char *result; 1749 char *filename; 1750 int offs; 1751 #ifdef G_OS_WIN32 1752 char *p, *slash; 1753 #endif 1754 1755 if (hostname) 1756 *hostname = NULL; 1757 1758 if (!has_case_prefix (uri, "file:/")) 1759 { 1760 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, 1761 _("The URI '%s' is not an absolute URI using the \"file\" scheme"), 1762 uri); 1763 return NULL; 1764 } 1765 1766 path_part = uri + strlen ("file:"); 1767 1768 if (strchr (path_part, '#') != NULL) 1769 { 1770 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, 1771 _("The local file URI '%s' may not include a '#'"), 1772 uri); 1773 return NULL; 1774 } 1775 1776 if (has_case_prefix (path_part, "///")) 1777 path_part += 2; 1778 else if (has_case_prefix (path_part, "//")) 1779 { 1780 path_part += 2; 1781 host_part = path_part; 1782 1783 path_part = strchr (path_part, '/'); 1784 1785 if (path_part == NULL) 1786 { 1787 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, 1788 _("The URI '%s' is invalid"), 1789 uri); 1790 return NULL; 1791 } 1792 1793 unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE); 1794 1795 if (unescaped_hostname == NULL || 1796 !hostname_validate (unescaped_hostname)) 1797 { 1798 g_free (unescaped_hostname); 1799 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, 1800 _("The hostname of the URI '%s' is invalid"), 1801 uri); 1802 return NULL; 1803 } 1804 1805 if (hostname) 1806 *hostname = unescaped_hostname; 1807 else 1808 g_free (unescaped_hostname); 1809 } 1810 1811 filename = g_unescape_uri_string (path_part, -1, "/", FALSE); 1812 1813 if (filename == NULL) 1814 { 1815 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, 1816 _("The URI '%s' contains invalidly escaped characters"), 1817 
uri); 1818 return NULL; 1819 } 1820 1821 offs = 0; 1822 #ifdef G_OS_WIN32 1823 /* Drop localhost */ 1824 if (hostname && *hostname != NULL && 1825 g_ascii_strcasecmp (*hostname, "localhost") == 0) 1826 { 1827 g_free (*hostname); 1828 *hostname = NULL; 1829 } 1830 1831 /* Turn slashes into backslashes, because that's the canonical spelling */ 1832 p = filename; 1833 while ((slash = strchr (p, '/')) != NULL) 1834 { 1835 *slash = '\\'; 1836 p = slash + 1; 1837 } 1838 1839 /* Windows URIs with a drive letter can be like "file://host/c:/foo" 1840 * or "file://host/c|/foo" (some Netscape versions). In those cases, start 1841 * the filename from the drive letter. 1842 */ 1843 if (g_ascii_isalpha (filename[1])) 1844 { 1845 if (filename[2] == ':') 1846 offs = 1; 1847 else if (filename[2] == '|') 1848 { 1849 filename[2] = ':'; 1850 offs = 1; 1851 } 1852 } 1853 #endif 1854 1855 result = g_strdup (filename + offs); 1856 g_free (filename); 1857 1858 return result; 1859 } 1860 1861 #if defined (G_OS_WIN32) && !defined (_WIN64) 1862 1863 #undef g_filename_from_uri 1864 1865 gchar * 1866 g_filename_from_uri (const gchar *uri, 1867 gchar **hostname, 1868 GError **error) 1869 { 1870 gchar *utf8_filename; 1871 gchar *retval = NULL; 1872 1873 utf8_filename = g_filename_from_uri_utf8 (uri, hostname, error); 1874 if (utf8_filename) 1875 { 1876 retval = g_locale_from_utf8 (utf8_filename, -1, NULL, NULL, error); 1877 g_free (utf8_filename); 1878 } 1879 return retval; 1880 } 1881 1882 #endif 1883 1884 /** 1885 * g_filename_to_uri: 1886 * @filename: an absolute filename specified in the GLib file name encoding, 1887 * which is the on-disk file name bytes on Unix, and UTF-8 on 1888 * Windows 1889 * @hostname: A UTF-8 encoded hostname, or %NULL for none. 1890 * @error: location to store the error occuring, or %NULL to ignore 1891 * errors. Any of the errors in #GConvertError may occur. 
1892 * 1893 * Converts an absolute filename to an escaped ASCII-encoded URI, with the path 1894 * component following Section 3.3. of RFC 2396. 1895 * 1896 * Return value: a newly-allocated string holding the resulting 1897 * URI, or %NULL on an error. 1898 **/ 1899 gchar * 1900 g_filename_to_uri (const gchar *filename, 1901 const gchar *hostname, 1902 GError **error) 1903 { 1904 char *escaped_uri; 1905 1906 g_return_val_if_fail (filename != NULL, NULL); 1907 1908 if (!g_path_is_absolute (filename)) 1909 { 1910 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, 1911 _("The pathname '%s' is not an absolute path"), 1912 filename); 1913 return NULL; 1914 } 1915 1916 if (hostname && 1917 !(g_utf8_validate (hostname, -1, NULL) 1918 && hostname_validate (hostname))) 1919 { 1920 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, 1921 _("Invalid hostname")); 1922 return NULL; 1923 } 1924 1925 #ifdef G_OS_WIN32 1926 /* Don't use localhost unnecessarily */ 1927 if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0) 1928 hostname = NULL; 1929 #endif 1930 1931 escaped_uri = g_escape_file_uri (hostname, filename); 1932 1933 return escaped_uri; 1934 } 1935 1936 #if defined (G_OS_WIN32) && !defined (_WIN64) 1937 1938 #undef g_filename_to_uri 1939 1940 gchar * 1941 g_filename_to_uri (const gchar *filename, 1942 const gchar *hostname, 1943 GError **error) 1944 { 1945 gchar *utf8_filename; 1946 gchar *retval = NULL; 1947 1948 utf8_filename = g_locale_to_utf8 (filename, -1, NULL, NULL, error); 1949 1950 if (utf8_filename) 1951 { 1952 retval = g_filename_to_uri_utf8 (utf8_filename, hostname, error); 1953 g_free (utf8_filename); 1954 } 1955 1956 return retval; 1957 } 1958 1959 #endif 1960 1961 /** 1962 * g_uri_list_extract_uris: 1963 * @uri_list: an URI list 1964 * 1965 * Splits an URI list conforming to the text/uri-list 1966 * mime type defined in RFC 2483 into individual URIs, 1967 * discarding any comments. 
The URIs are not validated. 1968 * 1969 * Returns: a newly allocated %NULL-terminated list of 1970 * strings holding the individual URIs. The array should 1971 * be freed with g_strfreev(). 1972 * 1973 * Since: 2.6 1974 */ 1975 gchar ** 1976 g_uri_list_extract_uris (const gchar *uri_list) 1977 { 1978 GSList *uris, *u; 1979 const gchar *p, *q; 1980 gchar **result; 1981 gint n_uris = 0; 1982 1983 uris = NULL; 1984 1985 p = uri_list; 1986 1987 /* We don't actually try to validate the URI according to RFC 1988 * 2396, or even check for allowed characters - we just ignore 1989 * comments and trim whitespace off the ends. We also 1990 * allow LF delimination as well as the specified CRLF. 1991 * 1992 * We do allow comments like specified in RFC 2483. 1993 */ 1994 while (p) 1995 { 1996 if (*p != '#') 1997 { 1998 while (g_ascii_isspace (*p)) 1999 p++; 2000 2001 q = p; 2002 while (*q && (*q != '\n') && (*q != '\r')) 2003 q++; 2004 2005 if (q > p) 2006 { 2007 q--; 2008 while (q > p && g_ascii_isspace (*q)) 2009 q--; 2010 2011 if (q > p) 2012 { 2013 uris = g_slist_prepend (uris, g_strndup (p, q - p + 1)); 2014 n_uris++; 2015 } 2016 } 2017 } 2018 p = strchr (p, '\n'); 2019 if (p) 2020 p++; 2021 } 2022 2023 result = g_new (gchar *, n_uris + 1); 2024 2025 result[n_uris--] = NULL; 2026 for (u = uris; u; u = u->next) 2027 result[n_uris--] = u->data; 2028 2029 g_slist_free (uris); 2030 2031 return result; 2032 } 2033 2034 /** 2035 * g_filename_display_basename: 2036 * @filename: an absolute pathname in the GLib file name encoding 2037 * 2038 * Returns the display basename for the particular filename, guaranteed 2039 * to be valid UTF-8. The display name might not be identical to the filename, 2040 * for instance there might be problems converting it to UTF-8, and some files 2041 * can be translated in the display. 
2042 * 2043 * If GLib can not make sense of the encoding of @filename, as a last resort it 2044 * replaces unknown characters with U+FFFD, the Unicode replacement character. 2045 * You can search the result for the UTF-8 encoding of this character (which is 2046 * "\357\277\275" in octal notation) to find out if @filename was in an invalid 2047 * encoding. 2048 * 2049 * You must pass the whole absolute pathname to this functions so that 2050 * translation of well known locations can be done. 2051 * 2052 * This function is preferred over g_filename_display_name() if you know the 2053 * whole path, as it allows translation. 2054 * 2055 * Return value: a newly allocated string containing 2056 * a rendition of the basename of the filename in valid UTF-8 2057 * 2058 * Since: 2.6 2059 **/ 2060 gchar * 2061 g_filename_display_basename (const gchar *filename) 2062 { 2063 char *basename; 2064 char *display_name; 2065 2066 g_return_val_if_fail (filename != NULL, NULL); 2067 2068 basename = g_path_get_basename (filename); 2069 display_name = g_filename_display_name (basename); 2070 g_free (basename); 2071 return display_name; 2072 } 2073 2074 /** 2075 * g_filename_display_name: 2076 * @filename: a pathname hopefully in the GLib file name encoding 2077 * 2078 * Converts a filename into a valid UTF-8 string. The conversion is 2079 * not necessarily reversible, so you should keep the original around 2080 * and use the return value of this function only for display purposes. 2081 * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL 2082 * even if the filename actually isn't in the GLib file name encoding. 2083 * 2084 * If GLib can not make sense of the encoding of @filename, as a last resort it 2085 * replaces unknown characters with U+FFFD, the Unicode replacement character. 2086 * You can search the result for the UTF-8 encoding of this character (which is 2087 * "\357\277\275" in octal notation) to find out if @filename was in an invalid 2088 * encoding. 
2089 * 2090 * If you know the whole pathname of the file you should use 2091 * g_filename_display_basename(), since that allows location-based 2092 * translation of filenames. 2093 * 2094 * Return value: a newly allocated string containing 2095 * a rendition of the filename in valid UTF-8 2096 * 2097 * Since: 2.6 2098 **/ 2099 gchar * 2100 g_filename_display_name (const gchar *filename) 2101 { 2102 gint i; 2103 const gchar **charsets; 2104 gchar *display_name = NULL; 2105 gboolean is_utf8; 2106 2107 is_utf8 = g_get_filename_charsets (&charsets); 2108 2109 if (is_utf8) 2110 { 2111 if (g_utf8_validate (filename, -1, NULL)) 2112 display_name = g_strdup (filename); 2113 } 2114 2115 if (!display_name) 2116 { 2117 /* Try to convert from the filename charsets to UTF-8. 2118 * Skip the first charset if it is UTF-8. 2119 */ 2120 for (i = is_utf8 ? 1 : 0; charsets[i]; i++) 2121 { 2122 display_name = g_convert (filename, -1, "UTF-8", charsets[i], 2123 NULL, NULL, NULL); 2124 2125 if (display_name) 2126 break; 2127 } 2128 } 2129 2130 /* if all conversions failed, we replace invalid UTF-8 2131 * by a question mark 2132 */ 2133 if (!display_name) 2134 display_name = _g_utf8_make_valid (filename); 2135 2136 return display_name; 2137 } 2138 2139 #define __G_CONVERT_C__ 2140 #include "galiasdef.c" 2141