1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006, 2008-2012 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License along 16 with this program; if not, see <http://www.gnu.org/licenses/>. */ 17 18 /* Written by Bruno Haible <bruno (at) clisp.org>. */ 19 20 #include <config.h> 21 22 /* Specification. */ 23 #include "localcharset.h" 24 25 #include <fcntl.h> 26 #include <stddef.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <stdlib.h> 30 31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 32 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ 33 #endif 34 35 #if defined _WIN32 || defined __WIN32__ 36 # define WINDOWS_NATIVE 37 #endif 38 39 #if defined __EMX__ 40 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 41 # ifndef OS2 42 # define OS2 43 # endif 44 #endif 45 46 #if !defined WINDOWS_NATIVE 47 # include <unistd.h> 48 # if HAVE_LANGINFO_CODESET 49 # include <langinfo.h> 50 # else 51 # if 0 /* see comment below */ 52 # include <locale.h> 53 # endif 54 # endif 55 # ifdef __CYGWIN__ 56 # define WIN32_LEAN_AND_MEAN 57 # include <windows.h> 58 # endif 59 #elif defined WINDOWS_NATIVE 60 # define WIN32_LEAN_AND_MEAN 61 # include <windows.h> 62 #endif 63 #if defined OS2 64 # define INCL_DOS 65 # include <os2.h> 66 #endif 67 68 #if ENABLE_RELOCATABLE 69 # include "relocatable.h" 70 #else 71 # define relocate(pathname) (pathname) 72 #endif 73 74 /* Get LIBDIR. */ 75 #ifndef LIBDIR 76 # include "configmake.h" 77 #endif 78 79 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */ 80 #ifndef O_NOFOLLOW 81 # define O_NOFOLLOW 0 82 #endif 83 84 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 85 /* Native Windows, Cygwin, OS/2, DOS */ 86 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 87 #endif 88 89 #ifndef DIRECTORY_SEPARATOR 90 # define DIRECTORY_SEPARATOR '/' 91 #endif 92 93 #ifndef ISSLASH 94 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 95 #endif 96 97 #if HAVE_DECL_GETC_UNLOCKED 98 # undef getc 99 # define getc getc_unlocked 100 #endif 101 102 /* The following static variable is declared 'volatile' to avoid a 103 possible multithread problem in the function get_charset_aliases. If we 104 are running in a threaded environment, and if two threads initialize 105 'charset_aliases' simultaneously, both will produce the same value, 106 and everything will be ok if the two assignments to 'charset_aliases' 107 are atomic. But I don't know what will happen if the two assignments mix. */ 108 #if __STDC__ != 1 109 # define volatile /* empty */ 110 #endif 111 /* Pointer to the contents of the charset.alias file, if it has already been 112 read, else NULL. Its format is: 113 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 114 static const char * volatile charset_aliases; 115 116 /* Return a pointer to the contents of the charset.alias file. */ 117 static const char * 118 get_charset_aliases (void) 119 { 120 const char *cp; 121 122 cp = charset_aliases; 123 if (cp == NULL) 124 { 125 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__) 126 const char *dir; 127 const char *base = "charset.alias"; 128 char *file_name; 129 130 /* Make it possible to override the charset.alias location. This is 131 necessary for running the testsuite before "make install". */ 132 dir = getenv ("CHARSETALIASDIR"); 133 if (dir == NULL || dir[0] == '\0') 134 dir = relocate (LIBDIR); 135 136 /* Concatenate dir and base into freshly allocated file_name. */ 137 { 138 size_t dir_len = strlen (dir); 139 size_t base_len = strlen (base); 140 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 141 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 142 if (file_name != NULL) 143 { 144 memcpy (file_name, dir, dir_len); 145 if (add_slash) 146 file_name[dir_len] = DIRECTORY_SEPARATOR; 147 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 148 } 149 } 150 151 if (file_name == NULL) 152 /* Out of memory. Treat the file as empty. */ 153 cp = ""; 154 else 155 { 156 int fd; 157 158 /* Open the file. Reject symbolic links on platforms that support 159 O_NOFOLLOW. This is a security feature. Without it, an attacker 160 could retrieve parts of the contents (namely, the tail of the 161 first line that starts with "* ") of an arbitrary file by placing 162 a symbolic link to that file under the name "charset.alias" in 163 some writable directory and defining the environment variable 164 CHARSETALIASDIR to point to that directory. */ 165 fd = open (file_name, 166 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0)); 167 if (fd < 0) 168 /* File not found. Treat it as empty. */ 169 cp = ""; 170 else 171 { 172 FILE *fp; 173 174 fp = fdopen (fd, "r"); 175 if (fp == NULL) 176 { 177 /* Out of memory. Treat the file as empty. */ 178 close (fd); 179 cp = ""; 180 } 181 else 182 { 183 /* Parse the file's contents. */ 184 char *res_ptr = NULL; 185 size_t res_size = 0; 186 187 for (;;) 188 { 189 int c; 190 char buf1[50+1]; 191 char buf2[50+1]; 192 size_t l1, l2; 193 char *old_res_ptr; 194 195 c = getc (fp); 196 if (c == EOF) 197 break; 198 if (c == '\n' || c == ' ' || c == '\t') 199 continue; 200 if (c == '#') 201 { 202 /* Skip comment, to end of line. */ 203 do 204 c = getc (fp); 205 while (!(c == EOF || c == '\n')); 206 if (c == EOF) 207 break; 208 continue; 209 } 210 ungetc (c, fp); 211 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 212 break; 213 l1 = strlen (buf1); 214 l2 = strlen (buf2); 215 old_res_ptr = res_ptr; 216 if (res_size == 0) 217 { 218 res_size = l1 + 1 + l2 + 1; 219 res_ptr = (char *) malloc (res_size + 1); 220 } 221 else 222 { 223 res_size += l1 + 1 + l2 + 1; 224 res_ptr = (char *) realloc (res_ptr, res_size + 1); 225 } 226 if (res_ptr == NULL) 227 { 228 /* Out of memory. */ 229 res_size = 0; 230 free (old_res_ptr); 231 break; 232 } 233 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 234 strcpy (res_ptr + res_size - (l2 + 1), buf2); 235 } 236 fclose (fp); 237 if (res_size == 0) 238 cp = ""; 239 else 240 { 241 *(res_ptr + res_size) = '\0'; 242 cp = res_ptr; 243 } 244 } 245 } 246 247 free (file_name); 248 } 249 250 #else 251 252 # if defined DARWIN7 253 /* To avoid the trouble of installing a file that is shared by many 254 GNU packages -- many packaging systems have problems with this --, 255 simply inline the aliases here. */ 256 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 257 "ISO8859-2" "\0" "ISO-8859-2" "\0" 258 "ISO8859-4" "\0" "ISO-8859-4" "\0" 259 "ISO8859-5" "\0" "ISO-8859-5" "\0" 260 "ISO8859-7" "\0" "ISO-8859-7" "\0" 261 "ISO8859-9" "\0" "ISO-8859-9" "\0" 262 "ISO8859-13" "\0" "ISO-8859-13" "\0" 263 "ISO8859-15" "\0" "ISO-8859-15" "\0" 264 "KOI8-R" "\0" "KOI8-R" "\0" 265 "KOI8-U" "\0" "KOI8-U" "\0" 266 "CP866" "\0" "CP866" "\0" 267 "CP949" "\0" "CP949" "\0" 268 "CP1131" "\0" "CP1131" "\0" 269 "CP1251" "\0" "CP1251" "\0" 270 "eucCN" "\0" "GB2312" "\0" 271 "GB2312" "\0" "GB2312" "\0" 272 "eucJP" "\0" "EUC-JP" "\0" 273 "eucKR" "\0" "EUC-KR" "\0" 274 "Big5" "\0" "BIG5" "\0" 275 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0" 276 "GBK" "\0" "GBK" "\0" 277 "GB18030" "\0" "GB18030" "\0" 278 "SJIS" "\0" "SHIFT_JIS" "\0" 279 "ARMSCII-8" "\0" "ARMSCII-8" "\0" 280 "PT154" "\0" "PT154" "\0" 281 /*"ISCII-DEV" "\0" "?" "\0"*/ 282 "*" "\0" "UTF-8" "\0"; 283 # endif 284 285 # if defined VMS 286 /* To avoid the troubles of an extra file charset.alias_vms in the 287 sources of many GNU packages, simply inline the aliases here. */ 288 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 289 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 290 section 10.7 "Handling Different Character Sets". */ 291 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 292 "ISO8859-2" "\0" "ISO-8859-2" "\0" 293 "ISO8859-5" "\0" "ISO-8859-5" "\0" 294 "ISO8859-7" "\0" "ISO-8859-7" "\0" 295 "ISO8859-8" "\0" "ISO-8859-8" "\0" 296 "ISO8859-9" "\0" "ISO-8859-9" "\0" 297 /* Japanese */ 298 "eucJP" "\0" "EUC-JP" "\0" 299 "SJIS" "\0" "SHIFT_JIS" "\0" 300 "DECKANJI" "\0" "DEC-KANJI" "\0" 301 "SDECKANJI" "\0" "EUC-JP" "\0" 302 /* Chinese */ 303 "eucTW" "\0" "EUC-TW" "\0" 304 "DECHANYU" "\0" "DEC-HANYU" "\0" 305 "DECHANZI" "\0" "GB2312" "\0" 306 /* Korean */ 307 "DECKOREAN" "\0" "EUC-KR" "\0"; 308 # endif 309 310 # if defined WINDOWS_NATIVE || defined __CYGWIN__ 311 /* To avoid the troubles of installing a separate file in the same 312 directory as the DLL and of retrieving the DLL's directory at 313 runtime, simply inline the aliases here. */ 314 315 cp = "CP936" "\0" "GBK" "\0" 316 "CP1361" "\0" "JOHAB" "\0" 317 "CP20127" "\0" "ASCII" "\0" 318 "CP20866" "\0" "KOI8-R" "\0" 319 "CP20936" "\0" "GB2312" "\0" 320 "CP21866" "\0" "KOI8-RU" "\0" 321 "CP28591" "\0" "ISO-8859-1" "\0" 322 "CP28592" "\0" "ISO-8859-2" "\0" 323 "CP28593" "\0" "ISO-8859-3" "\0" 324 "CP28594" "\0" "ISO-8859-4" "\0" 325 "CP28595" "\0" "ISO-8859-5" "\0" 326 "CP28596" "\0" "ISO-8859-6" "\0" 327 "CP28597" "\0" "ISO-8859-7" "\0" 328 "CP28598" "\0" "ISO-8859-8" "\0" 329 "CP28599" "\0" "ISO-8859-9" "\0" 330 "CP28605" "\0" "ISO-8859-15" "\0" 331 "CP38598" "\0" "ISO-8859-8" "\0" 332 "CP51932" "\0" "EUC-JP" "\0" 333 "CP51936" "\0" "GB2312" "\0" 334 "CP51949" "\0" "EUC-KR" "\0" 335 "CP51950" "\0" "EUC-TW" "\0" 336 "CP54936" "\0" "GB18030" "\0" 337 "CP65001" "\0" "UTF-8" "\0"; 338 # endif 339 #endif 340 341 charset_aliases = cp; 342 } 343 344 return cp; 345 } 346 347 /* Determine the current locale's character encoding, and canonicalize it 348 into one of the canonical names listed in config.charset. 349 The result must not be freed; it is statically allocated. 350 If the canonical name cannot be determined, the result is a non-canonical 351 name. */ 352 353 #ifdef STATIC 354 STATIC 355 #endif 356 const char * 357 locale_charset (void) 358 { 359 const char *codeset; 360 const char *aliases; 361 362 #if !(defined WINDOWS_NATIVE || defined OS2) 363 364 # if HAVE_LANGINFO_CODESET 365 366 /* Most systems support nl_langinfo (CODESET) nowadays. */ 367 codeset = nl_langinfo (CODESET); 368 369 # ifdef __CYGWIN__ 370 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always 371 returns "US-ASCII". Return the suffix of the locale name from the 372 environment variables (if present) or the codepage as a number. */ 373 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 374 { 375 const char *locale; 376 static char buf[2 + 10 + 1]; 377 378 locale = getenv ("LC_ALL"); 379 if (locale == NULL || locale[0] == '\0') 380 { 381 locale = getenv ("LC_CTYPE"); 382 if (locale == NULL || locale[0] == '\0') 383 locale = getenv ("LANG"); 384 } 385 if (locale != NULL && locale[0] != '\0') 386 { 387 /* If the locale name contains an encoding after the dot, return 388 it. */ 389 const char *dot = strchr (locale, '.'); 390 391 if (dot != NULL) 392 { 393 const char *modifier; 394 395 dot++; 396 /* Look for the possible @... trailer and remove it, if any. */ 397 modifier = strchr (dot, '@'); 398 if (modifier == NULL) 399 return dot; 400 if (modifier - dot < sizeof (buf)) 401 { 402 memcpy (buf, dot, modifier - dot); 403 buf [modifier - dot] = '\0'; 404 return buf; 405 } 406 } 407 } 408 409 /* The Windows API has a function returning the locale's codepage as a 410 number: GetACP(). This encoding is used by Cygwin, unless the user 411 has set the environment variable CYGWIN=codepage:oem (which very few 412 people do). 413 Output directed to console windows needs to be converted (to 414 GetOEMCP() if the console is using a raster font, or to 415 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does 416 this conversion transparently (see winsup/cygwin/fhandler_console.cc), 417 converting to GetConsoleOutputCP(). This leads to correct results, 418 except when SetConsoleOutputCP has been called and a raster font is 419 in use. */ 420 sprintf (buf, "CP%u", GetACP ()); 421 codeset = buf; 422 } 423 # endif 424 425 # else 426 427 /* On old systems which lack it, use setlocale or getenv. */ 428 const char *locale = NULL; 429 430 /* But most old systems don't have a complete set of locales. Some 431 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 432 use setlocale here; it would return "C" when it doesn't support the 433 locale name the user has set. */ 434 # if 0 435 locale = setlocale (LC_CTYPE, NULL); 436 # endif 437 if (locale == NULL || locale[0] == '\0') 438 { 439 locale = getenv ("LC_ALL"); 440 if (locale == NULL || locale[0] == '\0') 441 { 442 locale = getenv ("LC_CTYPE"); 443 if (locale == NULL || locale[0] == '\0') 444 locale = getenv ("LANG"); 445 } 446 } 447 448 /* On some old systems, one used to set locale = "iso8859_1". On others, 449 you set it to "language_COUNTRY.charset". In any case, we resolve it 450 through the charset.alias file. */ 451 codeset = locale; 452 453 # endif 454 455 #elif defined WINDOWS_NATIVE 456 457 static char buf[2 + 10 + 1]; 458 459 /* The Windows API has a function returning the locale's codepage as a 460 number: GetACP(). 461 When the output goes to a console window, it needs to be provided in 462 GetOEMCP() encoding if the console is using a raster font, or in 463 GetConsoleOutputCP() encoding if it is using a TrueType font. 464 But in GUI programs and for output sent to files and pipes, GetACP() 465 encoding is the best bet. */ 466 sprintf (buf, "CP%u", GetACP ()); 467 codeset = buf; 468 469 #elif defined OS2 470 471 const char *locale; 472 static char buf[2 + 10 + 1]; 473 ULONG cp[3]; 474 ULONG cplen; 475 476 /* Allow user to override the codeset, as set in the operating system, 477 with standard language environment variables. */ 478 locale = getenv ("LC_ALL"); 479 if (locale == NULL || locale[0] == '\0') 480 { 481 locale = getenv ("LC_CTYPE"); 482 if (locale == NULL || locale[0] == '\0') 483 locale = getenv ("LANG"); 484 } 485 if (locale != NULL && locale[0] != '\0') 486 { 487 /* If the locale name contains an encoding after the dot, return it. */ 488 const char *dot = strchr (locale, '.'); 489 490 if (dot != NULL) 491 { 492 const char *modifier; 493 494 dot++; 495 /* Look for the possible @... trailer and remove it, if any. */ 496 modifier = strchr (dot, '@'); 497 if (modifier == NULL) 498 return dot; 499 if (modifier - dot < sizeof (buf)) 500 { 501 memcpy (buf, dot, modifier - dot); 502 buf [modifier - dot] = '\0'; 503 return buf; 504 } 505 } 506 507 /* Resolve through the charset.alias file. */ 508 codeset = locale; 509 } 510 else 511 { 512 /* OS/2 has a function returning the locale's codepage as a number. */ 513 if (DosQueryCp (sizeof (cp), cp, &cplen)) 514 codeset = ""; 515 else 516 { 517 sprintf (buf, "CP%u", cp[0]); 518 codeset = buf; 519 } 520 } 521 522 #endif 523 524 if (codeset == NULL) 525 /* The canonical name cannot be determined. */ 526 codeset = ""; 527 528 /* Resolve alias. */ 529 for (aliases = get_charset_aliases (); 530 *aliases != '\0'; 531 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 532 if (strcmp (codeset, aliases) == 0 533 || (aliases[0] == '*' && aliases[1] == '\0')) 534 { 535 codeset = aliases + strlen (aliases) + 1; 536 break; 537 } 538 539 /* Don't return an empty string. GNU libc and GNU libiconv interpret 540 the empty string as denoting "the locale's character encoding", 541 thus GNU libiconv would call this function a second time. */ 542 if (codeset[0] == '\0') 543 codeset = "ASCII"; 544 545 #ifdef DARWIN7 546 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8" 547 (the default codeset) does not work when MB_CUR_MAX is 1. */ 548 if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX <= 1) 549 codeset = "ASCII"; 550 #endif 551 552 return codeset; 553 } 554