1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Determine a canonical name for the current locale's character encoding. 4 5 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc. 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 3, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program; if not, write to the Free Software Foundation, 19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 20 21 /* Written by Bruno Haible <bruno (at) clisp.org>. */ 22 23 #include <config.h> 24 25 /* Specification. */ 26 #include "localcharset.h" 27 28 #include <stddef.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <stdlib.h> 32 33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 34 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */ 35 #endif 36 37 #if defined _WIN32 || defined __WIN32__ 38 # define WIN32_NATIVE 39 #endif 40 41 #if defined __EMX__ 42 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 43 # ifndef OS2 44 # define OS2 45 # endif 46 #endif 47 48 #if !defined WIN32_NATIVE 49 # if HAVE_LANGINFO_CODESET 50 # include <langinfo.h> 51 # else 52 # if 0 /* see comment below */ 53 # include <locale.h> 54 # endif 55 # endif 56 # ifdef __CYGWIN__ 57 # define WIN32_LEAN_AND_MEAN 58 # include <windows.h> 59 # endif 60 #elif defined WIN32_NATIVE 61 # define WIN32_LEAN_AND_MEAN 62 # include <windows.h> 63 #endif 64 #if defined OS2 65 # define INCL_DOS 66 # include <os2.h> 67 #endif 68 69 #if ENABLE_RELOCATABLE 70 # include "relocatable.h" 71 #else 72 # define relocate(pathname) (pathname) 73 #endif 74 75 /* Get LIBDIR. */ 76 #ifndef LIBDIR 77 # include "configmake.h" 78 #endif 79 80 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 81 /* Win32, Cygwin, OS/2, DOS */ 82 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 83 #endif 84 85 #ifndef DIRECTORY_SEPARATOR 86 # define DIRECTORY_SEPARATOR '/' 87 #endif 88 89 #ifndef ISSLASH 90 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 91 #endif 92 93 #if HAVE_DECL_GETC_UNLOCKED 94 # undef getc 95 # define getc getc_unlocked 96 #endif 97 98 /* The following static variable is declared 'volatile' to avoid a 99 possible multithread problem in the function get_charset_aliases. If we 100 are running in a threaded environment, and if two threads initialize 101 'charset_aliases' simultaneously, both will produce the same value, 102 and everything will be ok if the two assignments to 'charset_aliases' 103 are atomic. But I don't know what will happen if the two assignments mix. */ 104 #if __STDC__ != 1 105 # define volatile /* empty */ 106 #endif 107 /* Pointer to the contents of the charset.alias file, if it has already been 108 read, else NULL. Its format is: 109 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 110 static const char * volatile charset_aliases; 111 112 /* Return a pointer to the contents of the charset.alias file. */ 113 static const char * 114 get_charset_aliases (void) 115 { 116 const char *cp; 117 118 cp = charset_aliases; 119 if (cp == NULL) 120 { 121 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) 122 FILE *fp; 123 const char *dir; 124 const char *base = "charset.alias"; 125 char *file_name; 126 127 /* Make it possible to override the charset.alias location. This is 128 necessary for running the testsuite before "make install". */ 129 dir = getenv ("CHARSETALIASDIR"); 130 if (dir == NULL || dir[0] == '\0') 131 dir = relocate (LIBDIR); 132 133 /* Concatenate dir and base into freshly allocated file_name. */ 134 { 135 size_t dir_len = strlen (dir); 136 size_t base_len = strlen (base); 137 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 138 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 139 if (file_name != NULL) 140 { 141 memcpy (file_name, dir, dir_len); 142 if (add_slash) 143 file_name[dir_len] = DIRECTORY_SEPARATOR; 144 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 145 } 146 } 147 148 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 149 /* Out of memory or file not found, treat it as empty. */ 150 cp = ""; 151 else 152 { 153 /* Parse the file's contents. */ 154 char *res_ptr = NULL; 155 size_t res_size = 0; 156 157 for (;;) 158 { 159 int c; 160 char buf1[50+1]; 161 char buf2[50+1]; 162 size_t l1, l2; 163 char *old_res_ptr; 164 165 c = getc (fp); 166 if (c == EOF) 167 break; 168 if (c == '\n' || c == ' ' || c == '\t') 169 continue; 170 if (c == '#') 171 { 172 /* Skip comment, to end of line. */ 173 do 174 c = getc (fp); 175 while (!(c == EOF || c == '\n')); 176 if (c == EOF) 177 break; 178 continue; 179 } 180 ungetc (c, fp); 181 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 182 break; 183 l1 = strlen (buf1); 184 l2 = strlen (buf2); 185 old_res_ptr = res_ptr; 186 if (res_size == 0) 187 { 188 res_size = l1 + 1 + l2 + 1; 189 res_ptr = (char *) malloc (res_size + 1); 190 } 191 else 192 { 193 res_size += l1 + 1 + l2 + 1; 194 res_ptr = (char *) realloc (res_ptr, res_size + 1); 195 } 196 if (res_ptr == NULL) 197 { 198 /* Out of memory. */ 199 res_size = 0; 200 if (old_res_ptr != NULL) 201 free (old_res_ptr); 202 break; 203 } 204 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 205 strcpy (res_ptr + res_size - (l2 + 1), buf2); 206 } 207 fclose (fp); 208 if (res_size == 0) 209 cp = ""; 210 else 211 { 212 *(res_ptr + res_size) = '\0'; 213 cp = res_ptr; 214 } 215 } 216 217 if (file_name != NULL) 218 free (file_name); 219 220 #else 221 222 # if defined DARWIN7 223 /* To avoid the trouble of installing a file that is shared by many 224 GNU packages -- many packaging systems have problems with this --, 225 simply inline the aliases here. */ 226 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 227 "ISO8859-2" "\0" "ISO-8859-2" "\0" 228 "ISO8859-4" "\0" "ISO-8859-4" "\0" 229 "ISO8859-5" "\0" "ISO-8859-5" "\0" 230 "ISO8859-7" "\0" "ISO-8859-7" "\0" 231 "ISO8859-9" "\0" "ISO-8859-9" "\0" 232 "ISO8859-13" "\0" "ISO-8859-13" "\0" 233 "ISO8859-15" "\0" "ISO-8859-15" "\0" 234 "KOI8-R" "\0" "KOI8-R" "\0" 235 "KOI8-U" "\0" "KOI8-U" "\0" 236 "CP866" "\0" "CP866" "\0" 237 "CP949" "\0" "CP949" "\0" 238 "CP1131" "\0" "CP1131" "\0" 239 "CP1251" "\0" "CP1251" "\0" 240 "eucCN" "\0" "GB2312" "\0" 241 "GB2312" "\0" "GB2312" "\0" 242 "eucJP" "\0" "EUC-JP" "\0" 243 "eucKR" "\0" "EUC-KR" "\0" 244 "Big5" "\0" "BIG5" "\0" 245 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0" 246 "GBK" "\0" "GBK" "\0" 247 "GB18030" "\0" "GB18030" "\0" 248 "SJIS" "\0" "SHIFT_JIS" "\0" 249 "ARMSCII-8" "\0" "ARMSCII-8" "\0" 250 "PT154" "\0" "PT154" "\0" 251 /*"ISCII-DEV" "\0" "?" "\0"*/ 252 "*" "\0" "UTF-8" "\0"; 253 # endif 254 255 # if defined VMS 256 /* To avoid the troubles of an extra file charset.alias_vms in the 257 sources of many GNU packages, simply inline the aliases here. */ 258 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 259 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 260 section 10.7 "Handling Different Character Sets". */ 261 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 262 "ISO8859-2" "\0" "ISO-8859-2" "\0" 263 "ISO8859-5" "\0" "ISO-8859-5" "\0" 264 "ISO8859-7" "\0" "ISO-8859-7" "\0" 265 "ISO8859-8" "\0" "ISO-8859-8" "\0" 266 "ISO8859-9" "\0" "ISO-8859-9" "\0" 267 /* Japanese */ 268 "eucJP" "\0" "EUC-JP" "\0" 269 "SJIS" "\0" "SHIFT_JIS" "\0" 270 "DECKANJI" "\0" "DEC-KANJI" "\0" 271 "SDECKANJI" "\0" "EUC-JP" "\0" 272 /* Chinese */ 273 "eucTW" "\0" "EUC-TW" "\0" 274 "DECHANYU" "\0" "DEC-HANYU" "\0" 275 "DECHANZI" "\0" "GB2312" "\0" 276 /* Korean */ 277 "DECKOREAN" "\0" "EUC-KR" "\0"; 278 # endif 279 280 # if defined WIN32_NATIVE || defined __CYGWIN__ 281 /* To avoid the troubles of installing a separate file in the same 282 directory as the DLL and of retrieving the DLL's directory at 283 runtime, simply inline the aliases here. */ 284 285 cp = "CP936" "\0" "GBK" "\0" 286 "CP1361" "\0" "JOHAB" "\0" 287 "CP20127" "\0" "ASCII" "\0" 288 "CP20866" "\0" "KOI8-R" "\0" 289 "CP20936" "\0" "GB2312" "\0" 290 "CP21866" "\0" "KOI8-RU" "\0" 291 "CP28591" "\0" "ISO-8859-1" "\0" 292 "CP28592" "\0" "ISO-8859-2" "\0" 293 "CP28593" "\0" "ISO-8859-3" "\0" 294 "CP28594" "\0" "ISO-8859-4" "\0" 295 "CP28595" "\0" "ISO-8859-5" "\0" 296 "CP28596" "\0" "ISO-8859-6" "\0" 297 "CP28597" "\0" "ISO-8859-7" "\0" 298 "CP28598" "\0" "ISO-8859-8" "\0" 299 "CP28599" "\0" "ISO-8859-9" "\0" 300 "CP28605" "\0" "ISO-8859-15" "\0" 301 "CP38598" "\0" "ISO-8859-8" "\0" 302 "CP51932" "\0" "EUC-JP" "\0" 303 "CP51936" "\0" "GB2312" "\0" 304 "CP51949" "\0" "EUC-KR" "\0" 305 "CP51950" "\0" "EUC-TW" "\0" 306 "CP54936" "\0" "GB18030" "\0" 307 "CP65001" "\0" "UTF-8" "\0"; 308 # endif 309 #endif 310 311 charset_aliases = cp; 312 } 313 314 return cp; 315 } 316 317 /* Determine the current locale's character encoding, and canonicalize it 318 into one of the canonical names listed in config.charset. 319 The result must not be freed; it is statically allocated. 320 If the canonical name cannot be determined, the result is a non-canonical 321 name. */ 322 323 #ifdef STATIC 324 STATIC 325 #endif 326 const char * 327 locale_charset (void) 328 { 329 const char *codeset; 330 const char *aliases; 331 332 #if !(defined WIN32_NATIVE || defined OS2) 333 334 # if HAVE_LANGINFO_CODESET 335 336 /* Most systems support nl_langinfo (CODESET) nowadays. */ 337 codeset = nl_langinfo (CODESET); 338 339 # ifdef __CYGWIN__ 340 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always 341 returns "US-ASCII". As long as this is not fixed, return the suffix 342 of the locale name from the environment variables (if present) or 343 the codepage as a number. */ 344 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 345 { 346 const char *locale; 347 static char buf[2 + 10 + 1]; 348 349 locale = getenv ("LC_ALL"); 350 if (locale == NULL || locale[0] == '\0') 351 { 352 locale = getenv ("LC_CTYPE"); 353 if (locale == NULL || locale[0] == '\0') 354 locale = getenv ("LANG"); 355 } 356 if (locale != NULL && locale[0] != '\0') 357 { 358 /* If the locale name contains an encoding after the dot, return 359 it. */ 360 const char *dot = strchr (locale, '.'); 361 362 if (dot != NULL) 363 { 364 const char *modifier; 365 366 dot++; 367 /* Look for the possible @... trailer and remove it, if any. */ 368 modifier = strchr (dot, '@'); 369 if (modifier == NULL) 370 return dot; 371 if (modifier - dot < sizeof (buf)) 372 { 373 memcpy (buf, dot, modifier - dot); 374 buf [modifier - dot] = '\0'; 375 return buf; 376 } 377 } 378 } 379 380 /* Woe32 has a function returning the locale's codepage as a number. */ 381 sprintf (buf, "CP%u", GetACP ()); 382 codeset = buf; 383 } 384 # endif 385 386 # else 387 388 /* On old systems which lack it, use setlocale or getenv. */ 389 const char *locale = NULL; 390 391 /* But most old systems don't have a complete set of locales. Some 392 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 393 use setlocale here; it would return "C" when it doesn't support the 394 locale name the user has set. */ 395 # if 0 396 locale = setlocale (LC_CTYPE, NULL); 397 # endif 398 if (locale == NULL || locale[0] == '\0') 399 { 400 locale = getenv ("LC_ALL"); 401 if (locale == NULL || locale[0] == '\0') 402 { 403 locale = getenv ("LC_CTYPE"); 404 if (locale == NULL || locale[0] == '\0') 405 locale = getenv ("LANG"); 406 } 407 } 408 409 /* On some old systems, one used to set locale = "iso8859_1". On others, 410 you set it to "language_COUNTRY.charset". In any case, we resolve it 411 through the charset.alias file. */ 412 codeset = locale; 413 414 # endif 415 416 #elif defined WIN32_NATIVE 417 418 static char buf[2 + 10 + 1]; 419 420 /* Woe32 has a function returning the locale's codepage as a number. */ 421 sprintf (buf, "CP%u", GetACP ()); 422 codeset = buf; 423 424 #elif defined OS2 425 426 const char *locale; 427 static char buf[2 + 10 + 1]; 428 ULONG cp[3]; 429 ULONG cplen; 430 431 /* Allow user to override the codeset, as set in the operating system, 432 with standard language environment variables. */ 433 locale = getenv ("LC_ALL"); 434 if (locale == NULL || locale[0] == '\0') 435 { 436 locale = getenv ("LC_CTYPE"); 437 if (locale == NULL || locale[0] == '\0') 438 locale = getenv ("LANG"); 439 } 440 if (locale != NULL && locale[0] != '\0') 441 { 442 /* If the locale name contains an encoding after the dot, return it. */ 443 const char *dot = strchr (locale, '.'); 444 445 if (dot != NULL) 446 { 447 const char *modifier; 448 449 dot++; 450 /* Look for the possible @... trailer and remove it, if any. */ 451 modifier = strchr (dot, '@'); 452 if (modifier == NULL) 453 return dot; 454 if (modifier - dot < sizeof (buf)) 455 { 456 memcpy (buf, dot, modifier - dot); 457 buf [modifier - dot] = '\0'; 458 return buf; 459 } 460 } 461 462 /* Resolve through the charset.alias file. */ 463 codeset = locale; 464 } 465 else 466 { 467 /* OS/2 has a function returning the locale's codepage as a number. */ 468 if (DosQueryCp (sizeof (cp), cp, &cplen)) 469 codeset = ""; 470 else 471 { 472 sprintf (buf, "CP%u", cp[0]); 473 codeset = buf; 474 } 475 } 476 477 #endif 478 479 if (codeset == NULL) 480 /* The canonical name cannot be determined. */ 481 codeset = ""; 482 483 /* Resolve alias. */ 484 for (aliases = get_charset_aliases (); 485 *aliases != '\0'; 486 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 487 if (strcmp (codeset, aliases) == 0 488 || (aliases[0] == '*' && aliases[1] == '\0')) 489 { 490 codeset = aliases + strlen (aliases) + 1; 491 break; 492 } 493 494 /* Don't return an empty string. GNU libc and GNU libiconv interpret 495 the empty string as denoting "the locale's character encoding", 496 thus GNU libiconv would call this function a second time. */ 497 if (codeset[0] == '\0') 498 codeset = "ASCII"; 499 500 return codeset; 501 } 502