1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2003 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify it 6 under the terms of the GNU Library General Public License as published 7 by the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Library General Public License for more details. 14 15 You should have received a copy of the GNU Library General Public 16 License along with this program; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, 18 USA. */ 19 20 /* Written by Bruno Haible <bruno (at) clisp.org>. */ 21 22 #ifdef HAVE_CONFIG_H 23 # include <config.h> 24 #endif 25 26 /* Specification. */ 27 #include "localcharset.h" 28 29 #if HAVE_STDDEF_H 30 # include <stddef.h> 31 #endif 32 33 #include <stdio.h> 34 #if HAVE_STRING_H 35 # include <string.h> 36 #else 37 # include <strings.h> 38 #endif 39 #if HAVE_STDLIB_H 40 # include <stdlib.h> 41 #endif 42 43 #if defined _WIN32 || defined __WIN32__ 44 # undef WIN32 /* avoid warning on mingw32 */ 45 # define WIN32 46 #endif 47 48 #if defined __EMX__ 49 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 50 # define OS2 51 #endif 52 53 #if !defined WIN32 54 # if HAVE_LANGINFO_CODESET 55 # include <langinfo.h> 56 # else 57 # if HAVE_SETLOCALE 58 # include <locale.h> 59 # endif 60 # endif 61 #elif defined WIN32 62 # define WIN32_LEAN_AND_MEAN 63 # include <windows.h> 64 #endif 65 #if defined OS2 66 # define INCL_DOS 67 # include <os2.h> 68 #endif 69 70 #if ENABLE_RELOCATABLE 71 # include "relocatable.h" 72 #else 73 # define relocate(pathname) (pathname) 74 #endif 75 76 #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__ 77 /* Win32, OS/2, DOS */ 78 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 79 #endif 80 81 #ifndef DIRECTORY_SEPARATOR 82 # define DIRECTORY_SEPARATOR '/' 83 #endif 84 85 #ifndef ISSLASH 86 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 87 #endif 88 89 #ifdef HAVE_GETC_UNLOCKED 90 # undef getc 91 # define getc getc_unlocked 92 #endif 93 94 /* The following static variable is declared 'volatile' to avoid a 95 possible multithread problem in the function get_charset_aliases. If we 96 are running in a threaded environment, and if two threads initialize 97 'charset_aliases' simultaneously, both will produce the same value, 98 and everything will be ok if the two assignments to 'charset_aliases' 99 are atomic. But I don't know what will happen if the two assignments mix. */ 100 #if __STDC__ != 1 101 # define volatile /* empty */ 102 #endif 103 /* Pointer to the contents of the charset.alias file, if it has already been 104 read, else NULL. Its format is: 105 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 106 static const char * volatile charset_aliases; 107 108 /* Return a pointer to the contents of the charset.alias file. */ 109 static const char * 110 get_charset_aliases () 111 { 112 const char *cp; 113 114 cp = charset_aliases; 115 if (cp == NULL) 116 { 117 #if !(defined VMS || defined WIN32) 118 FILE *fp; 119 const char *dir = relocate (LIBDIR); 120 const char *base = "charset.alias"; 121 char *file_name; 122 123 /* Concatenate dir and base into freshly allocated file_name. */ 124 { 125 size_t dir_len = strlen (dir); 126 size_t base_len = strlen (base); 127 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 128 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 129 if (file_name != NULL) 130 { 131 memcpy (file_name, dir, dir_len); 132 if (add_slash) 133 file_name[dir_len] = DIRECTORY_SEPARATOR; 134 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 135 } 136 } 137 138 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 139 /* Out of memory or file not found, treat it as empty. */ 140 cp = ""; 141 else 142 { 143 /* Parse the file's contents. */ 144 int c; 145 char buf1[50+1]; 146 char buf2[50+1]; 147 char *res_ptr = NULL; 148 size_t res_size = 0; 149 size_t l1, l2; 150 151 for (;;) 152 { 153 c = getc (fp); 154 if (c == EOF) 155 break; 156 if (c == '\n' || c == ' ' || c == '\t') 157 continue; 158 if (c == '#') 159 { 160 /* Skip comment, to end of line. */ 161 do 162 c = getc (fp); 163 while (!(c == EOF || c == '\n')); 164 if (c == EOF) 165 break; 166 continue; 167 } 168 ungetc (c, fp); 169 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 170 break; 171 l1 = strlen (buf1); 172 l2 = strlen (buf2); 173 if (res_size == 0) 174 { 175 res_size = l1 + 1 + l2 + 1; 176 res_ptr = (char *) malloc (res_size + 1); 177 } 178 else 179 { 180 res_size += l1 + 1 + l2 + 1; 181 res_ptr = (char *) realloc (res_ptr, res_size + 1); 182 } 183 if (res_ptr == NULL) 184 { 185 /* Out of memory. */ 186 res_size = 0; 187 break; 188 } 189 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 190 strcpy (res_ptr + res_size - (l2 + 1), buf2); 191 } 192 fclose (fp); 193 if (res_size == 0) 194 cp = ""; 195 else 196 { 197 *(res_ptr + res_size) = '\0'; 198 cp = res_ptr; 199 } 200 } 201 202 if (file_name != NULL) 203 free (file_name); 204 205 #else 206 207 # if defined VMS 208 /* To avoid the troubles of an extra file charset.alias_vms in the 209 sources of many GNU packages, simply inline the aliases here. */ 210 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 211 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 212 section 10.7 "Handling Different Character Sets". */ 213 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 214 "ISO8859-2" "\0" "ISO-8859-2" "\0" 215 "ISO8859-5" "\0" "ISO-8859-5" "\0" 216 "ISO8859-7" "\0" "ISO-8859-7" "\0" 217 "ISO8859-8" "\0" "ISO-8859-8" "\0" 218 "ISO8859-9" "\0" "ISO-8859-9" "\0" 219 /* Japanese */ 220 "eucJP" "\0" "EUC-JP" "\0" 221 "SJIS" "\0" "SHIFT_JIS" "\0" 222 "DECKANJI" "\0" "DEC-KANJI" "\0" 223 "SDECKANJI" "\0" "EUC-JP" "\0" 224 /* Chinese */ 225 "eucTW" "\0" "EUC-TW" "\0" 226 "DECHANYU" "\0" "DEC-HANYU" "\0" 227 "DECHANZI" "\0" "GB2312" "\0" 228 /* Korean */ 229 "DECKOREAN" "\0" "EUC-KR" "\0"; 230 # endif 231 232 # if defined WIN32 233 /* To avoid the troubles of installing a separate file in the same 234 directory as the DLL and of retrieving the DLL's directory at 235 runtime, simply inline the aliases here. */ 236 237 cp = "CP936" "\0" "GBK" "\0" 238 "CP1361" "\0" "JOHAB" "\0" 239 "CP20127" "\0" "ASCII" "\0" 240 "CP20866" "\0" "KOI8-R" "\0" 241 "CP21866" "\0" "KOI8-RU" "\0" 242 "CP28591" "\0" "ISO-8859-1" "\0" 243 "CP28592" "\0" "ISO-8859-2" "\0" 244 "CP28593" "\0" "ISO-8859-3" "\0" 245 "CP28594" "\0" "ISO-8859-4" "\0" 246 "CP28595" "\0" "ISO-8859-5" "\0" 247 "CP28596" "\0" "ISO-8859-6" "\0" 248 "CP28597" "\0" "ISO-8859-7" "\0" 249 "CP28598" "\0" "ISO-8859-8" "\0" 250 "CP28599" "\0" "ISO-8859-9" "\0" 251 "CP28605" "\0" "ISO-8859-15" "\0"; 252 # endif 253 #endif 254 255 charset_aliases = cp; 256 } 257 258 return cp; 259 } 260 261 /* Determine the current locale's character encoding, and canonicalize it 262 into one of the canonical names listed in config.charset. 263 The result must not be freed; it is statically allocated. 264 If the canonical name cannot be determined, the result is a non-canonical 265 name. */ 266 267 #ifdef STATIC 268 STATIC 269 #endif 270 const char * 271 locale_charset () 272 { 273 const char *codeset; 274 const char *aliases; 275 276 #if !(defined WIN32 || defined OS2) 277 278 # if HAVE_LANGINFO_CODESET 279 280 /* Most systems support nl_langinfo (CODESET) nowadays. */ 281 codeset = nl_langinfo (CODESET); 282 283 # else 284 285 /* On old systems which lack it, use setlocale or getenv. */ 286 const char *locale = NULL; 287 288 /* But most old systems don't have a complete set of locales. Some 289 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 290 use setlocale here; it would return "C" when it doesn't support the 291 locale name the user has set. */ 292 # if HAVE_SETLOCALE && 0 293 locale = setlocale (LC_CTYPE, NULL); 294 # endif 295 if (locale == NULL || locale[0] == '\0') 296 { 297 locale = getenv ("LC_ALL"); 298 if (locale == NULL || locale[0] == '\0') 299 { 300 locale = getenv ("LC_CTYPE"); 301 if (locale == NULL || locale[0] == '\0') 302 locale = getenv ("LANG"); 303 } 304 } 305 306 /* On some old systems, one used to set locale = "iso8859_1". On others, 307 you set it to "language_COUNTRY.charset". In any case, we resolve it 308 through the charset.alias file. */ 309 codeset = locale; 310 311 # endif 312 313 #elif defined WIN32 314 315 static char buf[2 + 10 + 1]; 316 317 /* Woe32 has a function returning the locale's codepage as a number. */ 318 sprintf (buf, "CP%u", GetACP ()); 319 codeset = buf; 320 321 #elif defined OS2 322 323 const char *locale; 324 static char buf[2 + 10 + 1]; 325 ULONG cp[3]; 326 ULONG cplen; 327 328 /* Allow user to override the codeset, as set in the operating system, 329 with standard language environment variables. */ 330 locale = getenv ("LC_ALL"); 331 if (locale == NULL || locale[0] == '\0') 332 { 333 locale = getenv ("LC_CTYPE"); 334 if (locale == NULL || locale[0] == '\0') 335 locale = getenv ("LANG"); 336 } 337 if (locale != NULL && locale[0] != '\0') 338 { 339 /* If the locale name contains an encoding after the dot, return it. */ 340 const char *dot = strchr (locale, '.'); 341 342 if (dot != NULL) 343 { 344 const char *modifier; 345 346 dot++; 347 /* Look for the possible @... trailer and remove it, if any. */ 348 modifier = strchr (dot, '@'); 349 if (modifier == NULL) 350 return dot; 351 if (modifier - dot < sizeof (buf)) 352 { 353 memcpy (buf, dot, modifier - dot); 354 buf [modifier - dot] = '\0'; 355 return buf; 356 } 357 } 358 359 /* Resolve through the charset.alias file. */ 360 codeset = locale; 361 } 362 else 363 { 364 /* OS/2 has a function returning the locale's codepage as a number. */ 365 if (DosQueryCp (sizeof (cp), cp, &cplen)) 366 codeset = ""; 367 else 368 { 369 sprintf (buf, "CP%u", cp[0]); 370 codeset = buf; 371 } 372 } 373 374 #endif 375 376 if (codeset == NULL) 377 /* The canonical name cannot be determined. */ 378 codeset = ""; 379 380 /* Resolve alias. */ 381 for (aliases = get_charset_aliases (); 382 *aliases != '\0'; 383 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 384 if (strcmp (codeset, aliases) == 0 385 || (aliases[0] == '*' && aliases[1] == '\0')) 386 { 387 codeset = aliases + strlen (aliases) + 1; 388 break; 389 } 390 391 /* Don't return an empty string. GNU libc and GNU libiconv interpret 392 the empty string as denoting "the locale's character encoding", 393 thus GNU libiconv would call this function a second time. */ 394 if (codeset[0] == '\0') 395 codeset = "ASCII"; 396 397 return codeset; 398 } 399