1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2003 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify it 6 under the terms of the GNU Library General Public License as published 7 by the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Library General Public License for more details. 14 15 You should have received a copy of the GNU Library General Public 16 License along with this program; if not, write to the Free Software 17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 18 USA. */ 19 20 /* Written by Bruno Haible <bruno (at) clisp.org>. */ 21 22 #ifdef HAVE_CONFIG_H 23 # include <config.h> 24 #endif 25 26 /* Specification. */ 27 #include "localcharset.h" 28 29 #if HAVE_STDDEF_H 30 # include <stddef.h> 31 #endif 32 33 #include <stdio.h> 34 #if HAVE_STRING_H 35 # include <string.h> 36 #else 37 # include <strings.h> 38 #endif 39 #if HAVE_STDLIB_H 40 # include <stdlib.h> 41 #endif 42 43 #if defined _WIN32 || defined __WIN32__ 44 # undef WIN32 /* avoid warning on mingw32 */ 45 # define WIN32 46 #endif 47 48 #if defined __EMX__ 49 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 50 # define OS2 51 #endif 52 53 #if !defined WIN32 54 # if HAVE_LANGINFO_CODESET 55 # include <langinfo.h> 56 # else 57 # if HAVE_SETLOCALE 58 # include <locale.h> 59 # endif 60 # endif 61 #elif defined WIN32 62 # define WIN32_LEAN_AND_MEAN 63 # include <windows.h> 64 #endif 65 #if defined OS2 66 # define INCL_DOS 67 # include <os2.h> 68 #endif 69 70 #if ENABLE_RELOCATABLE 71 # include "relocatable.h" 72 #else 73 # define relocate(pathname) (pathname) 74 #endif 75 76 #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__ 77 /* Win32, OS/2, DOS */ 78 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 79 #endif 80 81 #ifndef DIRECTORY_SEPARATOR 82 # define DIRECTORY_SEPARATOR '/' 83 #endif 84 85 #ifndef ISSLASH 86 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 87 #endif 88 89 #if HAVE_DECL_GETC_UNLOCKED 90 # undef getc 91 # define getc getc_unlocked 92 #endif 93 94 /* The following static variable is declared 'volatile' to avoid a 95 possible multithread problem in the function get_charset_aliases. If we 96 are running in a threaded environment, and if two threads initialize 97 'charset_aliases' simultaneously, both will produce the same value, 98 and everything will be ok if the two assignments to 'charset_aliases' 99 are atomic. But I don't know what will happen if the two assignments mix. */ 100 #if __STDC__ != 1 101 # define volatile /* empty */ 102 #endif 103 /* Pointer to the contents of the charset.alias file, if it has already been 104 read, else NULL. Its format is: 105 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 106 static const char * volatile charset_aliases; 107 108 /* Return a pointer to the contents of the charset.alias file. */ 109 static const char * 110 get_charset_aliases () 111 { 112 const char *cp; 113 114 cp = charset_aliases; 115 if (cp == NULL) 116 { 117 #if !(defined VMS || defined WIN32) 118 FILE *fp; 119 const char *dir = relocate (LIBDIR); 120 const char *base = "charset.alias"; 121 char *file_name; 122 123 /* Concatenate dir and base into freshly allocated file_name. */ 124 { 125 size_t dir_len = strlen (dir); 126 size_t base_len = strlen (base); 127 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 128 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 129 if (file_name != NULL) 130 { 131 memcpy (file_name, dir, dir_len); 132 if (add_slash) 133 file_name[dir_len] = DIRECTORY_SEPARATOR; 134 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 135 } 136 } 137 138 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 139 /* Out of memory or file not found, treat it as empty. */ 140 cp = ""; 141 else 142 { 143 /* Parse the file's contents. */ 144 int c; 145 char buf1[50+1]; 146 char buf2[50+1]; 147 char *res_ptr = NULL; 148 size_t res_size = 0; 149 size_t l1, l2; 150 151 for (;;) 152 { 153 c = getc (fp); 154 if (c == EOF) 155 break; 156 if (c == '\n' || c == ' ' || c == '\t') 157 continue; 158 if (c == '#') 159 { 160 /* Skip comment, to end of line. */ 161 do 162 c = getc (fp); 163 while (!(c == EOF || c == '\n')); 164 if (c == EOF) 165 break; 166 continue; 167 } 168 ungetc (c, fp); 169 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 170 break; 171 l1 = strlen (buf1); 172 l2 = strlen (buf2); 173 if (res_size == 0) 174 { 175 res_size = l1 + 1 + l2 + 1; 176 res_ptr = (char *) malloc (res_size + 1); 177 } 178 else 179 { 180 res_size += l1 + 1 + l2 + 1; 181 res_ptr = (char *) realloc (res_ptr, res_size + 1); 182 } 183 if (res_ptr == NULL) 184 { 185 /* Out of memory. */ 186 res_size = 0; 187 break; 188 } 189 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 190 strcpy (res_ptr + res_size - (l2 + 1), buf2); 191 } 192 fclose (fp); 193 if (res_size == 0) 194 cp = ""; 195 else 196 { 197 *(res_ptr + res_size) = '\0'; 198 cp = res_ptr; 199 } 200 } 201 202 free (file_name); 203 204 #else 205 206 # if defined VMS 207 /* To avoid the troubles of an extra file charset.alias_vms in the 208 sources of many GNU packages, simply inline the aliases here. */ 209 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 210 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 211 section 10.7 "Handling Different Character Sets". */ 212 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 213 "ISO8859-2" "\0" "ISO-8859-2" "\0" 214 "ISO8859-5" "\0" "ISO-8859-5" "\0" 215 "ISO8859-7" "\0" "ISO-8859-7" "\0" 216 "ISO8859-8" "\0" "ISO-8859-8" "\0" 217 "ISO8859-9" "\0" "ISO-8859-9" "\0" 218 /* Japanese */ 219 "eucJP" "\0" "EUC-JP" "\0" 220 "SJIS" "\0" "SHIFT_JIS" "\0" 221 "DECKANJI" "\0" "DEC-KANJI" "\0" 222 "SDECKANJI" "\0" "EUC-JP" "\0" 223 /* Chinese */ 224 "eucTW" "\0" "EUC-TW" "\0" 225 "DECHANYU" "\0" "DEC-HANYU" "\0" 226 "DECHANZI" "\0" "GB2312" "\0" 227 /* Korean */ 228 "DECKOREAN" "\0" "EUC-KR" "\0"; 229 # endif 230 231 # if defined WIN32 232 /* To avoid the troubles of installing a separate file in the same 233 directory as the DLL and of retrieving the DLL's directory at 234 runtime, simply inline the aliases here. */ 235 236 cp = "CP936" "\0" "GBK" "\0" 237 "CP1361" "\0" "JOHAB" "\0" 238 "CP20127" "\0" "ASCII" "\0" 239 "CP20866" "\0" "KOI8-R" "\0" 240 "CP21866" "\0" "KOI8-RU" "\0" 241 "CP28591" "\0" "ISO-8859-1" "\0" 242 "CP28592" "\0" "ISO-8859-2" "\0" 243 "CP28593" "\0" "ISO-8859-3" "\0" 244 "CP28594" "\0" "ISO-8859-4" "\0" 245 "CP28595" "\0" "ISO-8859-5" "\0" 246 "CP28596" "\0" "ISO-8859-6" "\0" 247 "CP28597" "\0" "ISO-8859-7" "\0" 248 "CP28598" "\0" "ISO-8859-8" "\0" 249 "CP28599" "\0" "ISO-8859-9" "\0" 250 "CP28605" "\0" "ISO-8859-15" "\0"; 251 # endif 252 #endif 253 254 charset_aliases = cp; 255 } 256 257 return cp; 258 } 259 260 /* Determine the current locale's character encoding, and canonicalize it 261 into one of the canonical names listed in config.charset. 262 The result must not be freed; it is statically allocated. 263 If the canonical name cannot be determined, the result is a non-canonical 264 name. */ 265 266 #ifdef STATIC 267 STATIC 268 #endif 269 const char * 270 locale_charset () 271 { 272 const char *codeset; 273 const char *aliases; 274 275 #if !(defined WIN32 || defined OS2) 276 277 # if HAVE_LANGINFO_CODESET 278 279 /* Most systems support nl_langinfo (CODESET) nowadays. */ 280 codeset = nl_langinfo (CODESET); 281 282 # else 283 284 /* On old systems which lack it, use setlocale or getenv. */ 285 const char *locale = NULL; 286 287 /* But most old systems don't have a complete set of locales. Some 288 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 289 use setlocale here; it would return "C" when it doesn't support the 290 locale name the user has set. */ 291 # if HAVE_SETLOCALE && 0 292 locale = setlocale (LC_CTYPE, NULL); 293 # endif 294 if (locale == NULL || locale[0] == '\0') 295 { 296 locale = getenv ("LC_ALL"); 297 if (locale == NULL || locale[0] == '\0') 298 { 299 locale = getenv ("LC_CTYPE"); 300 if (locale == NULL || locale[0] == '\0') 301 locale = getenv ("LANG"); 302 } 303 } 304 305 /* On some old systems, one used to set locale = "iso8859_1". On others, 306 you set it to "language_COUNTRY.charset". In any case, we resolve it 307 through the charset.alias file. */ 308 codeset = locale; 309 310 # endif 311 312 #elif defined WIN32 313 314 static char buf[2 + 10 + 1]; 315 316 /* Woe32 has a function returning the locale's codepage as a number. */ 317 sprintf (buf, "CP%u", GetACP ()); 318 codeset = buf; 319 320 #elif defined OS2 321 322 const char *locale; 323 static char buf[2 + 10 + 1]; 324 ULONG cp[3]; 325 ULONG cplen; 326 327 /* Allow user to override the codeset, as set in the operating system, 328 with standard language environment variables. */ 329 locale = getenv ("LC_ALL"); 330 if (locale == NULL || locale[0] == '\0') 331 { 332 locale = getenv ("LC_CTYPE"); 333 if (locale == NULL || locale[0] == '\0') 334 locale = getenv ("LANG"); 335 } 336 if (locale != NULL && locale[0] != '\0') 337 { 338 /* If the locale name contains an encoding after the dot, return it. */ 339 const char *dot = strchr (locale, '.'); 340 341 if (dot != NULL) 342 { 343 const char *modifier; 344 345 dot++; 346 /* Look for the possible @... trailer and remove it, if any. */ 347 modifier = strchr (dot, '@'); 348 if (modifier == NULL) 349 return dot; 350 if (modifier - dot < sizeof (buf)) 351 { 352 memcpy (buf, dot, modifier - dot); 353 buf [modifier - dot] = '\0'; 354 return buf; 355 } 356 } 357 358 /* Resolve through the charset.alias file. */ 359 codeset = locale; 360 } 361 else 362 { 363 /* OS/2 has a function returning the locale's codepage as a number. */ 364 if (DosQueryCp (sizeof (cp), cp, &cplen)) 365 codeset = ""; 366 else 367 { 368 sprintf (buf, "CP%u", cp[0]); 369 codeset = buf; 370 } 371 } 372 373 #endif 374 375 if (codeset == NULL) 376 /* The canonical name cannot be determined. */ 377 codeset = ""; 378 379 /* Resolve alias. */ 380 for (aliases = get_charset_aliases (); 381 *aliases != '\0'; 382 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 383 if (strcmp (codeset, aliases) == 0 384 || (aliases[0] == '*' && aliases[1] == '\0')) 385 { 386 codeset = aliases + strlen (aliases) + 1; 387 break; 388 } 389 390 /* Don't return an empty string. GNU libc and GNU libiconv interpret 391 the empty string as denoting "the locale's character encoding", 392 thus GNU libiconv would call this function a second time. */ 393 if (codeset[0] == '\0') 394 codeset = "ASCII"; 395 396 return codeset; 397 } 398