Home | History | Annotate | Download | only in lib
      1 /* -*- buffer-read-only: t -*- vi: set ro: */
      2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
      3 /* Determine a canonical name for the current locale's character encoding.
      4 
      5    Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
      6 
      7    This program is free software; you can redistribute it and/or modify
      8    it under the terms of the GNU General Public License as published by
      9    the Free Software Foundation; either version 3, or (at your option)
     10    any later version.
     11 
     12    This program is distributed in the hope that it will be useful,
     13    but WITHOUT ANY WARRANTY; without even the implied warranty of
     14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15    GNU General Public License for more details.
     16 
     17    You should have received a copy of the GNU General Public License along
     18    with this program; if not, write to the Free Software Foundation,
     19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
     20 
     21 /* Written by Bruno Haible <bruno (at) clisp.org>.  */
     22 
     23 #include <config.h>
     24 
     25 /* Specification.  */
     26 #include "localcharset.h"
     27 
     28 #include <stddef.h>
     29 #include <stdio.h>
     30 #include <string.h>
     31 #include <stdlib.h>
     32 
     33 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
     34 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
     35 #endif
     36 
     37 #if defined _WIN32 || defined __WIN32__
     38 # define WIN32_NATIVE
     39 #endif
     40 
     41 #if defined __EMX__
     42 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
     43 # ifndef OS2
     44 #  define OS2
     45 # endif
     46 #endif
     47 
     48 #if !defined WIN32_NATIVE
     49 # if HAVE_LANGINFO_CODESET
     50 #  include <langinfo.h>
     51 # else
     52 #  if 0 /* see comment below */
     53 #   include <locale.h>
     54 #  endif
     55 # endif
     56 # ifdef __CYGWIN__
     57 #  define WIN32_LEAN_AND_MEAN
     58 #  include <windows.h>
     59 # endif
     60 #elif defined WIN32_NATIVE
     61 # define WIN32_LEAN_AND_MEAN
     62 # include <windows.h>
     63 #endif
     64 #if defined OS2
     65 # define INCL_DOS
     66 # include <os2.h>
     67 #endif
     68 
     69 #if ENABLE_RELOCATABLE
     70 # include "relocatable.h"
     71 #else
     72 # define relocate(pathname) (pathname)
     73 #endif
     74 
     75 /* Get LIBDIR.  */
     76 #ifndef LIBDIR
     77 # include "configmake.h"
     78 #endif
     79 
     80 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
     81   /* Win32, Cygwin, OS/2, DOS */
     82 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
     83 #endif
     84 
     85 #ifndef DIRECTORY_SEPARATOR
     86 # define DIRECTORY_SEPARATOR '/'
     87 #endif
     88 
     89 #ifndef ISSLASH
     90 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
     91 #endif
     92 
     93 #if HAVE_DECL_GETC_UNLOCKED
     94 # undef getc
     95 # define getc getc_unlocked
     96 #endif
     97 
     98 /* The following static variable is declared 'volatile' to avoid a
     99    possible multithread problem in the function get_charset_aliases. If we
    100    are running in a threaded environment, and if two threads initialize
    101    'charset_aliases' simultaneously, both will produce the same value,
    102    and everything will be ok if the two assignments to 'charset_aliases'
    103    are atomic. But I don't know what will happen if the two assignments mix.  */
    104 #if __STDC__ != 1
    105 # define volatile /* empty */
    106 #endif
    107 /* Pointer to the contents of the charset.alias file, if it has already been
    108    read, else NULL.  Its format is:
    109    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
    110 static const char * volatile charset_aliases;
    111 
    112 /* Return a pointer to the contents of the charset.alias file.  */
    113 static const char *
    114 get_charset_aliases (void)
    115 {
    116   const char *cp;
    117 
    118   cp = charset_aliases;
    119   if (cp == NULL)
    120     {
    121 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
    122       FILE *fp;
    123       const char *dir;
    124       const char *base = "charset.alias";
    125       char *file_name;
    126 
    127       /* Make it possible to override the charset.alias location.  This is
    128 	 necessary for running the testsuite before "make install".  */
    129       dir = getenv ("CHARSETALIASDIR");
    130       if (dir == NULL || dir[0] == '\0')
    131 	dir = relocate (LIBDIR);
    132 
    133       /* Concatenate dir and base into freshly allocated file_name.  */
    134       {
    135 	size_t dir_len = strlen (dir);
    136 	size_t base_len = strlen (base);
    137 	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
    138 	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
    139 	if (file_name != NULL)
    140 	  {
    141 	    memcpy (file_name, dir, dir_len);
    142 	    if (add_slash)
    143 	      file_name[dir_len] = DIRECTORY_SEPARATOR;
    144 	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
    145 	  }
    146       }
    147 
    148       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
    149 	/* Out of memory or file not found, treat it as empty.  */
    150 	cp = "";
    151       else
    152 	{
    153 	  /* Parse the file's contents.  */
    154 	  char *res_ptr = NULL;
    155 	  size_t res_size = 0;
    156 
    157 	  for (;;)
    158 	    {
    159 	      int c;
    160 	      char buf1[50+1];
    161 	      char buf2[50+1];
    162 	      size_t l1, l2;
    163 	      char *old_res_ptr;
    164 
    165 	      c = getc (fp);
    166 	      if (c == EOF)
    167 		break;
    168 	      if (c == '\n' || c == ' ' || c == '\t')
    169 		continue;
    170 	      if (c == '#')
    171 		{
    172 		  /* Skip comment, to end of line.  */
    173 		  do
    174 		    c = getc (fp);
    175 		  while (!(c == EOF || c == '\n'));
    176 		  if (c == EOF)
    177 		    break;
    178 		  continue;
    179 		}
    180 	      ungetc (c, fp);
    181 	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
    182 		break;
    183 	      l1 = strlen (buf1);
    184 	      l2 = strlen (buf2);
    185 	      old_res_ptr = res_ptr;
    186 	      if (res_size == 0)
    187 		{
    188 		  res_size = l1 + 1 + l2 + 1;
    189 		  res_ptr = (char *) malloc (res_size + 1);
    190 		}
    191 	      else
    192 		{
    193 		  res_size += l1 + 1 + l2 + 1;
    194 		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
    195 		}
    196 	      if (res_ptr == NULL)
    197 		{
    198 		  /* Out of memory. */
    199 		  res_size = 0;
    200 		  if (old_res_ptr != NULL)
    201 		    free (old_res_ptr);
    202 		  break;
    203 		}
    204 	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
    205 	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
    206 	    }
    207 	  fclose (fp);
    208 	  if (res_size == 0)
    209 	    cp = "";
    210 	  else
    211 	    {
    212 	      *(res_ptr + res_size) = '\0';
    213 	      cp = res_ptr;
    214 	    }
    215 	}
    216 
    217       if (file_name != NULL)
    218 	free (file_name);
    219 
    220 #else
    221 
    222 # if defined DARWIN7
    223       /* To avoid the trouble of installing a file that is shared by many
    224 	 GNU packages -- many packaging systems have problems with this --,
    225 	 simply inline the aliases here.  */
    226       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
    227 	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
    228 	   "ISO8859-4" "\0" "ISO-8859-4" "\0"
    229 	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
    230 	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
    231 	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
    232 	   "ISO8859-13" "\0" "ISO-8859-13" "\0"
    233 	   "ISO8859-15" "\0" "ISO-8859-15" "\0"
    234 	   "KOI8-R" "\0" "KOI8-R" "\0"
    235 	   "KOI8-U" "\0" "KOI8-U" "\0"
    236 	   "CP866" "\0" "CP866" "\0"
    237 	   "CP949" "\0" "CP949" "\0"
    238 	   "CP1131" "\0" "CP1131" "\0"
    239 	   "CP1251" "\0" "CP1251" "\0"
    240 	   "eucCN" "\0" "GB2312" "\0"
    241 	   "GB2312" "\0" "GB2312" "\0"
    242 	   "eucJP" "\0" "EUC-JP" "\0"
    243 	   "eucKR" "\0" "EUC-KR" "\0"
    244 	   "Big5" "\0" "BIG5" "\0"
    245 	   "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
    246 	   "GBK" "\0" "GBK" "\0"
    247 	   "GB18030" "\0" "GB18030" "\0"
    248 	   "SJIS" "\0" "SHIFT_JIS" "\0"
    249 	   "ARMSCII-8" "\0" "ARMSCII-8" "\0"
    250 	   "PT154" "\0" "PT154" "\0"
    251 	 /*"ISCII-DEV" "\0" "?" "\0"*/
    252 	   "*" "\0" "UTF-8" "\0";
    253 # endif
    254 
    255 # if defined VMS
    256       /* To avoid the troubles of an extra file charset.alias_vms in the
    257 	 sources of many GNU packages, simply inline the aliases here.  */
    258       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
    259 	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
    260 	 section 10.7 "Handling Different Character Sets".  */
    261       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
    262 	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
    263 	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
    264 	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
    265 	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
    266 	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
    267 	   /* Japanese */
    268 	   "eucJP" "\0" "EUC-JP" "\0"
    269 	   "SJIS" "\0" "SHIFT_JIS" "\0"
    270 	   "DECKANJI" "\0" "DEC-KANJI" "\0"
    271 	   "SDECKANJI" "\0" "EUC-JP" "\0"
    272 	   /* Chinese */
    273 	   "eucTW" "\0" "EUC-TW" "\0"
    274 	   "DECHANYU" "\0" "DEC-HANYU" "\0"
    275 	   "DECHANZI" "\0" "GB2312" "\0"
    276 	   /* Korean */
    277 	   "DECKOREAN" "\0" "EUC-KR" "\0";
    278 # endif
    279 
    280 # if defined WIN32_NATIVE || defined __CYGWIN__
    281       /* To avoid the troubles of installing a separate file in the same
    282 	 directory as the DLL and of retrieving the DLL's directory at
    283 	 runtime, simply inline the aliases here.  */
    284 
    285       cp = "CP936" "\0" "GBK" "\0"
    286 	   "CP1361" "\0" "JOHAB" "\0"
    287 	   "CP20127" "\0" "ASCII" "\0"
    288 	   "CP20866" "\0" "KOI8-R" "\0"
    289 	   "CP20936" "\0" "GB2312" "\0"
    290 	   "CP21866" "\0" "KOI8-RU" "\0"
    291 	   "CP28591" "\0" "ISO-8859-1" "\0"
    292 	   "CP28592" "\0" "ISO-8859-2" "\0"
    293 	   "CP28593" "\0" "ISO-8859-3" "\0"
    294 	   "CP28594" "\0" "ISO-8859-4" "\0"
    295 	   "CP28595" "\0" "ISO-8859-5" "\0"
    296 	   "CP28596" "\0" "ISO-8859-6" "\0"
    297 	   "CP28597" "\0" "ISO-8859-7" "\0"
    298 	   "CP28598" "\0" "ISO-8859-8" "\0"
    299 	   "CP28599" "\0" "ISO-8859-9" "\0"
    300 	   "CP28605" "\0" "ISO-8859-15" "\0"
    301 	   "CP38598" "\0" "ISO-8859-8" "\0"
    302 	   "CP51932" "\0" "EUC-JP" "\0"
    303 	   "CP51936" "\0" "GB2312" "\0"
    304 	   "CP51949" "\0" "EUC-KR" "\0"
    305 	   "CP51950" "\0" "EUC-TW" "\0"
    306 	   "CP54936" "\0" "GB18030" "\0"
    307 	   "CP65001" "\0" "UTF-8" "\0";
    308 # endif
    309 #endif
    310 
    311       charset_aliases = cp;
    312     }
    313 
    314   return cp;
    315 }
    316 
    317 /* Determine the current locale's character encoding, and canonicalize it
    318    into one of the canonical names listed in config.charset.
    319    The result must not be freed; it is statically allocated.
    320    If the canonical name cannot be determined, the result is a non-canonical
    321    name.  */
    322 
    323 #ifdef STATIC
    324 STATIC
    325 #endif
    326 const char *
    327 locale_charset (void)
    328 {
    329   const char *codeset;
    330   const char *aliases;
    331 
    332 #if !(defined WIN32_NATIVE || defined OS2)
    333 
    334 # if HAVE_LANGINFO_CODESET
    335 
    336   /* Most systems support nl_langinfo (CODESET) nowadays.  */
    337   codeset = nl_langinfo (CODESET);
    338 
    339 #  ifdef __CYGWIN__
    340   /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
    341      returns "US-ASCII".  As long as this is not fixed, return the suffix
    342      of the locale name from the environment variables (if present) or
    343      the codepage as a number.  */
    344   if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    345     {
    346       const char *locale;
    347       static char buf[2 + 10 + 1];
    348 
    349       locale = getenv ("LC_ALL");
    350       if (locale == NULL || locale[0] == '\0')
    351 	{
    352 	  locale = getenv ("LC_CTYPE");
    353 	  if (locale == NULL || locale[0] == '\0')
    354 	    locale = getenv ("LANG");
    355 	}
    356       if (locale != NULL && locale[0] != '\0')
    357 	{
    358 	  /* If the locale name contains an encoding after the dot, return
    359 	     it.  */
    360 	  const char *dot = strchr (locale, '.');
    361 
    362 	  if (dot != NULL)
    363 	    {
    364 	      const char *modifier;
    365 
    366 	      dot++;
    367 	      /* Look for the possible @... trailer and remove it, if any.  */
    368 	      modifier = strchr (dot, '@');
    369 	      if (modifier == NULL)
    370 		return dot;
    371 	      if (modifier - dot < sizeof (buf))
    372 		{
    373 		  memcpy (buf, dot, modifier - dot);
    374 		  buf [modifier - dot] = '\0';
    375 		  return buf;
    376 		}
    377 	    }
    378 	}
    379 
    380       /* Woe32 has a function returning the locale's codepage as a number.  */
    381       sprintf (buf, "CP%u", GetACP ());
    382       codeset = buf;
    383     }
    384 #  endif
    385 
    386 # else
    387 
    388   /* On old systems which lack it, use setlocale or getenv.  */
    389   const char *locale = NULL;
    390 
    391   /* But most old systems don't have a complete set of locales.  Some
    392      (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
    393      use setlocale here; it would return "C" when it doesn't support the
    394      locale name the user has set.  */
    395 #  if 0
    396   locale = setlocale (LC_CTYPE, NULL);
    397 #  endif
    398   if (locale == NULL || locale[0] == '\0')
    399     {
    400       locale = getenv ("LC_ALL");
    401       if (locale == NULL || locale[0] == '\0')
    402 	{
    403 	  locale = getenv ("LC_CTYPE");
    404 	  if (locale == NULL || locale[0] == '\0')
    405 	    locale = getenv ("LANG");
    406 	}
    407     }
    408 
    409   /* On some old systems, one used to set locale = "iso8859_1". On others,
    410      you set it to "language_COUNTRY.charset". In any case, we resolve it
    411      through the charset.alias file.  */
    412   codeset = locale;
    413 
    414 # endif
    415 
    416 #elif defined WIN32_NATIVE
    417 
    418   static char buf[2 + 10 + 1];
    419 
    420   /* Woe32 has a function returning the locale's codepage as a number.  */
    421   sprintf (buf, "CP%u", GetACP ());
    422   codeset = buf;
    423 
    424 #elif defined OS2
    425 
    426   const char *locale;
    427   static char buf[2 + 10 + 1];
    428   ULONG cp[3];
    429   ULONG cplen;
    430 
    431   /* Allow user to override the codeset, as set in the operating system,
    432      with standard language environment variables.  */
    433   locale = getenv ("LC_ALL");
    434   if (locale == NULL || locale[0] == '\0')
    435     {
    436       locale = getenv ("LC_CTYPE");
    437       if (locale == NULL || locale[0] == '\0')
    438 	locale = getenv ("LANG");
    439     }
    440   if (locale != NULL && locale[0] != '\0')
    441     {
    442       /* If the locale name contains an encoding after the dot, return it.  */
    443       const char *dot = strchr (locale, '.');
    444 
    445       if (dot != NULL)
    446 	{
    447 	  const char *modifier;
    448 
    449 	  dot++;
    450 	  /* Look for the possible @... trailer and remove it, if any.  */
    451 	  modifier = strchr (dot, '@');
    452 	  if (modifier == NULL)
    453 	    return dot;
    454 	  if (modifier - dot < sizeof (buf))
    455 	    {
    456 	      memcpy (buf, dot, modifier - dot);
    457 	      buf [modifier - dot] = '\0';
    458 	      return buf;
    459 	    }
    460 	}
    461 
    462       /* Resolve through the charset.alias file.  */
    463       codeset = locale;
    464     }
    465   else
    466     {
    467       /* OS/2 has a function returning the locale's codepage as a number.  */
    468       if (DosQueryCp (sizeof (cp), cp, &cplen))
    469 	codeset = "";
    470       else
    471 	{
    472 	  sprintf (buf, "CP%u", cp[0]);
    473 	  codeset = buf;
    474 	}
    475     }
    476 
    477 #endif
    478 
    479   if (codeset == NULL)
    480     /* The canonical name cannot be determined.  */
    481     codeset = "";
    482 
    483   /* Resolve alias. */
    484   for (aliases = get_charset_aliases ();
    485        *aliases != '\0';
    486        aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    487     if (strcmp (codeset, aliases) == 0
    488 	|| (aliases[0] == '*' && aliases[1] == '\0'))
    489       {
    490 	codeset = aliases + strlen (aliases) + 1;
    491 	break;
    492       }
    493 
    494   /* Don't return an empty string.  GNU libc and GNU libiconv interpret
    495      the empty string as denoting "the locale's character encoding",
    496      thus GNU libiconv would call this function a second time.  */
    497   if (codeset[0] == '\0')
    498     codeset = "ASCII";
    499 
    500   return codeset;
    501 }
    502