Home | History | Annotate | Download | only in lib
      1 /* Determine a canonical name for the current locale's character encoding.
      2 
      3    Copyright (C) 2000-2006, 2008-2012 Free Software Foundation, Inc.
      4 
      5    This program is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3, or (at your option)
      8    any later version.
      9 
     10    This program is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    You should have received a copy of the GNU General Public License along
     16    with this program; if not, see <http://www.gnu.org/licenses/>.  */
     17 
     18 /* Written by Bruno Haible <bruno (at) clisp.org>.  */
     19 
     20 #include <config.h>
     21 
     22 /* Specification.  */
     23 #include "localcharset.h"
     24 
     25 #include <fcntl.h>
     26 #include <stddef.h>
     27 #include <stdio.h>
     28 #include <string.h>
     29 #include <stdlib.h>
     30 
     31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
     32 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
     33 #endif
     34 
     35 #if defined _WIN32 || defined __WIN32__
     36 # define WINDOWS_NATIVE
     37 #endif
     38 
     39 #if defined __EMX__
     40 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
     41 # ifndef OS2
     42 #  define OS2
     43 # endif
     44 #endif
     45 
     46 #if !defined WINDOWS_NATIVE
     47 # include <unistd.h>
     48 # if HAVE_LANGINFO_CODESET
     49 #  include <langinfo.h>
     50 # else
     51 #  if 0 /* see comment below */
     52 #   include <locale.h>
     53 #  endif
     54 # endif
     55 # ifdef __CYGWIN__
     56 #  define WIN32_LEAN_AND_MEAN
     57 #  include <windows.h>
     58 # endif
     59 #elif defined WINDOWS_NATIVE
     60 # define WIN32_LEAN_AND_MEAN
     61 # include <windows.h>
     62 #endif
     63 #if defined OS2
     64 # define INCL_DOS
     65 # include <os2.h>
     66 #endif
     67 
     68 #if ENABLE_RELOCATABLE
     69 # include "relocatable.h"
     70 #else
     71 # define relocate(pathname) (pathname)
     72 #endif
     73 
     74 /* Get LIBDIR.  */
     75 #ifndef LIBDIR
     76 # include "configmake.h"
     77 #endif
     78 
     79 /* Define O_NOFOLLOW to 0 on platforms where it does not exist.  */
     80 #ifndef O_NOFOLLOW
     81 # define O_NOFOLLOW 0
     82 #endif
     83 
     84 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
     85   /* Native Windows, Cygwin, OS/2, DOS */
     86 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
     87 #endif
     88 
     89 #ifndef DIRECTORY_SEPARATOR
     90 # define DIRECTORY_SEPARATOR '/'
     91 #endif
     92 
     93 #ifndef ISSLASH
     94 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
     95 #endif
     96 
     97 #if HAVE_DECL_GETC_UNLOCKED
     98 # undef getc
     99 # define getc getc_unlocked
    100 #endif
    101 
    102 /* The following static variable is declared 'volatile' to avoid a
    103    possible multithread problem in the function get_charset_aliases. If we
    104    are running in a threaded environment, and if two threads initialize
    105    'charset_aliases' simultaneously, both will produce the same value,
    106    and everything will be ok if the two assignments to 'charset_aliases'
    107    are atomic. But I don't know what will happen if the two assignments mix.  */
    108 #if __STDC__ != 1
    109 # define volatile /* empty */
    110 #endif
    111 /* Pointer to the contents of the charset.alias file, if it has already been
    112    read, else NULL.  Its format is:
    113    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
    114 static const char * volatile charset_aliases;
    115 
    116 /* Return a pointer to the contents of the charset.alias file.  */
    117 static const char *
    118 get_charset_aliases (void)
    119 {
    120   const char *cp;
    121 
    122   cp = charset_aliases;
    123   if (cp == NULL)
    124     {
    125 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
    126       const char *dir;
    127       const char *base = "charset.alias";
    128       char *file_name;
    129 
    130       /* Make it possible to override the charset.alias location.  This is
    131          necessary for running the testsuite before "make install".  */
    132       dir = getenv ("CHARSETALIASDIR");
    133       if (dir == NULL || dir[0] == '\0')
    134         dir = relocate (LIBDIR);
    135 
    136       /* Concatenate dir and base into freshly allocated file_name.  */
    137       {
    138         size_t dir_len = strlen (dir);
    139         size_t base_len = strlen (base);
    140         int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
    141         file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
    142         if (file_name != NULL)
    143           {
    144             memcpy (file_name, dir, dir_len);
    145             if (add_slash)
    146               file_name[dir_len] = DIRECTORY_SEPARATOR;
    147             memcpy (file_name + dir_len + add_slash, base, base_len + 1);
    148           }
    149       }
    150 
    151       if (file_name == NULL)
    152         /* Out of memory.  Treat the file as empty.  */
    153         cp = "";
    154       else
    155         {
    156           int fd;
    157 
    158           /* Open the file.  Reject symbolic links on platforms that support
    159              O_NOFOLLOW.  This is a security feature.  Without it, an attacker
    160              could retrieve parts of the contents (namely, the tail of the
    161              first line that starts with "* ") of an arbitrary file by placing
    162              a symbolic link to that file under the name "charset.alias" in
    163              some writable directory and defining the environment variable
    164              CHARSETALIASDIR to point to that directory.  */
    165           fd = open (file_name,
    166                      O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
    167           if (fd < 0)
    168             /* File not found.  Treat it as empty.  */
    169             cp = "";
    170           else
    171             {
    172               FILE *fp;
    173 
    174               fp = fdopen (fd, "r");
    175               if (fp == NULL)
    176                 {
    177                   /* Out of memory.  Treat the file as empty.  */
    178                   close (fd);
    179                   cp = "";
    180                 }
    181               else
    182                 {
    183                   /* Parse the file's contents.  */
    184                   char *res_ptr = NULL;
    185                   size_t res_size = 0;
    186 
    187                   for (;;)
    188                     {
    189                       int c;
    190                       char buf1[50+1];
    191                       char buf2[50+1];
    192                       size_t l1, l2;
    193                       char *old_res_ptr;
    194 
    195                       c = getc (fp);
    196                       if (c == EOF)
    197                         break;
    198                       if (c == '\n' || c == ' ' || c == '\t')
    199                         continue;
    200                       if (c == '#')
    201                         {
    202                           /* Skip comment, to end of line.  */
    203                           do
    204                             c = getc (fp);
    205                           while (!(c == EOF || c == '\n'));
    206                           if (c == EOF)
    207                             break;
    208                           continue;
    209                         }
    210                       ungetc (c, fp);
    211                       if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
    212                         break;
    213                       l1 = strlen (buf1);
    214                       l2 = strlen (buf2);
    215                       old_res_ptr = res_ptr;
    216                       if (res_size == 0)
    217                         {
    218                           res_size = l1 + 1 + l2 + 1;
    219                           res_ptr = (char *) malloc (res_size + 1);
    220                         }
    221                       else
    222                         {
    223                           res_size += l1 + 1 + l2 + 1;
    224                           res_ptr = (char *) realloc (res_ptr, res_size + 1);
    225                         }
    226                       if (res_ptr == NULL)
    227                         {
    228                           /* Out of memory. */
    229                           res_size = 0;
    230                           free (old_res_ptr);
    231                           break;
    232                         }
    233                       strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
    234                       strcpy (res_ptr + res_size - (l2 + 1), buf2);
    235                     }
    236                   fclose (fp);
    237                   if (res_size == 0)
    238                     cp = "";
    239                   else
    240                     {
    241                       *(res_ptr + res_size) = '\0';
    242                       cp = res_ptr;
    243                     }
    244                 }
    245             }
    246 
    247           free (file_name);
    248         }
    249 
    250 #else
    251 
    252 # if defined DARWIN7
    253       /* To avoid the trouble of installing a file that is shared by many
    254          GNU packages -- many packaging systems have problems with this --,
    255          simply inline the aliases here.  */
    256       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
    257            "ISO8859-2" "\0" "ISO-8859-2" "\0"
    258            "ISO8859-4" "\0" "ISO-8859-4" "\0"
    259            "ISO8859-5" "\0" "ISO-8859-5" "\0"
    260            "ISO8859-7" "\0" "ISO-8859-7" "\0"
    261            "ISO8859-9" "\0" "ISO-8859-9" "\0"
    262            "ISO8859-13" "\0" "ISO-8859-13" "\0"
    263            "ISO8859-15" "\0" "ISO-8859-15" "\0"
    264            "KOI8-R" "\0" "KOI8-R" "\0"
    265            "KOI8-U" "\0" "KOI8-U" "\0"
    266            "CP866" "\0" "CP866" "\0"
    267            "CP949" "\0" "CP949" "\0"
    268            "CP1131" "\0" "CP1131" "\0"
    269            "CP1251" "\0" "CP1251" "\0"
    270            "eucCN" "\0" "GB2312" "\0"
    271            "GB2312" "\0" "GB2312" "\0"
    272            "eucJP" "\0" "EUC-JP" "\0"
    273            "eucKR" "\0" "EUC-KR" "\0"
    274            "Big5" "\0" "BIG5" "\0"
    275            "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
    276            "GBK" "\0" "GBK" "\0"
    277            "GB18030" "\0" "GB18030" "\0"
    278            "SJIS" "\0" "SHIFT_JIS" "\0"
    279            "ARMSCII-8" "\0" "ARMSCII-8" "\0"
    280            "PT154" "\0" "PT154" "\0"
    281          /*"ISCII-DEV" "\0" "?" "\0"*/
    282            "*" "\0" "UTF-8" "\0";
    283 # endif
    284 
    285 # if defined VMS
    286       /* To avoid the troubles of an extra file charset.alias_vms in the
    287          sources of many GNU packages, simply inline the aliases here.  */
    288       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
    289          "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
    290          section 10.7 "Handling Different Character Sets".  */
    291       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
    292            "ISO8859-2" "\0" "ISO-8859-2" "\0"
    293            "ISO8859-5" "\0" "ISO-8859-5" "\0"
    294            "ISO8859-7" "\0" "ISO-8859-7" "\0"
    295            "ISO8859-8" "\0" "ISO-8859-8" "\0"
    296            "ISO8859-9" "\0" "ISO-8859-9" "\0"
    297            /* Japanese */
    298            "eucJP" "\0" "EUC-JP" "\0"
    299            "SJIS" "\0" "SHIFT_JIS" "\0"
    300            "DECKANJI" "\0" "DEC-KANJI" "\0"
    301            "SDECKANJI" "\0" "EUC-JP" "\0"
    302            /* Chinese */
    303            "eucTW" "\0" "EUC-TW" "\0"
    304            "DECHANYU" "\0" "DEC-HANYU" "\0"
    305            "DECHANZI" "\0" "GB2312" "\0"
    306            /* Korean */
    307            "DECKOREAN" "\0" "EUC-KR" "\0";
    308 # endif
    309 
    310 # if defined WINDOWS_NATIVE || defined __CYGWIN__
    311       /* To avoid the troubles of installing a separate file in the same
    312          directory as the DLL and of retrieving the DLL's directory at
    313          runtime, simply inline the aliases here.  */
    314 
    315       cp = "CP936" "\0" "GBK" "\0"
    316            "CP1361" "\0" "JOHAB" "\0"
    317            "CP20127" "\0" "ASCII" "\0"
    318            "CP20866" "\0" "KOI8-R" "\0"
    319            "CP20936" "\0" "GB2312" "\0"
    320            "CP21866" "\0" "KOI8-RU" "\0"
    321            "CP28591" "\0" "ISO-8859-1" "\0"
    322            "CP28592" "\0" "ISO-8859-2" "\0"
    323            "CP28593" "\0" "ISO-8859-3" "\0"
    324            "CP28594" "\0" "ISO-8859-4" "\0"
    325            "CP28595" "\0" "ISO-8859-5" "\0"
    326            "CP28596" "\0" "ISO-8859-6" "\0"
    327            "CP28597" "\0" "ISO-8859-7" "\0"
    328            "CP28598" "\0" "ISO-8859-8" "\0"
    329            "CP28599" "\0" "ISO-8859-9" "\0"
    330            "CP28605" "\0" "ISO-8859-15" "\0"
    331            "CP38598" "\0" "ISO-8859-8" "\0"
    332            "CP51932" "\0" "EUC-JP" "\0"
    333            "CP51936" "\0" "GB2312" "\0"
    334            "CP51949" "\0" "EUC-KR" "\0"
    335            "CP51950" "\0" "EUC-TW" "\0"
    336            "CP54936" "\0" "GB18030" "\0"
    337            "CP65001" "\0" "UTF-8" "\0";
    338 # endif
    339 #endif
    340 
    341       charset_aliases = cp;
    342     }
    343 
    344   return cp;
    345 }
    346 
    347 /* Determine the current locale's character encoding, and canonicalize it
    348    into one of the canonical names listed in config.charset.
    349    The result must not be freed; it is statically allocated.
    350    If the canonical name cannot be determined, the result is a non-canonical
    351    name.  */
    352 
    353 #ifdef STATIC
    354 STATIC
    355 #endif
    356 const char *
    357 locale_charset (void)
    358 {
    359   const char *codeset;
    360   const char *aliases;
    361 
    362 #if !(defined WINDOWS_NATIVE || defined OS2)
    363 
    364 # if HAVE_LANGINFO_CODESET
    365 
    366   /* Most systems support nl_langinfo (CODESET) nowadays.  */
    367   codeset = nl_langinfo (CODESET);
    368 
    369 #  ifdef __CYGWIN__
    370   /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
    371      returns "US-ASCII".  Return the suffix of the locale name from the
    372      environment variables (if present) or the codepage as a number.  */
    373   if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    374     {
    375       const char *locale;
    376       static char buf[2 + 10 + 1];
    377 
    378       locale = getenv ("LC_ALL");
    379       if (locale == NULL || locale[0] == '\0')
    380         {
    381           locale = getenv ("LC_CTYPE");
    382           if (locale == NULL || locale[0] == '\0')
    383             locale = getenv ("LANG");
    384         }
    385       if (locale != NULL && locale[0] != '\0')
    386         {
    387           /* If the locale name contains an encoding after the dot, return
    388              it.  */
    389           const char *dot = strchr (locale, '.');
    390 
    391           if (dot != NULL)
    392             {
    393               const char *modifier;
    394 
    395               dot++;
    396               /* Look for the possible @... trailer and remove it, if any.  */
    397               modifier = strchr (dot, '@');
    398               if (modifier == NULL)
    399                 return dot;
    400               if (modifier - dot < sizeof (buf))
    401                 {
    402                   memcpy (buf, dot, modifier - dot);
    403                   buf [modifier - dot] = '\0';
    404                   return buf;
    405                 }
    406             }
    407         }
    408 
    409       /* The Windows API has a function returning the locale's codepage as a
    410          number: GetACP().  This encoding is used by Cygwin, unless the user
    411          has set the environment variable CYGWIN=codepage:oem (which very few
    412          people do).
    413          Output directed to console windows needs to be converted (to
    414          GetOEMCP() if the console is using a raster font, or to
    415          GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
    416          this conversion transparently (see winsup/cygwin/fhandler_console.cc),
    417          converting to GetConsoleOutputCP().  This leads to correct results,
    418          except when SetConsoleOutputCP has been called and a raster font is
    419          in use.  */
    420       sprintf (buf, "CP%u", GetACP ());
    421       codeset = buf;
    422     }
    423 #  endif
    424 
    425 # else
    426 
    427   /* On old systems which lack it, use setlocale or getenv.  */
    428   const char *locale = NULL;
    429 
    430   /* But most old systems don't have a complete set of locales.  Some
    431      (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
    432      use setlocale here; it would return "C" when it doesn't support the
    433      locale name the user has set.  */
    434 #  if 0
    435   locale = setlocale (LC_CTYPE, NULL);
    436 #  endif
    437   if (locale == NULL || locale[0] == '\0')
    438     {
    439       locale = getenv ("LC_ALL");
    440       if (locale == NULL || locale[0] == '\0')
    441         {
    442           locale = getenv ("LC_CTYPE");
    443           if (locale == NULL || locale[0] == '\0')
    444             locale = getenv ("LANG");
    445         }
    446     }
    447 
    448   /* On some old systems, one used to set locale = "iso8859_1". On others,
    449      you set it to "language_COUNTRY.charset". In any case, we resolve it
    450      through the charset.alias file.  */
    451   codeset = locale;
    452 
    453 # endif
    454 
    455 #elif defined WINDOWS_NATIVE
    456 
    457   static char buf[2 + 10 + 1];
    458 
    459   /* The Windows API has a function returning the locale's codepage as a
    460      number: GetACP().
    461      When the output goes to a console window, it needs to be provided in
    462      GetOEMCP() encoding if the console is using a raster font, or in
    463      GetConsoleOutputCP() encoding if it is using a TrueType font.
    464      But in GUI programs and for output sent to files and pipes, GetACP()
    465      encoding is the best bet.  */
    466   sprintf (buf, "CP%u", GetACP ());
    467   codeset = buf;
    468 
    469 #elif defined OS2
    470 
    471   const char *locale;
    472   static char buf[2 + 10 + 1];
    473   ULONG cp[3];
    474   ULONG cplen;
    475 
    476   /* Allow user to override the codeset, as set in the operating system,
    477      with standard language environment variables.  */
    478   locale = getenv ("LC_ALL");
    479   if (locale == NULL || locale[0] == '\0')
    480     {
    481       locale = getenv ("LC_CTYPE");
    482       if (locale == NULL || locale[0] == '\0')
    483         locale = getenv ("LANG");
    484     }
    485   if (locale != NULL && locale[0] != '\0')
    486     {
    487       /* If the locale name contains an encoding after the dot, return it.  */
    488       const char *dot = strchr (locale, '.');
    489 
    490       if (dot != NULL)
    491         {
    492           const char *modifier;
    493 
    494           dot++;
    495           /* Look for the possible @... trailer and remove it, if any.  */
    496           modifier = strchr (dot, '@');
    497           if (modifier == NULL)
    498             return dot;
    499           if (modifier - dot < sizeof (buf))
    500             {
    501               memcpy (buf, dot, modifier - dot);
    502               buf [modifier - dot] = '\0';
    503               return buf;
    504             }
    505         }
    506 
    507       /* Resolve through the charset.alias file.  */
    508       codeset = locale;
    509     }
    510   else
    511     {
    512       /* OS/2 has a function returning the locale's codepage as a number.  */
    513       if (DosQueryCp (sizeof (cp), cp, &cplen))
    514         codeset = "";
    515       else
    516         {
    517           sprintf (buf, "CP%u", cp[0]);
    518           codeset = buf;
    519         }
    520     }
    521 
    522 #endif
    523 
    524   if (codeset == NULL)
    525     /* The canonical name cannot be determined.  */
    526     codeset = "";
    527 
    528   /* Resolve alias. */
    529   for (aliases = get_charset_aliases ();
    530        *aliases != '\0';
    531        aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    532     if (strcmp (codeset, aliases) == 0
    533         || (aliases[0] == '*' && aliases[1] == '\0'))
    534       {
    535         codeset = aliases + strlen (aliases) + 1;
    536         break;
    537       }
    538 
    539   /* Don't return an empty string.  GNU libc and GNU libiconv interpret
    540      the empty string as denoting "the locale's character encoding",
    541      thus GNU libiconv would call this function a second time.  */
    542   if (codeset[0] == '\0')
    543     codeset = "ASCII";
    544 
    545 #ifdef DARWIN7
    546   /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
    547      (the default codeset) does not work when MB_CUR_MAX is 1.  */
    548   if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX <= 1)
    549     codeset = "ASCII";
    550 #endif
    551 
    552   return codeset;
    553 }
    554