Home | History | Annotate | Download | only in lib
      1 /* Convert multibyte character to wide character.
      2    Copyright (C) 1999-2002, 2005-2012 Free Software Foundation, Inc.
      3    Written by Bruno Haible <bruno (at) clisp.org>, 2008.
      4 
      5    This program is free software: you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3 of the License, or
      8    (at your option) any later version.
      9 
     10    This program is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    You should have received a copy of the GNU General Public License
     16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
     17 
     18 #include <config.h>
     19 
     20 /* Specification.  */
     21 #include <wchar.h>
     22 
     23 #if GNULIB_defined_mbstate_t
     24 /* Implement mbrtowc() on top of mbtowc().  */
     25 
     26 # include <errno.h>
     27 # include <stdlib.h>
     28 
     29 # include "localcharset.h"
     30 # include "streq.h"
     31 # include "verify.h"
     32 
     33 
     34 verify (sizeof (mbstate_t) >= 4);
     35 
     36 static char internal_state[4];
     37 
     38 size_t
     39 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
     40 {
     41   char *pstate = (char *)ps;
     42 
     43   if (s == NULL)
     44     {
     45       pwc = NULL;
     46       s = "";
     47       n = 1;
     48     }
     49 
     50   if (n == 0)
     51     return (size_t)(-2);
     52 
     53   /* Here n > 0.  */
     54 
     55   if (pstate == NULL)
     56     pstate = internal_state;
     57 
     58   {
     59     size_t nstate = pstate[0];
     60     char buf[4];
     61     const char *p;
     62     size_t m;
     63 
     64     switch (nstate)
     65       {
     66       case 0:
     67         p = s;
     68         m = n;
     69         break;
     70       case 3:
     71         buf[2] = pstate[3];
     72         /*FALLTHROUGH*/
     73       case 2:
     74         buf[1] = pstate[2];
     75         /*FALLTHROUGH*/
     76       case 1:
     77         buf[0] = pstate[1];
     78         p = buf;
     79         m = nstate;
     80         buf[m++] = s[0];
     81         if (n >= 2 && m < 4)
     82           {
     83             buf[m++] = s[1];
     84             if (n >= 3 && m < 4)
     85               buf[m++] = s[2];
     86           }
     87         break;
     88       default:
     89         errno = EINVAL;
     90         return (size_t)(-1);
     91       }
     92 
     93     /* Here m > 0.  */
     94 
     95 # if __GLIBC__ || defined __UCLIBC__
     96     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
     97     mbtowc (NULL, NULL, 0);
     98 # endif
     99     {
    100       int res = mbtowc (pwc, p, m);
    101 
    102       if (res >= 0)
    103         {
    104           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
    105             abort ();
    106           if (nstate >= (res > 0 ? res : 1))
    107             abort ();
    108           res -= nstate;
    109           pstate[0] = 0;
    110           return res;
    111         }
    112 
    113       /* mbtowc does not distinguish between invalid and incomplete multibyte
    114          sequences.  But mbrtowc needs to make this distinction.
    115          There are two possible approaches:
    116            - Use iconv() and its return value.
    117            - Use built-in knowledge about the possible encodings.
    118          Given the low quality of implementation of iconv() on the systems that
    119          lack mbrtowc(), we use the second approach.
    120          The possible encodings are:
    121            - 8-bit encodings,
    122            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
    123            - UTF-8.
    124          Use specialized code for each.  */
    125       if (m >= 4 || m >= MB_CUR_MAX)
    126         goto invalid;
    127       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
    128       {
    129         const char *encoding = locale_charset ();
    130 
    131         if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
    132           {
    133             /* Cf. unistr/u8-mblen.c.  */
    134             unsigned char c = (unsigned char) p[0];
    135 
    136             if (c >= 0xc2)
    137               {
    138                 if (c < 0xe0)
    139                   {
    140                     if (m == 1)
    141                       goto incomplete;
    142                   }
    143                 else if (c < 0xf0)
    144                   {
    145                     if (m == 1)
    146                       goto incomplete;
    147                     if (m == 2)
    148                       {
    149                         unsigned char c2 = (unsigned char) p[1];
    150 
    151                         if ((c2 ^ 0x80) < 0x40
    152                             && (c >= 0xe1 || c2 >= 0xa0)
    153                             && (c != 0xed || c2 < 0xa0))
    154                           goto incomplete;
    155                       }
    156                   }
    157                 else if (c <= 0xf4)
    158                   {
    159                     if (m == 1)
    160                       goto incomplete;
    161                     else /* m == 2 || m == 3 */
    162                       {
    163                         unsigned char c2 = (unsigned char) p[1];
    164 
    165                         if ((c2 ^ 0x80) < 0x40
    166                             && (c >= 0xf1 || c2 >= 0x90)
    167                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
    168                           {
    169                             if (m == 2)
    170                               goto incomplete;
    171                             else /* m == 3 */
    172                               {
    173                                 unsigned char c3 = (unsigned char) p[2];
    174 
    175                                 if ((c3 ^ 0x80) < 0x40)
    176                                   goto incomplete;
    177                               }
    178                           }
    179                       }
    180                   }
    181               }
    182             goto invalid;
    183           }
    184 
    185         /* As a reference for this code, you can use the GNU libiconv
    186            implementation.  Look for uses of the RET_TOOFEW macro.  */
    187 
    188         if (STREQ_OPT (encoding,
    189                        "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
    190           {
    191             if (m == 1)
    192               {
    193                 unsigned char c = (unsigned char) p[0];
    194 
    195                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
    196                   goto incomplete;
    197               }
    198             if (m == 2)
    199               {
    200                 unsigned char c = (unsigned char) p[0];
    201 
    202                 if (c == 0x8f)
    203                   {
    204                     unsigned char c2 = (unsigned char) p[1];
    205 
    206                     if (c2 >= 0xa1 && c2 < 0xff)
    207                       goto incomplete;
    208                   }
    209               }
    210             goto invalid;
    211           }
    212         if (STREQ_OPT (encoding,
    213                        "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
    214             || STREQ_OPT (encoding,
    215                           "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
    216             || STREQ_OPT (encoding,
    217                           "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
    218           {
    219             if (m == 1)
    220               {
    221                 unsigned char c = (unsigned char) p[0];
    222 
    223                 if (c >= 0xa1 && c < 0xff)
    224                   goto incomplete;
    225               }
    226             goto invalid;
    227           }
    228         if (STREQ_OPT (encoding,
    229                        "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
    230           {
    231             if (m == 1)
    232               {
    233                 unsigned char c = (unsigned char) p[0];
    234 
    235                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
    236                   goto incomplete;
    237               }
    238             else /* m == 2 || m == 3 */
    239               {
    240                 unsigned char c = (unsigned char) p[0];
    241 
    242                 if (c == 0x8e)
    243                   goto incomplete;
    244               }
    245             goto invalid;
    246           }
    247         if (STREQ_OPT (encoding,
    248                        "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
    249           {
    250             if (m == 1)
    251               {
    252                 unsigned char c = (unsigned char) p[0];
    253 
    254                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
    255                   goto incomplete;
    256               }
    257             else /* m == 2 || m == 3 */
    258               {
    259                 unsigned char c = (unsigned char) p[0];
    260 
    261                 if (c >= 0x90 && c <= 0xe3)
    262                   {
    263                     unsigned char c2 = (unsigned char) p[1];
    264 
    265                     if (c2 >= 0x30 && c2 <= 0x39)
    266                       {
    267                         if (m == 2)
    268                           goto incomplete;
    269                         else /* m == 3 */
    270                           {
    271                             unsigned char c3 = (unsigned char) p[2];
    272 
    273                             if (c3 >= 0x81 && c3 <= 0xfe)
    274                               goto incomplete;
    275                           }
    276                       }
    277                   }
    278               }
    279             goto invalid;
    280           }
    281         if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
    282           {
    283             if (m == 1)
    284               {
    285                 unsigned char c = (unsigned char) p[0];
    286 
    287                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
    288                     || (c >= 0xf0 && c <= 0xf9))
    289                   goto incomplete;
    290               }
    291             goto invalid;
    292           }
    293 
    294         /* An unknown multibyte encoding.  */
    295         goto incomplete;
    296       }
    297 
    298      incomplete:
    299       {
    300         size_t k = nstate;
    301         /* Here 0 <= k < m < 4.  */
    302         pstate[++k] = s[0];
    303         if (k < m)
    304           {
    305             pstate[++k] = s[1];
    306             if (k < m)
    307               pstate[++k] = s[2];
    308           }
    309         if (k != m)
    310           abort ();
    311       }
    312       pstate[0] = m;
    313       return (size_t)(-2);
    314 
    315      invalid:
    316       errno = EILSEQ;
    317       /* The conversion state is undefined, says POSIX.  */
    318       return (size_t)(-1);
    319     }
    320   }
    321 }
    322 
    323 #else
    324 /* Override the system's mbrtowc() function.  */
    325 
    326 # undef mbrtowc
    327 
    328 size_t
    329 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
    330 {
    331 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG
    332   if (s == NULL)
    333     {
    334       pwc = NULL;
    335       s = "";
    336       n = 1;
    337     }
    338 # endif
    339 
    340 # if MBRTOWC_RETVAL_BUG
    341   {
    342     static mbstate_t internal_state;
    343 
    344     /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
    345        hidden internal state, but we can call it on our variable.  */
    346     if (ps == NULL)
    347       ps = &internal_state;
    348 
    349     if (!mbsinit (ps))
    350       {
    351         /* Parse the rest of the multibyte character byte for byte.  */
    352         size_t count = 0;
    353         for (; n > 0; s++, n--)
    354           {
    355             wchar_t wc;
    356             size_t ret = mbrtowc (&wc, s, 1, ps);
    357 
    358             if (ret == (size_t)(-1))
    359               return (size_t)(-1);
    360             count++;
    361             if (ret != (size_t)(-2))
    362               {
    363                 /* The multibyte character has been completed.  */
    364                 if (pwc != NULL)
    365                   *pwc = wc;
    366                 return (wc == 0 ? 0 : count);
    367               }
    368           }
    369         return (size_t)(-2);
    370       }
    371   }
    372 # endif
    373 
    374 # if MBRTOWC_NUL_RETVAL_BUG
    375   {
    376     wchar_t wc;
    377     size_t ret = mbrtowc (&wc, s, n, ps);
    378 
    379     if (ret != (size_t)(-1) && ret != (size_t)(-2))
    380       {
    381         if (pwc != NULL)
    382           *pwc = wc;
    383         if (wc == 0)
    384           ret = 0;
    385       }
    386     return ret;
    387   }
    388 # else
    389   {
    390 #   if MBRTOWC_NULL_ARG1_BUG
    391     wchar_t dummy;
    392 
    393     if (pwc == NULL)
    394       pwc = &dummy;
    395 #   endif
    396 
    397     return mbrtowc (pwc, s, n, ps);
    398   }
    399 # endif
    400 }
    401 
    402 #endif
    403