/* Home | History | Annotate | Download | only in lib */
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2009 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
     17 
     18 #include <config.h>
     19 
     20 /* Specification.  */
     21 #include <wchar.h>
     22 
     23 #if GNULIB_defined_mbstate_t
     24 /* Implement mbrtowc() on top of mbtowc().  */
     25 
     26 # include <errno.h>
     27 # include <stdlib.h>
     28 
     29 # include "localcharset.h"
     30 # include "streq.h"
     31 # include "verify.h"
     32 
     33 
     34 verify (sizeof (mbstate_t) >= 4);
     35 
     36 static char internal_state[4];
     37 
     38 size_t
     39 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
     40 {
     41   char *pstate = (char *)ps;
     42 
     43   if (pstate == NULL)
     44     pstate = internal_state;
     45 
     46   if (s == NULL)
     47     {
     48       pwc = NULL;
     49       s = "";
     50       n = 1;
     51     }
     52 
     53   if (n == 0)
     54     return (size_t)(-2);
     55 
     56   /* Here n > 0.  */
     57   {
     58     size_t nstate = pstate[0];
     59     char buf[4];
     60     const char *p;
     61     size_t m;
     62 
     63     switch (nstate)
     64       {
     65       case 0:
     66 	p = s;
     67 	m = n;
     68 	break;
     69       case 3:
     70 	buf[2] = pstate[3];
     71 	/*FALLTHROUGH*/
     72       case 2:
     73 	buf[1] = pstate[2];
     74 	/*FALLTHROUGH*/
     75       case 1:
     76 	buf[0] = pstate[1];
     77 	p = buf;
     78 	m = nstate;
     79 	buf[m++] = s[0];
     80 	if (n >= 2 && m < 4)
     81 	  {
     82 	    buf[m++] = s[1];
     83 	    if (n >= 3 && m < 4)
     84 	      buf[m++] = s[2];
     85 	  }
     86 	break;
     87       default:
     88 	errno = EINVAL;
     89 	return (size_t)(-1);
     90       }
     91 
     92     /* Here m > 0.  */
     93 
     94 # if __GLIBC__
     95     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
     96     mbtowc (NULL, NULL, 0);
     97 # endif
     98     {
     99       int res = mbtowc (pwc, p, m);
    100 
    101       if (res >= 0)
    102 	{
    103 	  if (pwc != NULL && ((*pwc == 0) != (res == 0)))
    104 	    abort ();
    105 	  if (nstate >= (res > 0 ? res : 1))
    106 	    abort ();
    107 	  res -= nstate;
    108 	  pstate[0] = 0;
    109 	  return res;
    110 	}
    111 
    112       /* mbtowc does not distinguish between invalid and incomplete multibyte
    113 	 sequences.  But mbrtowc needs to make this distinction.
    114 	 There are two possible approaches:
    115 	   - Use iconv() and its return value.
    116 	   - Use built-in knowledge about the possible encodings.
    117 	 Given the low quality of implementation of iconv() on the systems that
    118 	 lack mbrtowc(), we use the second approach.
    119 	 The possible encodings are:
    120 	   - 8-bit encodings,
    121 	   - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
    122 	   - UTF-8.
    123 	 Use specialized code for each.  */
    124       if (m >= 4 || m >= MB_CUR_MAX)
    125 	goto invalid;
    126       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
    127       {
    128 	const char *encoding = locale_charset ();
    129 
    130 	if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
    131 	  {
    132 	    /* Cf. unistr/u8-mblen.c.  */
    133 	    unsigned char c = (unsigned char) p[0];
    134 
    135 	    if (c >= 0xc2)
    136 	      {
    137 		if (c < 0xe0)
    138 		  {
    139 		    if (m == 1)
    140 		      goto incomplete;
    141 		  }
    142 		else if (c < 0xf0)
    143 		  {
    144 		    if (m == 1)
    145 		      goto incomplete;
    146 		    if (m == 2)
    147 		      {
    148 			unsigned char c2 = (unsigned char) p[1];
    149 
    150 			if ((c2 ^ 0x80) < 0x40
    151 			    && (c >= 0xe1 || c2 >= 0xa0)
    152 			    && (c != 0xed || c2 < 0xa0))
    153 			  goto incomplete;
    154 		      }
    155 		  }
    156 		else if (c <= 0xf4)
    157 		  {
    158 		    if (m == 1)
    159 		      goto incomplete;
    160 		    else /* m == 2 || m == 3 */
    161 		      {
    162 			unsigned char c2 = (unsigned char) p[1];
    163 
    164 			if ((c2 ^ 0x80) < 0x40
    165 			    && (c >= 0xf1 || c2 >= 0x90)
    166 			    && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
    167 			  {
    168 			    if (m == 2)
    169 			      goto incomplete;
    170 			    else /* m == 3 */
    171 			      {
    172 				unsigned char c3 = (unsigned char) p[2];
    173 
    174 				if ((c3 ^ 0x80) < 0x40)
    175 				  goto incomplete;
    176 			      }
    177 			  }
    178 		      }
    179 		  }
    180 	      }
    181 	    goto invalid;
    182 	  }
    183 
    184 	/* As a reference for this code, you can use the GNU libiconv
    185 	   implementation.  Look for uses of the RET_TOOFEW macro.  */
    186 
    187 	if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
    188 	  {
    189 	    if (m == 1)
    190 	      {
    191 		unsigned char c = (unsigned char) p[0];
    192 
    193 		if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
    194 		  goto incomplete;
    195 	      }
    196 	    if (m == 2)
    197 	      {
    198 		unsigned char c = (unsigned char) p[0];
    199 
    200 		if (c == 0x8f)
    201 		  {
    202 		    unsigned char c2 = (unsigned char) p[1];
    203 
    204 		    if (c2 >= 0xa1 && c2 < 0xff)
    205 		      goto incomplete;
    206 		  }
    207 	      }
    208 	    goto invalid;
    209 	  }
    210 	if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
    211 	    || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
    212 	    || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
    213 	  {
    214 	    if (m == 1)
    215 	      {
    216 		unsigned char c = (unsigned char) p[0];
    217 
    218 		if (c >= 0xa1 && c < 0xff)
    219 		  goto incomplete;
    220 	      }
    221 	    goto invalid;
    222 	  }
    223 	if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
    224 	  {
    225 	    if (m == 1)
    226 	      {
    227 		unsigned char c = (unsigned char) p[0];
    228 
    229 		if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
    230 		  goto incomplete;
    231 	      }
    232 	    else /* m == 2 || m == 3 */
    233 	      {
    234 		unsigned char c = (unsigned char) p[0];
    235 
    236 		if (c == 0x8e)
    237 		  goto incomplete;
    238 	      }
    239 	    goto invalid;
    240 	  }
    241 	if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
    242 	  {
    243 	    if (m == 1)
    244 	      {
    245 		unsigned char c = (unsigned char) p[0];
    246 
    247 		if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
    248 		  goto incomplete;
    249 	      }
    250 	    else /* m == 2 || m == 3 */
    251 	      {
    252 		unsigned char c = (unsigned char) p[0];
    253 
    254 		if (c >= 0x90 && c <= 0xe3)
    255 		  {
    256 		    unsigned char c2 = (unsigned char) p[1];
    257 
    258 		    if (c2 >= 0x30 && c2 <= 0x39)
    259 		      {
    260 			if (m == 2)
    261 			  goto incomplete;
    262 			else /* m == 3 */
    263 			  {
    264 			    unsigned char c3 = (unsigned char) p[2];
    265 
    266 			    if (c3 >= 0x81 && c3 <= 0xfe)
    267 			      goto incomplete;
    268 			  }
    269 		      }
    270 		  }
    271 	      }
    272 	    goto invalid;
    273 	  }
    274 	if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
    275 	  {
    276 	    if (m == 1)
    277 	      {
    278 		unsigned char c = (unsigned char) p[0];
    279 
    280 		if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
    281 		    || (c >= 0xf0 && c <= 0xf9))
    282 		  goto incomplete;
    283 	      }
    284 	    goto invalid;
    285 	  }
    286 
    287 	/* An unknown multibyte encoding.  */
    288 	goto incomplete;
    289       }
    290 
    291      incomplete:
    292       {
    293 	size_t k = nstate;
    294 	/* Here 0 <= k < m < 4.  */
    295 	pstate[++k] = s[0];
    296 	if (k < m)
    297 	  {
    298 	    pstate[++k] = s[1];
    299 	    if (k < m)
    300 	      pstate[++k] = s[2];
    301 	  }
    302 	if (k != m)
    303 	  abort ();
    304       }
    305       pstate[0] = m;
    306       return (size_t)(-2);
    307 
    308      invalid:
    309       errno = EILSEQ;
    310       /* The conversion state is undefined, says POSIX.  */
    311       return (size_t)(-1);
    312     }
    313   }
    314 }
    315 
    316 #else
    317 /* Override the system's mbrtowc() function.  */
    318 
    319 # undef mbrtowc
    320 
    321 size_t
    322 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
    323 {
    324 # if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
    325   if (s == NULL)
    326     {
    327       pwc = NULL;
    328       s = "";
    329       n = 1;
    330     }
    331 # endif
    332 
    333 # if MBRTOWC_RETVAL_BUG
    334   {
    335     static mbstate_t internal_state;
    336 
    337     /* Override mbrtowc's internal state.  We can not call mbsinit() on the
    338        hidden internal state, but we can call it on our variable.  */
    339     if (ps == NULL)
    340       ps = &internal_state;
    341 
    342     if (!mbsinit (ps))
    343       {
    344 	/* Parse the rest of the multibyte character byte for byte.  */
    345 	size_t count = 0;
    346 	for (; n > 0; s++, n--)
    347 	  {
    348 	    wchar_t wc;
    349 	    size_t ret = mbrtowc (&wc, s, 1, ps);
    350 
    351 	    if (ret == (size_t)(-1))
    352 	      return (size_t)(-1);
    353 	    count++;
    354 	    if (ret != (size_t)(-2))
    355 	      {
    356 		/* The multibyte character has been completed.  */
    357 		if (pwc != NULL)
    358 		  *pwc = wc;
    359 		return (wc == 0 ? 0 : count);
    360 	      }
    361 	  }
    362 	return (size_t)(-2);
    363       }
    364   }
    365 # endif
    366 
    367 # if MBRTOWC_NUL_RETVAL_BUG
    368   {
    369     wchar_t wc;
    370     size_t ret = mbrtowc (&wc, s, n, ps);
    371 
    372     if (ret != (size_t)(-1) && ret != (size_t)(-2))
    373       {
    374 	if (pwc != NULL)
    375 	  *pwc = wc;
    376 	if (wc == 0)
    377 	  ret = 0;
    378       }
    379     return ret;
    380   }
    381 # else
    382   return mbrtowc (pwc, s, n, ps);
    383 # endif
    384 }
    385 
    386 #endif
    387