1 /* Convert multibyte character to wide character. 2 Copyright (C) 1999-2002, 2005-2009 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno (at) clisp.org>, 2008. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include <config.h> 19 20 /* Specification. */ 21 #include <wchar.h> 22 23 #if GNULIB_defined_mbstate_t 24 /* Implement mbrtowc() on top of mbtowc(). */ 25 26 # include <errno.h> 27 # include <stdlib.h> 28 29 # include "localcharset.h" 30 # include "streq.h" 31 # include "verify.h" 32 33 34 verify (sizeof (mbstate_t) >= 4); 35 36 static char internal_state[4]; 37 38 size_t 39 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 40 { 41 char *pstate = (char *)ps; 42 43 if (pstate == NULL) 44 pstate = internal_state; 45 46 if (s == NULL) 47 { 48 pwc = NULL; 49 s = ""; 50 n = 1; 51 } 52 53 if (n == 0) 54 return (size_t)(-2); 55 56 /* Here n > 0. */ 57 { 58 size_t nstate = pstate[0]; 59 char buf[4]; 60 const char *p; 61 size_t m; 62 63 switch (nstate) 64 { 65 case 0: 66 p = s; 67 m = n; 68 break; 69 case 3: 70 buf[2] = pstate[3]; 71 /*FALLTHROUGH*/ 72 case 2: 73 buf[1] = pstate[2]; 74 /*FALLTHROUGH*/ 75 case 1: 76 buf[0] = pstate[1]; 77 p = buf; 78 m = nstate; 79 buf[m++] = s[0]; 80 if (n >= 2 && m < 4) 81 { 82 buf[m++] = s[1]; 83 if (n >= 3 && m < 4) 84 buf[m++] = s[2]; 85 } 86 break; 87 default: 88 errno = EINVAL; 89 return (size_t)(-1); 90 } 91 92 /* Here m > 0. */ 93 94 # if __GLIBC__ 95 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */ 96 mbtowc (NULL, NULL, 0); 97 # endif 98 { 99 int res = mbtowc (pwc, p, m); 100 101 if (res >= 0) 102 { 103 if (pwc != NULL && ((*pwc == 0) != (res == 0))) 104 abort (); 105 if (nstate >= (res > 0 ? res : 1)) 106 abort (); 107 res -= nstate; 108 pstate[0] = 0; 109 return res; 110 } 111 112 /* mbtowc does not distinguish between invalid and incomplete multibyte 113 sequences. But mbrtowc needs to make this distinction. 114 There are two possible approaches: 115 - Use iconv() and its return value. 116 - Use built-in knowledge about the possible encodings. 117 Given the low quality of implementation of iconv() on the systems that 118 lack mbrtowc(), we use the second approach. 119 The possible encodings are: 120 - 8-bit encodings, 121 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, 122 - UTF-8. 123 Use specialized code for each. */ 124 if (m >= 4 || m >= MB_CUR_MAX) 125 goto invalid; 126 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ 127 { 128 const char *encoding = locale_charset (); 129 130 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) 131 { 132 /* Cf. unistr/u8-mblen.c. */ 133 unsigned char c = (unsigned char) p[0]; 134 135 if (c >= 0xc2) 136 { 137 if (c < 0xe0) 138 { 139 if (m == 1) 140 goto incomplete; 141 } 142 else if (c < 0xf0) 143 { 144 if (m == 1) 145 goto incomplete; 146 if (m == 2) 147 { 148 unsigned char c2 = (unsigned char) p[1]; 149 150 if ((c2 ^ 0x80) < 0x40 151 && (c >= 0xe1 || c2 >= 0xa0) 152 && (c != 0xed || c2 < 0xa0)) 153 goto incomplete; 154 } 155 } 156 else if (c <= 0xf4) 157 { 158 if (m == 1) 159 goto incomplete; 160 else /* m == 2 || m == 3 */ 161 { 162 unsigned char c2 = (unsigned char) p[1]; 163 164 if ((c2 ^ 0x80) < 0x40 165 && (c >= 0xf1 || c2 >= 0x90) 166 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) 167 { 168 if (m == 2) 169 goto incomplete; 170 else /* m == 3 */ 171 { 172 unsigned char c3 = (unsigned char) p[2]; 173 174 if ((c3 ^ 0x80) < 0x40) 175 goto incomplete; 176 } 177 } 178 } 179 } 180 } 181 goto invalid; 182 } 183 184 /* As a reference for this code, you can use the GNU libiconv 185 implementation. Look for uses of the RET_TOOFEW macro. */ 186 187 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) 188 { 189 if (m == 1) 190 { 191 unsigned char c = (unsigned char) p[0]; 192 193 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) 194 goto incomplete; 195 } 196 if (m == 2) 197 { 198 unsigned char c = (unsigned char) p[0]; 199 200 if (c == 0x8f) 201 { 202 unsigned char c2 = (unsigned char) p[1]; 203 204 if (c2 >= 0xa1 && c2 < 0xff) 205 goto incomplete; 206 } 207 } 208 goto invalid; 209 } 210 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) 211 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 212 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) 213 { 214 if (m == 1) 215 { 216 unsigned char c = (unsigned char) p[0]; 217 218 if (c >= 0xa1 && c < 0xff) 219 goto incomplete; 220 } 221 goto invalid; 222 } 223 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) 224 { 225 if (m == 1) 226 { 227 unsigned char c = (unsigned char) p[0]; 228 229 if ((c >= 0xa1 && c < 0xff) || c == 0x8e) 230 goto incomplete; 231 } 232 else /* m == 2 || m == 3 */ 233 { 234 unsigned char c = (unsigned char) p[0]; 235 236 if (c == 0x8e) 237 goto incomplete; 238 } 239 goto invalid; 240 } 241 if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 242 { 243 if (m == 1) 244 { 245 unsigned char c = (unsigned char) p[0]; 246 247 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) 248 goto incomplete; 249 } 250 else /* m == 2 || m == 3 */ 251 { 252 unsigned char c = (unsigned char) p[0]; 253 254 if (c >= 0x90 && c <= 0xe3) 255 { 256 unsigned char c2 = (unsigned char) p[1]; 257 258 if (c2 >= 0x30 && c2 <= 0x39) 259 { 260 if (m == 2) 261 goto incomplete; 262 else /* m == 3 */ 263 { 264 unsigned char c3 = (unsigned char) p[2]; 265 266 if (c3 >= 0x81 && c3 <= 0xfe) 267 goto incomplete; 268 } 269 } 270 } 271 } 272 goto invalid; 273 } 274 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) 275 { 276 if (m == 1) 277 { 278 unsigned char c = (unsigned char) p[0]; 279 280 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) 281 || (c >= 0xf0 && c <= 0xf9)) 282 goto incomplete; 283 } 284 goto invalid; 285 } 286 287 /* An unknown multibyte encoding. */ 288 goto incomplete; 289 } 290 291 incomplete: 292 { 293 size_t k = nstate; 294 /* Here 0 <= k < m < 4. */ 295 pstate[++k] = s[0]; 296 if (k < m) 297 { 298 pstate[++k] = s[1]; 299 if (k < m) 300 pstate[++k] = s[2]; 301 } 302 if (k != m) 303 abort (); 304 } 305 pstate[0] = m; 306 return (size_t)(-2); 307 308 invalid: 309 errno = EILSEQ; 310 /* The conversion state is undefined, says POSIX. */ 311 return (size_t)(-1); 312 } 313 } 314 } 315 316 #else 317 /* Override the system's mbrtowc() function. */ 318 319 # undef mbrtowc 320 321 size_t 322 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 323 { 324 # if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG 325 if (s == NULL) 326 { 327 pwc = NULL; 328 s = ""; 329 n = 1; 330 } 331 # endif 332 333 # if MBRTOWC_RETVAL_BUG 334 { 335 static mbstate_t internal_state; 336 337 /* Override mbrtowc's internal state. We can not call mbsinit() on the 338 hidden internal state, but we can call it on our variable. */ 339 if (ps == NULL) 340 ps = &internal_state; 341 342 if (!mbsinit (ps)) 343 { 344 /* Parse the rest of the multibyte character byte for byte. */ 345 size_t count = 0; 346 for (; n > 0; s++, n--) 347 { 348 wchar_t wc; 349 size_t ret = mbrtowc (&wc, s, 1, ps); 350 351 if (ret == (size_t)(-1)) 352 return (size_t)(-1); 353 count++; 354 if (ret != (size_t)(-2)) 355 { 356 /* The multibyte character has been completed. */ 357 if (pwc != NULL) 358 *pwc = wc; 359 return (wc == 0 ? 0 : count); 360 } 361 } 362 return (size_t)(-2); 363 } 364 } 365 # endif 366 367 # if MBRTOWC_NUL_RETVAL_BUG 368 { 369 wchar_t wc; 370 size_t ret = mbrtowc (&wc, s, n, ps); 371 372 if (ret != (size_t)(-1) && ret != (size_t)(-2)) 373 { 374 if (pwc != NULL) 375 *pwc = wc; 376 if (wc == 0) 377 ret = 0; 378 } 379 return ret; 380 } 381 # else 382 return mbrtowc (pwc, s, n, ps); 383 # endif 384 } 385 386 #endif 387