1 #include <iconv.h> 2 #include <errno.h> 3 #include <wchar.h> 4 #include <string.h> 5 #include <stdlib.h> 6 #include <limits.h> 7 #include <stdint.h> 8 9 #define UTF_32BE 0300 10 #define UTF_16LE 0301 11 #define UTF_16BE 0302 12 #define UTF_32LE 0303 13 #define UCS2BE 0304 14 #define UCS2LE 0305 15 #define WCHAR_T 0306 16 #define US_ASCII 0307 17 #define UTF_8 0310 18 #define EUC_JP 0320 19 #define SHIFT_JIS 0321 20 #define GB18030 0330 21 #define GBK 0331 22 #define GB2312 0332 23 24 /* FIXME: these are not implemented yet 25 * EUC: A1-FE A1-FE 26 * GBK: 81-FE 40-7E,80-FE 27 * Big5: A1-FE 40-7E,A1-FE 28 */ 29 30 /* Definitions of charmaps. Each charmap consists of: 31 * 1. Empty-string-terminated list of null-terminated aliases. 32 * 2. Special type code or number of elided entries. 33 * 3. Character table (size determined by field 2). */ 34 35 static const unsigned char charmaps[] = 36 "utf8\0\0\310" 37 "wchart\0\0\306" 38 "ucs2\0ucs2be\0\0\304" 39 "ucs2le\0\0\305" 40 "utf16\0utf16be\0\0\302" 41 "utf16le\0\0\301" 42 "ucs4\0ucs4be\0utf32\0utf32be\0\0\300" 43 "ucs4le\0utf32le\0\0\303" 44 "ascii\0usascii\0iso646\0iso646us\0\0\307" 45 "eucjp\0\0\320" 46 "shiftjis\0sjis\0\0\321" 47 "gb18030\0\0\330" 48 "gbk\0\0\331" 49 "gb2312\0\0\332" 50 #include "codepages.h" 51 ; 52 53 static const unsigned short legacy_chars[] = { 54 #include "legacychars.h" 55 }; 56 57 static const unsigned short jis0208[84][94] = { 58 #include "jis0208.h" 59 }; 60 61 static const unsigned short gb18030[126][190] = { 62 #include "gb18030.h" 63 }; 64 65 static int fuzzycmp(const unsigned char *a, const unsigned char *b) 66 { 67 for (; *a && *b; a++, b++) { 68 while (*a && (*a|32U)-'a'>26 && *a-'0'>10U) a++; 69 if ((*a|32U) != *b) return 1; 70 } 71 return *a != *b; 72 } 73 74 static size_t find_charmap(const void *name) 75 { 76 const unsigned char *s; 77 for (s=charmaps; *s; ) { 78 if (!fuzzycmp(name, s)) { 79 for (; *s; s+=strlen((void *)s)+1); 80 return s+1-charmaps; 81 } 82 s += strlen((void *)s)+1; 83 if (!*s) { 84 if (s[1] > 0200) s+=2; 85 else s+=2+(128U-s[1])/4*5; 86 } 87 } 88 return -1; 89 } 90 91 iconv_t iconv_open(const char *to, const char *from) 92 { 93 size_t f, t; 94 95 if ((t = find_charmap(to))==-1 96 || (f = find_charmap(from))==-1 97 || (charmaps[t] >= 0320)) { 98 errno = EINVAL; 99 return (iconv_t)-1; 100 } 101 102 return (void *)(f<<16 | t); 103 } 104 105 int iconv_close(iconv_t cd) 106 { 107 return 0; 108 } 109 110 static unsigned get_16(const unsigned char *s, int e) 111 { 112 e &= 1; 113 return s[e]<<8 | s[1-e]; 114 } 115 116 static void put_16(unsigned char *s, unsigned c, int e) 117 { 118 e &= 1; 119 s[e] = c>>8; 120 s[1-e] = c; 121 } 122 123 static unsigned get_32(const unsigned char *s, int e) 124 { 125 e &= 3; 126 return s[e]+0U<<24 | s[e^1]<<16 | s[e^2]<<8 | s[e^3]; 127 } 128 129 static void put_32(unsigned char *s, unsigned c, int e) 130 { 131 e &= 3; 132 s[e^0] = c>>24; 133 s[e^1] = c>>16; 134 s[e^2] = c>>8; 135 s[e^3] = c; 136 } 137 138 /* Adapt as needed */ 139 #define mbrtowc_utf8 mbrtowc 140 #define wctomb_utf8 wctomb 141 142 size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb) 143 { 144 size_t x=0; 145 unsigned long cd = (unsigned long)cd0; 146 unsigned to = cd & 0xffff; 147 unsigned from = cd >> 16; 148 const unsigned char *map = charmaps+from+1; 149 const unsigned char *tomap = charmaps+to+1; 150 mbstate_t st = {0}; 151 wchar_t wc; 152 unsigned c, d; 153 size_t k, l; 154 int err; 155 unsigned char type = map[-1]; 156 unsigned char totype = tomap[-1]; 157 158 if (!in || !*in || !*inb) return 0; 159 160 for (; *inb; *in+=l, *inb-=l) { 161 c = *(unsigned char *)*in; 162 l = 1; 163 164 if (c >= 128 || type-UTF_32BE < 7U) switch (type) { 165 case UTF_8: 166 l = mbrtowc_utf8(&wc, *in, *inb, &st); 167 if (!l) l++; 168 else if (l == (size_t)-1) goto ilseq; 169 else if (l == (size_t)-2) goto starved; 170 c = wc; 171 break; 172 case US_ASCII: 173 goto ilseq; 174 case WCHAR_T: 175 l = sizeof(wchar_t); 176 if (*inb < l) goto starved; 177 c = *(wchar_t *)*in; 178 if (0) { 179 case UTF_32BE: 180 case UTF_32LE: 181 l = 4; 182 if (*inb < 4) goto starved; 183 c = get_32((void *)*in, type); 184 } 185 if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq; 186 break; 187 case UCS2BE: 188 case UCS2LE: 189 case UTF_16BE: 190 case UTF_16LE: 191 l = 2; 192 if (*inb < 2) goto starved; 193 c = get_16((void *)*in, type); 194 if ((unsigned)(c-0xdc00) < 0x400) goto ilseq; 195 if ((unsigned)(c-0xd800) < 0x400) { 196 if (type-UCS2BE < 2U) goto ilseq; 197 l = 4; 198 if (*inb < 4) goto starved; 199 d = get_16((void *)(*in + 2), type); 200 if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq; 201 c = ((c-0xd7c0)<<10) + (d-0xdc00); 202 } 203 break; 204 case SHIFT_JIS: 205 if (c-0xa1 <= 0xdf-0xa1) { 206 c += 0xff61-0xa1; 207 break; 208 } 209 l = 2; 210 if (*inb < 2) goto starved; 211 d = *((unsigned char *)*in + 1); 212 if (c-129 <= 159-129) c -= 129; 213 else if (c-224 <= 239-224) c -= 193; 214 else goto ilseq; 215 c *= 2; 216 if (d-64 <= 158-64) { 217 if (d==127) goto ilseq; 218 if (d>127) d--; 219 d -= 64; 220 } else if (d-159 <= 252-159) { 221 c++; 222 d -= 159; 223 } 224 c = jis0208[c][d]; 225 if (!c) goto ilseq; 226 break; 227 case EUC_JP: 228 l = 2; 229 if (*inb < 2) goto starved; 230 d = *((unsigned char *)*in + 1); 231 if (c==0x8e) { 232 c = d; 233 if (c-0xa1 > 0xdf-0xa1) goto ilseq; 234 c += 0xff61 - 0xa1; 235 break; 236 } 237 c -= 0xa1; 238 d -= 0xa1; 239 if (c >= 84 || d >= 94) goto ilseq; 240 c = jis0208[c][d]; 241 if (!c) goto ilseq; 242 break; 243 case GB2312: 244 if (c < 0xa1) goto ilseq; 245 case GBK: 246 case GB18030: 247 c -= 0x81; 248 if (c >= 126) goto ilseq; 249 l = 2; 250 if (*inb < 2) goto starved; 251 d = *((unsigned char *)*in + 1); 252 if (d < 0xa1 && type == GB2312) goto ilseq; 253 if (d-0x40>=191 || d==127) { 254 if (d-'0'>9 || type != GB18030) 255 goto ilseq; 256 l = 4; 257 if (*inb < 4) goto starved; 258 c = (10*c + d-'0') * 1260; 259 d = *((unsigned char *)*in + 2); 260 if (d-0x81>126) goto ilseq; 261 c += 10*(d-0x81); 262 d = *((unsigned char *)*in + 3); 263 if (d-'0'>9) goto ilseq; 264 c += d-'0'; 265 c += 128; 266 for (d=0; d<=c; ) { 267 k = 0; 268 int i, j; 269 for (i=0; i<126; i++) 270 for (j=0; j<190; j++) 271 if (gb18030[i][j]-d <= c-d) 272 k++; 273 d = c+1; 274 c += k; 275 } 276 break; 277 } 278 d -= 0x40; 279 if (d>63) d--; 280 c = gb18030[c][d]; 281 break; 282 default: 283 if (c < 128+type) break; 284 c -= 128+type; 285 c = legacy_chars[ map[c*5/4]>>2*c%8 | 286 map[c*5/4+1]<<8-2*c%8 & 1023 ]; 287 if (!c) c = *(unsigned char *)*in; 288 if (c==1) goto ilseq; 289 } 290 291 switch (totype) { 292 case WCHAR_T: 293 if (*outb < sizeof(wchar_t)) goto toobig; 294 *(wchar_t *)*out = c; 295 *out += sizeof(wchar_t); 296 *outb -= sizeof(wchar_t); 297 break; 298 case UTF_8: 299 if (*outb < 4) { 300 char tmp[4]; 301 k = wctomb_utf8(tmp, c); 302 if (*outb < k) goto toobig; 303 memcpy(*out, tmp, k); 304 } else k = wctomb_utf8(*out, c); 305 *out += k; 306 *outb -= k; 307 break; 308 case US_ASCII: 309 if (c > 0x7f) subst: x++, c='*'; 310 default: 311 if (*outb < 1) goto toobig; 312 if (c < 128+totype) { 313 revout: 314 *(*out)++ = c; 315 *outb -= 1; 316 break; 317 } 318 d = c; 319 for (c=0; c<128-totype; c++) { 320 if (d == legacy_chars[ map[c*5/4]>>2*c%8 | 321 map[c*5/4+1]<<8-2*c%8 & 1023 ]) { 322 c += 128; 323 goto revout; 324 } 325 } 326 goto subst; 327 case UCS2BE: 328 case UCS2LE: 329 case UTF_16BE: 330 case UTF_16LE: 331 if (c < 0x10000 || type-UCS2BE < 2U) { 332 if (c >= 0x10000) c = 0xFFFD; 333 if (*outb < 2) goto toobig; 334 put_16((void *)*out, c, totype); 335 *out += 2; 336 *outb -= 2; 337 break; 338 } 339 if (*outb < 4) goto toobig; 340 c -= 0x10000; 341 put_16((void *)*out, (c>>10)|0xd800, totype); 342 put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype); 343 *out += 4; 344 *outb -= 4; 345 break; 346 case UTF_32BE: 347 case UTF_32LE: 348 if (*outb < 4) goto toobig; 349 put_32((void *)*out, c, totype); 350 *out += 4; 351 *outb -= 4; 352 break; 353 } 354 } 355 return x; 356 ilseq: 357 err = EILSEQ; 358 x = -1; 359 goto end; 360 toobig: 361 err = E2BIG; 362 x = -1; 363 goto end; 364 starved: 365 err = EINVAL; 366 x = -1; 367 end: 368 errno = err; 369 return x; 370 } 371