Home | History | Annotate | Download | only in musl-locale
      1 #include <iconv.h>
      2 #include <errno.h>
      3 #include <wchar.h>
      4 #include <string.h>
      5 #include <stdlib.h>
      6 #include <limits.h>
      7 #include <stdint.h>
      8 
      9 #define UTF_32BE    0300
     10 #define UTF_16LE    0301
     11 #define UTF_16BE    0302
     12 #define UTF_32LE    0303
     13 #define UCS2BE      0304
     14 #define UCS2LE      0305
     15 #define WCHAR_T     0306
     16 #define US_ASCII    0307
     17 #define UTF_8       0310
     18 #define EUC_JP      0320
     19 #define SHIFT_JIS   0321
     20 #define GB18030     0330
     21 #define GBK         0331
     22 #define GB2312      0332
     23 
     24 /* FIXME: these are not implemented yet
     25  * EUC:   A1-FE A1-FE
     26  * GBK:   81-FE 40-7E,80-FE
     27  * Big5:  A1-FE 40-7E,A1-FE
     28  */
     29 
/* Definitions of charmaps. Each charmap consists of:
 * 1. Empty-string-terminated list of null-terminated aliases.  The aliases
 *    visible here use lowercase letters and digits only, which is the form
 *    fuzzycmp() compares caller-supplied names against.
 * 2. One byte: a special type code (one of the codes defined at the top of
 *    this file, all above 0200) or, for table-driven 8-bit codepages, the
 *    number N of elided entries.
 * 3. Character table, present only when field 2 is 0200 or less:
 *    (128-N)/4*5 bytes of packed 10-bit indices into legacy_chars[], one
 *    per byte value from 128+N up to 255.
 *
 * find_charmap() returns the offset of field 2; iconv() reads the type from
 * that offset and the table from the byte after it.  The array ends at the
 * string literal's implicit terminating NUL, which find_charmap() sees as
 * an empty alias. */

static const unsigned char charmaps[] =
"utf8\0\0\310"
"wchart\0\0\306"
"ucs2\0ucs2be\0\0\304"
"ucs2le\0\0\305"
"utf16\0utf16be\0\0\302"
"utf16le\0\0\301"
"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
"ucs4le\0utf32le\0\0\303"
"ascii\0usascii\0iso646\0iso646us\0\0\307"
"eucjp\0\0\320"
"shiftjis\0sjis\0\0\321"
"gb18030\0\0\330"
"gbk\0\0\331"
"gb2312\0\0\332"
/* Table-driven codepages are spliced in as further string-literal pieces;
 * NOTE(review): header contents are not visible here -- presumably they
 * follow the three-field layout described above. */
#include "codepages.h"
;
     52 
/* Character values shared by the table-driven 8-bit codepages.  iconv()
 * extracts a 10-bit index from a charmap's packed table and reads the
 * character from here.  Two entry values are special to the decoder:
 * 0 means "use the input byte value itself", 1 means "illegal byte"
 * (EILSEQ). */
static const unsigned short legacy_chars[] = {
#include "legacychars.h"
};

/* Two-byte Japanese character table (presumably JIS X 0208, going by the
 * name), 84 rows of 94 cells.  Both the EUC-JP and the Shift_JIS decoders in
 * iconv() index it as [row][cell] and treat a zero cell as an illegal
 * sequence. */
static const unsigned short jis0208[84][94] = {
#include "jis0208.h"
};

/* Two-byte table for the GB2312/GBK/GB18030 decoders.  iconv() indexes it
 * as [lead byte - 0x81][trail byte - 0x40, with the unused 0x7f slot
 * squeezed out], hence 126 x 190.  The GB18030 four-byte decoder also scans
 * the whole table to count entries that fall inside a value range. */
static const unsigned short gb18030[126][190] = {
#include "gb18030.h"
};
     64 
     65 static int fuzzycmp(const unsigned char *a, const unsigned char *b)
     66 {
     67 	for (; *a && *b; a++, b++) {
     68 		while (*a && (*a|32U)-'a'>26 && *a-'0'>10U) a++;
     69 		if ((*a|32U) != *b) return 1;
     70 	}
     71 	return *a != *b;
     72 }
     73 
     74 static size_t find_charmap(const void *name)
     75 {
     76 	const unsigned char *s;
     77 	for (s=charmaps; *s; ) {
     78 		if (!fuzzycmp(name, s)) {
     79 			for (; *s; s+=strlen((void *)s)+1);
     80 			return s+1-charmaps;
     81 		}
     82 		s += strlen((void *)s)+1;
     83 		if (!*s) {
     84 			if (s[1] > 0200) s+=2;
     85 			else s+=2+(128U-s[1])/4*5;
     86 		}
     87 	}
     88 	return -1;
     89 }
     90 
     91 iconv_t iconv_open(const char *to, const char *from)
     92 {
     93 	size_t f, t;
     94 
     95 	if ((t = find_charmap(to))==-1
     96 	 || (f = find_charmap(from))==-1
     97 	 || (charmaps[t] >= 0320)) {
     98 		errno = EINVAL;
     99 		return (iconv_t)-1;
    100 	}
    101 
    102 	return (void *)(f<<16 | t);
    103 }
    104 
    105 int iconv_close(iconv_t cd)
    106 {
    107 	return 0;
    108 }
    109 
    110 static unsigned get_16(const unsigned char *s, int e)
    111 {
    112 	e &= 1;
    113 	return s[e]<<8 | s[1-e];
    114 }
    115 
    116 static void put_16(unsigned char *s, unsigned c, int e)
    117 {
    118 	e &= 1;
    119 	s[e] = c>>8;
    120 	s[1-e] = c;
    121 }
    122 
    123 static unsigned get_32(const unsigned char *s, int e)
    124 {
    125 	e &= 3;
    126 	return s[e]+0U<<24 | s[e^1]<<16 | s[e^2]<<8 | s[e^3];
    127 }
    128 
    129 static void put_32(unsigned char *s, unsigned c, int e)
    130 {
    131 	e &= 3;
    132 	s[e^0] = c>>24;
    133 	s[e^1] = c>>16;
    134 	s[e^2] = c>>8;
    135 	s[e^3] = c;
    136 }
    137 
/* UTF-8 decode/encode primitives used by iconv().  Adapt as needed: they
 * are mapped straight onto the standard mbrtowc()/wctomb(), whose
 * conversion follows the current locale's LC_CTYPE category, so this is a
 * UTF-8 codec only where that encoding is UTF-8.
 * NOTE(review): on other configurations, point these at dedicated UTF-8
 * routines with the same signatures. */
#define mbrtowc_utf8 mbrtowc
#define wctomb_utf8 wctomb
    141 
    142 size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
    143 {
    144 	size_t x=0;
    145 	unsigned long cd = (unsigned long)cd0;
    146 	unsigned to = cd & 0xffff;
    147 	unsigned from = cd >> 16;
    148 	const unsigned char *map = charmaps+from+1;
    149 	const unsigned char *tomap = charmaps+to+1;
    150 	mbstate_t st = {0};
    151 	wchar_t wc;
    152 	unsigned c, d;
    153 	size_t k, l;
    154 	int err;
    155 	unsigned char type = map[-1];
    156 	unsigned char totype = tomap[-1];
    157 
    158 	if (!in || !*in || !*inb) return 0;
    159 
    160 	for (; *inb; *in+=l, *inb-=l) {
    161 		c = *(unsigned char *)*in;
    162 		l = 1;
    163 
    164 		if (c >= 128 || type-UTF_32BE < 7U) switch (type) {
    165 		case UTF_8:
    166 			l = mbrtowc_utf8(&wc, *in, *inb, &st);
    167 			if (!l) l++;
    168 			else if (l == (size_t)-1) goto ilseq;
    169 			else if (l == (size_t)-2) goto starved;
    170 			c = wc;
    171 			break;
    172 		case US_ASCII:
    173 			goto ilseq;
    174 		case WCHAR_T:
    175 			l = sizeof(wchar_t);
    176 			if (*inb < l) goto starved;
    177 			c = *(wchar_t *)*in;
    178 			if (0) {
    179 		case UTF_32BE:
    180 		case UTF_32LE:
    181 			l = 4;
    182 			if (*inb < 4) goto starved;
    183 			c = get_32((void *)*in, type);
    184 			}
    185 			if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
    186 			break;
    187 		case UCS2BE:
    188 		case UCS2LE:
    189 		case UTF_16BE:
    190 		case UTF_16LE:
    191 			l = 2;
    192 			if (*inb < 2) goto starved;
    193 			c = get_16((void *)*in, type);
    194 			if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
    195 			if ((unsigned)(c-0xd800) < 0x400) {
    196 				if (type-UCS2BE < 2U) goto ilseq;
    197 				l = 4;
    198 				if (*inb < 4) goto starved;
    199 				d = get_16((void *)(*in + 2), type);
    200 				if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
    201 				c = ((c-0xd7c0)<<10) + (d-0xdc00);
    202 			}
    203 			break;
    204 		case SHIFT_JIS:
    205 			if (c-0xa1 <= 0xdf-0xa1) {
    206 				c += 0xff61-0xa1;
    207 				break;
    208 			}
    209 			l = 2;
    210 			if (*inb < 2) goto starved;
    211 			d = *((unsigned char *)*in + 1);
    212 			if (c-129 <= 159-129) c -= 129;
    213 			else if (c-224 <= 239-224) c -= 193;
    214 			else goto ilseq;
    215 			c *= 2;
    216 			if (d-64 <= 158-64) {
    217 				if (d==127) goto ilseq;
    218 				if (d>127) d--;
    219 				d -= 64;
    220 			} else if (d-159 <= 252-159) {
    221 				c++;
    222 				d -= 159;
    223 			}
    224 			c = jis0208[c][d];
    225 			if (!c) goto ilseq;
    226 			break;
    227 		case EUC_JP:
    228 			l = 2;
    229 			if (*inb < 2) goto starved;
    230 			d = *((unsigned char *)*in + 1);
    231 			if (c==0x8e) {
    232 				c = d;
    233 				if (c-0xa1 > 0xdf-0xa1) goto ilseq;
    234 				c += 0xff61 - 0xa1;
    235 				break;
    236 			}
    237 			c -= 0xa1;
    238 			d -= 0xa1;
    239 			if (c >= 84 || d >= 94) goto ilseq;
    240 			c = jis0208[c][d];
    241 			if (!c) goto ilseq;
    242 			break;
    243 		case GB2312:
    244 			if (c < 0xa1) goto ilseq;
    245 		case GBK:
    246 		case GB18030:
    247 			c -= 0x81;
    248 			if (c >= 126) goto ilseq;
    249 			l = 2;
    250 			if (*inb < 2) goto starved;
    251 			d = *((unsigned char *)*in + 1);
    252 			if (d < 0xa1 && type == GB2312) goto ilseq;
    253 			if (d-0x40>=191 || d==127) {
    254 				if (d-'0'>9 || type != GB18030)
    255 					goto ilseq;
    256 				l = 4;
    257 				if (*inb < 4) goto starved;
    258 				c = (10*c + d-'0') * 1260;
    259 				d = *((unsigned char *)*in + 2);
    260 				if (d-0x81>126) goto ilseq;
    261 				c += 10*(d-0x81);
    262 				d = *((unsigned char *)*in + 3);
    263 				if (d-'0'>9) goto ilseq;
    264 				c += d-'0';
    265 				c += 128;
    266 				for (d=0; d<=c; ) {
    267 					k = 0;
    268                                       int i, j;
    269 					for (i=0; i<126; i++)
    270 						for (j=0; j<190; j++)
    271 							if (gb18030[i][j]-d <= c-d)
    272 								k++;
    273 					d = c+1;
    274 					c += k;
    275 				}
    276 				break;
    277 			}
    278 			d -= 0x40;
    279 			if (d>63) d--;
    280 			c = gb18030[c][d];
    281 			break;
    282 		default:
    283 			if (c < 128+type) break;
    284 			c -= 128+type;
    285 			c = legacy_chars[ map[c*5/4]>>2*c%8 |
    286 				map[c*5/4+1]<<8-2*c%8 & 1023 ];
    287 			if (!c) c = *(unsigned char *)*in;
    288 			if (c==1) goto ilseq;
    289 		}
    290 
    291 		switch (totype) {
    292 		case WCHAR_T:
    293 			if (*outb < sizeof(wchar_t)) goto toobig;
    294 			*(wchar_t *)*out = c;
    295 			*out += sizeof(wchar_t);
    296 			*outb -= sizeof(wchar_t);
    297 			break;
    298 		case UTF_8:
    299 			if (*outb < 4) {
    300 				char tmp[4];
    301 				k = wctomb_utf8(tmp, c);
    302 				if (*outb < k) goto toobig;
    303 				memcpy(*out, tmp, k);
    304 			} else k = wctomb_utf8(*out, c);
    305 			*out += k;
    306 			*outb -= k;
    307 			break;
    308 		case US_ASCII:
    309 			if (c > 0x7f) subst: x++, c='*';
    310 		default:
    311 			if (*outb < 1) goto toobig;
    312 			if (c < 128+totype) {
    313 			revout:
    314 				*(*out)++ = c;
    315 				*outb -= 1;
    316 				break;
    317 			}
    318 			d = c;
    319 			for (c=0; c<128-totype; c++) {
    320 				if (d == legacy_chars[ map[c*5/4]>>2*c%8 |
    321 					map[c*5/4+1]<<8-2*c%8 & 1023 ]) {
    322 					c += 128;
    323 					goto revout;
    324 				}
    325 			}
    326 			goto subst;
    327 		case UCS2BE:
    328 		case UCS2LE:
    329 		case UTF_16BE:
    330 		case UTF_16LE:
    331 			if (c < 0x10000 || type-UCS2BE < 2U) {
    332 				if (c >= 0x10000) c = 0xFFFD;
    333 				if (*outb < 2) goto toobig;
    334 				put_16((void *)*out, c, totype);
    335 				*out += 2;
    336 				*outb -= 2;
    337 				break;
    338 			}
    339 			if (*outb < 4) goto toobig;
    340 			c -= 0x10000;
    341 			put_16((void *)*out, (c>>10)|0xd800, totype);
    342 			put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
    343 			*out += 4;
    344 			*outb -= 4;
    345 			break;
    346 		case UTF_32BE:
    347 		case UTF_32LE:
    348 			if (*outb < 4) goto toobig;
    349 			put_32((void *)*out, c, totype);
    350 			*out += 4;
    351 			*outb -= 4;
    352 			break;
    353 		}
    354 	}
    355 	return x;
    356 ilseq:
    357 	err = EILSEQ;
    358 	x = -1;
    359 	goto end;
    360 toobig:
    361 	err = E2BIG;
    362 	x = -1;
    363 	goto end;
    364 starved:
    365 	err = EINVAL;
    366 	x = -1;
    367 end:
    368 	errno = err;
    369 	return x;
    370 }
    371