Home | History | Annotate | Download | only in libutf
      1 /*
      2  * The authors of this software are Rob Pike and Ken Thompson.
      3  *              Copyright (c) 2002 by Lucent Technologies.
      4  * Permission to use, copy, modify, and distribute this software for any
      5  * purpose without fee is hereby granted, provided that this entire notice
      6  * is included in all copies of any software which is or includes a copy
      7  * or modification of this software and in all copies of the supporting
      8  * documentation for such software.
      9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
     10  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
     11  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
     12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
     13  */
     14 #include <stdarg.h>
     15 #include <string.h>
     16 #include "utf.h"
     17 #include "utfdef.h"
     18 
     19 enum
     20 {
     21 	Bit1	= 7,
     22 	Bitx	= 6,
     23 	Bit2	= 5,
     24 	Bit3	= 4,
     25 	Bit4	= 3,
     26 	Bit5	= 2,
     27 
     28 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
     29 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
     30 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
     31 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
     32 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
     33 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
     34 
     35 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
     36 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
     37 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
     38 	Rune4	= (1<<(Bit4+3*Bitx))-1,
     39                                         /* 0001 1111 1111 1111 1111 1111 */
     40 
     41 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
     42 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
     43 
     44 	Bad	= Runeerror,
     45 };
     46 
     47 /*
     48  * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
     49  * This is a slower but "safe" version of the old chartorune
     50  * that works on strings that are not necessarily null-terminated.
     51  *
     52  * If you know for sure that your string is null-terminated,
     53  * chartorune will be a bit faster.
     54  *
     55  * It is guaranteed not to attempt to access "length"
     56  * past the incoming pointer.  This is to avoid
     57  * possible access violations.  If the string appears to be
     58  * well-formed but incomplete (i.e., to get the whole Rune
     59  * we'd need to read past str+length) then we'll set the Rune
     60  * to Bad and return 0.
     61  *
     62  * Note that if we have decoding problems for other
     63  * reasons, we return 1 instead of 0.
     64  */
     65 int
     66 charntorune(Rune *rune, const char *str, int length)
     67 {
     68 	int c, c1, c2, c3;
     69 	long l;
     70 
     71 	/* When we're not allowed to read anything */
     72 	if(length <= 0) {
     73 		goto badlen;
     74 	}
     75 
     76 	/*
     77 	 * one character sequence (7-bit value)
     78 	 *	00000-0007F => T1
     79 	 */
     80 	c = *(uchar*)str;
     81 	if(c < Tx) {
     82 		*rune = c;
     83 		return 1;
     84 	}
     85 
     86 	// If we can't read more than one character we must stop
     87 	if(length <= 1) {
     88 		goto badlen;
     89 	}
     90 
     91 	/*
     92 	 * two character sequence (11-bit value)
     93 	 *	0080-07FF => T2 Tx
     94 	 */
     95 	c1 = *(uchar*)(str+1) ^ Tx;
     96 	if(c1 & Testx)
     97 		goto bad;
     98 	if(c < T3) {
     99 		if(c < T2)
    100 			goto bad;
    101 		l = ((c << Bitx) | c1) & Rune2;
    102 		if(l <= Rune1)
    103 			goto bad;
    104 		*rune = l;
    105 		return 2;
    106 	}
    107 
    108 	// If we can't read more than two characters we must stop
    109 	if(length <= 2) {
    110 		goto badlen;
    111 	}
    112 
    113 	/*
    114 	 * three character sequence (16-bit value)
    115 	 *	0800-FFFF => T3 Tx Tx
    116 	 */
    117 	c2 = *(uchar*)(str+2) ^ Tx;
    118 	if(c2 & Testx)
    119 		goto bad;
    120 	if(c < T4) {
    121 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
    122 		if(l <= Rune2)
    123 			goto bad;
    124 		*rune = l;
    125 		return 3;
    126 	}
    127 
    128 	if (length <= 3)
    129 		goto badlen;
    130 
    131 	/*
    132 	 * four character sequence (21-bit value)
    133 	 *	10000-1FFFFF => T4 Tx Tx Tx
    134 	 */
    135 	c3 = *(uchar*)(str+3) ^ Tx;
    136 	if (c3 & Testx)
    137 		goto bad;
    138 	if (c < T5) {
    139 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
    140 		if (l <= Rune3)
    141 			goto bad;
    142 		*rune = l;
    143 		return 4;
    144 	}
    145 
    146 	// Support for 5-byte or longer UTF-8 would go here, but
    147 	// since we don't have that, we'll just fall through to bad.
    148 
    149 	/*
    150 	 * bad decoding
    151 	 */
    152 bad:
    153 	*rune = Bad;
    154 	return 1;
    155 badlen:
    156 	*rune = Bad;
    157 	return 0;
    158 
    159 }
    160 
    161 
    162 /*
    163  * This is the older "unsafe" version, which works fine on
    164  * null-terminated strings.
    165  */
    166 int
    167 chartorune(Rune *rune, const char *str)
    168 {
    169 	int c, c1, c2, c3;
    170 	long l;
    171 
    172 	/*
    173 	 * one character sequence
    174 	 *	00000-0007F => T1
    175 	 */
    176 	c = *(uchar*)str;
    177 	if(c < Tx) {
    178 		*rune = c;
    179 		return 1;
    180 	}
    181 
    182 	/*
    183 	 * two character sequence
    184 	 *	0080-07FF => T2 Tx
    185 	 */
    186 	c1 = *(uchar*)(str+1) ^ Tx;
    187 	if(c1 & Testx)
    188 		goto bad;
    189 	if(c < T3) {
    190 		if(c < T2)
    191 			goto bad;
    192 		l = ((c << Bitx) | c1) & Rune2;
    193 		if(l <= Rune1)
    194 			goto bad;
    195 		*rune = l;
    196 		return 2;
    197 	}
    198 
    199 	/*
    200 	 * three character sequence
    201 	 *	0800-FFFF => T3 Tx Tx
    202 	 */
    203 	c2 = *(uchar*)(str+2) ^ Tx;
    204 	if(c2 & Testx)
    205 		goto bad;
    206 	if(c < T4) {
    207 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
    208 		if(l <= Rune2)
    209 			goto bad;
    210 		*rune = l;
    211 		return 3;
    212 	}
    213 
    214 	/*
    215 	 * four character sequence (21-bit value)
    216 	 *	10000-1FFFFF => T4 Tx Tx Tx
    217 	 */
    218 	c3 = *(uchar*)(str+3) ^ Tx;
    219 	if (c3 & Testx)
    220 		goto bad;
    221 	if (c < T5) {
    222 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
    223 		if (l <= Rune3)
    224 			goto bad;
    225 		*rune = l;
    226 		return 4;
    227 	}
    228 
    229 	/*
    230 	 * Support for 5-byte or longer UTF-8 would go here, but
    231 	 * since we don't have that, we'll just fall through to bad.
    232 	 */
    233 
    234 	/*
    235 	 * bad decoding
    236 	 */
    237 bad:
    238 	*rune = Bad;
    239 	return 1;
    240 }
    241 
    242 int
    243 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
    244 	*consumed = charntorune(rune, str, length);
    245 	return *rune != Runeerror || *consumed == 3;
    246 }
    247 
    248 int
    249 runetochar(char *str, const Rune *rune)
    250 {
    251 	/* Runes are signed, so convert to unsigned for range check. */
    252 	unsigned long c;
    253 
    254 	/*
    255 	 * one character sequence
    256 	 *	00000-0007F => 00-7F
    257 	 */
    258 	c = *rune;
    259 	if(c <= Rune1) {
    260 		str[0] = c;
    261 		return 1;
    262 	}
    263 
    264 	/*
    265 	 * two character sequence
    266 	 *	0080-07FF => T2 Tx
    267 	 */
    268 	if(c <= Rune2) {
    269 		str[0] = T2 | (c >> 1*Bitx);
    270 		str[1] = Tx | (c & Maskx);
    271 		return 2;
    272 	}
    273 
    274 	/*
    275 	 * If the Rune is out of range, convert it to the error rune.
    276 	 * Do this test here because the error rune encodes to three bytes.
    277 	 * Doing it earlier would duplicate work, since an out of range
    278 	 * Rune wouldn't have fit in one or two bytes.
    279 	 */
    280 	if (c > Runemax)
    281 		c = Runeerror;
    282 
    283 	/*
    284 	 * three character sequence
    285 	 *	0800-FFFF => T3 Tx Tx
    286 	 */
    287 	if (c <= Rune3) {
    288 		str[0] = T3 |  (c >> 2*Bitx);
    289 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
    290 		str[2] = Tx |  (c & Maskx);
    291 		return 3;
    292 	}
    293 
    294 	/*
    295 	 * four character sequence (21-bit value)
    296 	 *     10000-1FFFFF => T4 Tx Tx Tx
    297 	 */
    298 	str[0] = T4 | (c >> 3*Bitx);
    299 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
    300 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
    301 	str[3] = Tx | (c & Maskx);
    302 	return 4;
    303 }
    304 
    305 int
    306 runelen(Rune rune)
    307 {
    308 	char str[10];
    309 
    310 	return runetochar(str, &rune);
    311 }
    312 
    313 int
    314 runenlen(const Rune *r, int nrune)
    315 {
    316 	int nb, c;
    317 
    318 	nb = 0;
    319 	while(nrune--) {
    320 		c = *r++;
    321 		if (c <= Rune1)
    322 			nb++;
    323 		else if (c <= Rune2)
    324 			nb += 2;
    325 		else if (c <= Rune3)
    326 			nb += 3;
    327 		else /* assert(c <= Rune4) */
    328 			nb += 4;
    329 	}
    330 	return nb;
    331 }
    332 
    333 int
    334 fullrune(const char *str, int n)
    335 {
    336 	if (n > 0) {
    337 		int c = *(uchar*)str;
    338 		if (c < Tx)
    339 			return 1;
    340 		if (n > 1) {
    341 			if (c < T3)
    342 				return 1;
    343 			if (n > 2) {
    344 				if (c < T4 || n > 3)
    345 					return 1;
    346 			}
    347 		}
    348 	}
    349 	return 0;
    350 }
    351