Home | History | Annotate | Download | only in util
      1 /*
      2  * The authors of this software are Rob Pike and Ken Thompson.
      3  *              Copyright (c) 2002 by Lucent Technologies.
      4  * Permission to use, copy, modify, and distribute this software for any
      5  * purpose without fee is hereby granted, provided that this entire notice
      6  * is included in all copies of any software which is or includes a copy
      7  * or modification of this software and in all copies of the supporting
      8  * documentation for such software.
      9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
     10  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
     11  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
     12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
     13  */
     14 #include <stdarg.h>
     15 #include <string.h>
     16 #include "util/utf.h"
     17 
     18 namespace re2 {
     19 
     20 enum
     21 {
     22 	Bit1	= 7,
     23 	Bitx	= 6,
     24 	Bit2	= 5,
     25 	Bit3	= 4,
     26 	Bit4	= 3,
     27 	Bit5	= 2,
     28 
     29 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
     30 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
     31 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
     32 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
     33 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
     34 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
     35 
     36 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
     37 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
     38 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
     39 	Rune4	= (1<<(Bit4+3*Bitx))-1,
     40                                         /* 0001 1111 1111 1111 1111 1111 */
     41 
     42 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
     43 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
     44 
     45 	Bad	= Runeerror,
     46 };
     47 
     48 int
     49 chartorune(Rune *rune, const char *str)
     50 {
     51 	int c, c1, c2, c3;
     52 	long l;
     53 
     54 	/*
     55 	 * one character sequence
     56 	 *	00000-0007F => T1
     57 	 */
     58 	c = *(unsigned char*)str;
     59 	if(c < Tx) {
     60 		*rune = c;
     61 		return 1;
     62 	}
     63 
     64 	/*
     65 	 * two character sequence
     66 	 *	0080-07FF => T2 Tx
     67 	 */
     68 	c1 = *(unsigned char*)(str+1) ^ Tx;
     69 	if(c1 & Testx)
     70 		goto bad;
     71 	if(c < T3) {
     72 		if(c < T2)
     73 			goto bad;
     74 		l = ((c << Bitx) | c1) & Rune2;
     75 		if(l <= Rune1)
     76 			goto bad;
     77 		*rune = l;
     78 		return 2;
     79 	}
     80 
     81 	/*
     82 	 * three character sequence
     83 	 *	0800-FFFF => T3 Tx Tx
     84 	 */
     85 	c2 = *(unsigned char*)(str+2) ^ Tx;
     86 	if(c2 & Testx)
     87 		goto bad;
     88 	if(c < T4) {
     89 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
     90 		if(l <= Rune2)
     91 			goto bad;
     92 		*rune = l;
     93 		return 3;
     94 	}
     95 
     96 	/*
     97 	 * four character sequence (21-bit value)
     98 	 *	10000-1FFFFF => T4 Tx Tx Tx
     99 	 */
    100 	c3 = *(unsigned char*)(str+3) ^ Tx;
    101 	if (c3 & Testx)
    102 		goto bad;
    103 	if (c < T5) {
    104 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
    105 		if (l <= Rune3)
    106 			goto bad;
    107 		*rune = l;
    108 		return 4;
    109 	}
    110 
    111 	/*
    112 	 * Support for 5-byte or longer UTF-8 would go here, but
    113 	 * since we don't have that, we'll just fall through to bad.
    114 	 */
    115 
    116 	/*
    117 	 * bad decoding
    118 	 */
    119 bad:
    120 	*rune = Bad;
    121 	return 1;
    122 }
    123 
    124 int
    125 runetochar(char *str, const Rune *rune)
    126 {
    127 	/* Runes are signed, so convert to unsigned for range check. */
    128 	unsigned long c;
    129 
    130 	/*
    131 	 * one character sequence
    132 	 *	00000-0007F => 00-7F
    133 	 */
    134 	c = *rune;
    135 	if(c <= Rune1) {
    136 		str[0] = c;
    137 		return 1;
    138 	}
    139 
    140 	/*
    141 	 * two character sequence
    142 	 *	0080-07FF => T2 Tx
    143 	 */
    144 	if(c <= Rune2) {
    145 		str[0] = T2 | (c >> 1*Bitx);
    146 		str[1] = Tx | (c & Maskx);
    147 		return 2;
    148 	}
    149 
    150 	/*
    151 	 * If the Rune is out of range, convert it to the error rune.
    152 	 * Do this test here because the error rune encodes to three bytes.
    153 	 * Doing it earlier would duplicate work, since an out of range
    154 	 * Rune wouldn't have fit in one or two bytes.
    155 	 */
    156 	if (c > Runemax)
    157 		c = Runeerror;
    158 
    159 	/*
    160 	 * three character sequence
    161 	 *	0800-FFFF => T3 Tx Tx
    162 	 */
    163 	if (c <= Rune3) {
    164 		str[0] = T3 |  (c >> 2*Bitx);
    165 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
    166 		str[2] = Tx |  (c & Maskx);
    167 		return 3;
    168 	}
    169 
    170 	/*
    171 	 * four character sequence (21-bit value)
    172 	 *     10000-1FFFFF => T4 Tx Tx Tx
    173 	 */
    174 	str[0] = T4 | (c >> 3*Bitx);
    175 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
    176 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
    177 	str[3] = Tx | (c & Maskx);
    178 	return 4;
    179 }
    180 
    181 int
    182 runelen(Rune rune)
    183 {
    184 	char str[10];
    185 
    186 	return runetochar(str, &rune);
    187 }
    188 
    189 int
    190 fullrune(const char *str, int n)
    191 {
    192 	if (n > 0) {
    193 		int c = *(unsigned char*)str;
    194 		if (c < Tx)
    195 			return 1;
    196 		if (n > 1) {
    197 			if (c < T3)
    198 				return 1;
    199 			if (n > 2) {
    200 				if (c < T4 || n > 3)
    201 					return 1;
    202 			}
    203 		}
    204 	}
    205 	return 0;
    206 }
    207 
    208 
    209 int
    210 utflen(const char *s)
    211 {
    212 	int c;
    213 	long n;
    214 	Rune rune;
    215 
    216 	n = 0;
    217 	for(;;) {
    218 		c = *(unsigned char*)s;
    219 		if(c < Runeself) {
    220 			if(c == 0)
    221 				return n;
    222 			s++;
    223 		} else
    224 			s += chartorune(&rune, s);
    225 		n++;
    226 	}
    227 	return 0;
    228 }
    229 
    230 char*
    231 utfrune(const char *s, Rune c)
    232 {
    233 	long c1;
    234 	Rune r;
    235 	int n;
    236 
    237 	if(c < Runesync)		/* not part of utf sequence */
    238 		return strchr((char*)s, c);
    239 
    240 	for(;;) {
    241 		c1 = *(unsigned char*)s;
    242 		if(c1 < Runeself) {	/* one byte rune */
    243 			if(c1 == 0)
    244 				return 0;
    245 			if(c1 == c)
    246 				return (char*)s;
    247 			s++;
    248 			continue;
    249 		}
    250 		n = chartorune(&r, s);
    251 		if(r == c)
    252 			return (char*)s;
    253 		s += n;
    254 	}
    255 	return 0;
    256 }
    257 
    258 }  // namespace re2
    259