1 /* 2 * The authors of this software are Rob Pike and Ken Thompson. 3 * Copyright (c) 2002 by Lucent Technologies. 4 * Permission to use, copy, modify, and distribute this software for any 5 * purpose without fee is hereby granted, provided that this entire notice 6 * is included in all copies of any software which is or includes a copy 7 * or modification of this software and in all copies of the supporting 8 * documentation for such software. 9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY 11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 13 */ 14 #include <stdarg.h> 15 #include <string.h> 16 #include "util/utf.h" 17 18 namespace re2 { 19 20 enum 21 { 22 Bit1 = 7, 23 Bitx = 6, 24 Bit2 = 5, 25 Bit3 = 4, 26 Bit4 = 3, 27 Bit5 = 2, 28 29 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 30 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 31 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 32 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 33 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 34 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 35 36 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ 37 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ 38 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ 39 Rune4 = (1<<(Bit4+3*Bitx))-1, 40 /* 0001 1111 1111 1111 1111 1111 */ 41 42 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 43 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 44 45 Bad = Runeerror, 46 }; 47 48 int 49 chartorune(Rune *rune, const char *str) 50 { 51 int c, c1, c2, c3; 52 long l; 53 54 /* 55 * one character sequence 56 * 00000-0007F => T1 57 */ 58 c = *(unsigned char*)str; 59 if(c < Tx) { 60 *rune = c; 61 return 1; 62 } 63 64 /* 65 * two character sequence 66 * 0080-07FF => T2 Tx 67 */ 68 c1 = *(unsigned char*)(str+1) ^ Tx; 69 if(c1 & Testx) 70 goto bad; 71 if(c < T3) { 72 if(c < T2) 73 goto bad; 74 l = ((c << Bitx) | c1) & Rune2; 75 if(l <= Rune1) 76 goto bad; 77 *rune = l; 78 return 2; 79 } 80 81 /* 82 * three character sequence 83 * 0800-FFFF => T3 Tx Tx 84 */ 85 c2 = *(unsigned char*)(str+2) ^ Tx; 86 if(c2 & Testx) 87 goto bad; 88 if(c < T4) { 89 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 90 if(l <= Rune2) 91 goto bad; 92 *rune = l; 93 return 3; 94 } 95 96 /* 97 * four character sequence (21-bit value) 98 * 10000-1FFFFF => T4 Tx Tx Tx 99 */ 100 c3 = *(unsigned char*)(str+3) ^ Tx; 101 if (c3 & Testx) 102 goto bad; 103 if (c < T5) { 104 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 105 if (l <= Rune3) 106 goto bad; 107 *rune = l; 108 return 4; 109 } 110 111 /* 112 * Support for 5-byte or longer UTF-8 would go here, but 113 * since we don't have that, we'll just fall through to bad. 114 */ 115 116 /* 117 * bad decoding 118 */ 119 bad: 120 *rune = Bad; 121 return 1; 122 } 123 124 int 125 runetochar(char *str, const Rune *rune) 126 { 127 /* Runes are signed, so convert to unsigned for range check. */ 128 unsigned long c; 129 130 /* 131 * one character sequence 132 * 00000-0007F => 00-7F 133 */ 134 c = *rune; 135 if(c <= Rune1) { 136 str[0] = c; 137 return 1; 138 } 139 140 /* 141 * two character sequence 142 * 0080-07FF => T2 Tx 143 */ 144 if(c <= Rune2) { 145 str[0] = T2 | (c >> 1*Bitx); 146 str[1] = Tx | (c & Maskx); 147 return 2; 148 } 149 150 /* 151 * If the Rune is out of range, convert it to the error rune. 152 * Do this test here because the error rune encodes to three bytes. 153 * Doing it earlier would duplicate work, since an out of range 154 * Rune wouldn't have fit in one or two bytes. 155 */ 156 if (c > Runemax) 157 c = Runeerror; 158 159 /* 160 * three character sequence 161 * 0800-FFFF => T3 Tx Tx 162 */ 163 if (c <= Rune3) { 164 str[0] = T3 | (c >> 2*Bitx); 165 str[1] = Tx | ((c >> 1*Bitx) & Maskx); 166 str[2] = Tx | (c & Maskx); 167 return 3; 168 } 169 170 /* 171 * four character sequence (21-bit value) 172 * 10000-1FFFFF => T4 Tx Tx Tx 173 */ 174 str[0] = T4 | (c >> 3*Bitx); 175 str[1] = Tx | ((c >> 2*Bitx) & Maskx); 176 str[2] = Tx | ((c >> 1*Bitx) & Maskx); 177 str[3] = Tx | (c & Maskx); 178 return 4; 179 } 180 181 int 182 runelen(Rune rune) 183 { 184 char str[10]; 185 186 return runetochar(str, &rune); 187 } 188 189 int 190 fullrune(const char *str, int n) 191 { 192 if (n > 0) { 193 int c = *(unsigned char*)str; 194 if (c < Tx) 195 return 1; 196 if (n > 1) { 197 if (c < T3) 198 return 1; 199 if (n > 2) { 200 if (c < T4 || n > 3) 201 return 1; 202 } 203 } 204 } 205 return 0; 206 } 207 208 209 int 210 utflen(const char *s) 211 { 212 int c; 213 long n; 214 Rune rune; 215 216 n = 0; 217 for(;;) { 218 c = *(unsigned char*)s; 219 if(c < Runeself) { 220 if(c == 0) 221 return n; 222 s++; 223 } else 224 s += chartorune(&rune, s); 225 n++; 226 } 227 return 0; 228 } 229 230 char* 231 utfrune(const char *s, Rune c) 232 { 233 long c1; 234 Rune r; 235 int n; 236 237 if(c < Runesync) /* not part of utf sequence */ 238 return strchr((char*)s, c); 239 240 for(;;) { 241 c1 = *(unsigned char*)s; 242 if(c1 < Runeself) { /* one byte rune */ 243 if(c1 == 0) 244 return 0; 245 if(c1 == c) 246 return (char*)s; 247 s++; 248 continue; 249 } 250 n = chartorune(&r, s); 251 if(r == c) 252 return (char*)s; 253 s += n; 254 } 255 return 0; 256 } 257 258 } // namespace re2 259