1 /* 2 * The authors of this software are Rob Pike and Ken Thompson. 3 * Copyright (c) 2002 by Lucent Technologies. 4 * Permission to use, copy, modify, and distribute this software for any 5 * purpose without fee is hereby granted, provided that this entire notice 6 * is included in all copies of any software which is or includes a copy 7 * or modification of this software and in all copies of the supporting 8 * documentation for such software. 9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY 11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 13 */ 14 #include <stdarg.h> 15 #include <string.h> 16 #include "utf.h" 17 #include "utfdef.h" 18 19 enum 20 { 21 Bit1 = 7, 22 Bitx = 6, 23 Bit2 = 5, 24 Bit3 = 4, 25 Bit4 = 3, 26 Bit5 = 2, 27 28 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 29 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 30 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 31 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 32 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 33 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 34 35 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ 36 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ 37 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ 38 Rune4 = (1<<(Bit4+3*Bitx))-1, 39 /* 0001 1111 1111 1111 1111 1111 */ 40 41 Maskx = (1<<Bitx)-1, /* 0011 1111 */ 42 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 43 44 Bad = Runeerror, 45 }; 46 47 /* 48 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 49 * This is a slower but "safe" version of the old chartorune 50 * that works on strings that are not necessarily null-terminated. 51 * 52 * If you know for sure that your string is null-terminated, 53 * chartorune will be a bit faster. 54 * 55 * It is guaranteed not to attempt to access "length" 56 * past the incoming pointer. This is to avoid 57 * possible access violations. If the string appears to be 58 * well-formed but incomplete (i.e., to get the whole Rune 59 * we'd need to read past str+length) then we'll set the Rune 60 * to Bad and return 0. 61 * 62 * Note that if we have decoding problems for other 63 * reasons, we return 1 instead of 0. 64 */ 65 int 66 charntorune(Rune *rune, const char *str, int length) 67 { 68 int c, c1, c2, c3; 69 long l; 70 71 /* When we're not allowed to read anything */ 72 if(length <= 0) { 73 goto badlen; 74 } 75 76 /* 77 * one character sequence (7-bit value) 78 * 00000-0007F => T1 79 */ 80 c = *(uchar*)str; 81 if(c < Tx) { 82 *rune = c; 83 return 1; 84 } 85 86 // If we can't read more than one character we must stop 87 if(length <= 1) { 88 goto badlen; 89 } 90 91 /* 92 * two character sequence (11-bit value) 93 * 0080-07FF => T2 Tx 94 */ 95 c1 = *(uchar*)(str+1) ^ Tx; 96 if(c1 & Testx) 97 goto bad; 98 if(c < T3) { 99 if(c < T2) 100 goto bad; 101 l = ((c << Bitx) | c1) & Rune2; 102 if(l <= Rune1) 103 goto bad; 104 *rune = l; 105 return 2; 106 } 107 108 // If we can't read more than two characters we must stop 109 if(length <= 2) { 110 goto badlen; 111 } 112 113 /* 114 * three character sequence (16-bit value) 115 * 0800-FFFF => T3 Tx Tx 116 */ 117 c2 = *(uchar*)(str+2) ^ Tx; 118 if(c2 & Testx) 119 goto bad; 120 if(c < T4) { 121 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 122 if(l <= Rune2) 123 goto bad; 124 *rune = l; 125 return 3; 126 } 127 128 if (length <= 3) 129 goto badlen; 130 131 /* 132 * four character sequence (21-bit value) 133 * 10000-1FFFFF => T4 Tx Tx Tx 134 */ 135 c3 = *(uchar*)(str+3) ^ Tx; 136 if (c3 & Testx) 137 goto bad; 138 if (c < T5) { 139 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 140 if (l <= Rune3) 141 goto bad; 142 *rune = l; 143 return 4; 144 } 145 146 // Support for 5-byte or longer UTF-8 would go here, but 147 // since we don't have that, we'll just fall through to bad. 148 149 /* 150 * bad decoding 151 */ 152 bad: 153 *rune = Bad; 154 return 1; 155 badlen: 156 *rune = Bad; 157 return 0; 158 159 } 160 161 162 /* 163 * This is the older "unsafe" version, which works fine on 164 * null-terminated strings. 165 */ 166 int 167 chartorune(Rune *rune, const char *str) 168 { 169 int c, c1, c2, c3; 170 long l; 171 172 /* 173 * one character sequence 174 * 00000-0007F => T1 175 */ 176 c = *(uchar*)str; 177 if(c < Tx) { 178 *rune = c; 179 return 1; 180 } 181 182 /* 183 * two character sequence 184 * 0080-07FF => T2 Tx 185 */ 186 c1 = *(uchar*)(str+1) ^ Tx; 187 if(c1 & Testx) 188 goto bad; 189 if(c < T3) { 190 if(c < T2) 191 goto bad; 192 l = ((c << Bitx) | c1) & Rune2; 193 if(l <= Rune1) 194 goto bad; 195 *rune = l; 196 return 2; 197 } 198 199 /* 200 * three character sequence 201 * 0800-FFFF => T3 Tx Tx 202 */ 203 c2 = *(uchar*)(str+2) ^ Tx; 204 if(c2 & Testx) 205 goto bad; 206 if(c < T4) { 207 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 208 if(l <= Rune2) 209 goto bad; 210 *rune = l; 211 return 3; 212 } 213 214 /* 215 * four character sequence (21-bit value) 216 * 10000-1FFFFF => T4 Tx Tx Tx 217 */ 218 c3 = *(uchar*)(str+3) ^ Tx; 219 if (c3 & Testx) 220 goto bad; 221 if (c < T5) { 222 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 223 if (l <= Rune3) 224 goto bad; 225 *rune = l; 226 return 4; 227 } 228 229 /* 230 * Support for 5-byte or longer UTF-8 would go here, but 231 * since we don't have that, we'll just fall through to bad. 232 */ 233 234 /* 235 * bad decoding 236 */ 237 bad: 238 *rune = Bad; 239 return 1; 240 } 241 242 int 243 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { 244 *consumed = charntorune(rune, str, length); 245 return *rune != Runeerror || *consumed == 3; 246 } 247 248 int 249 runetochar(char *str, const Rune *rune) 250 { 251 /* Runes are signed, so convert to unsigned for range check. */ 252 unsigned long c; 253 254 /* 255 * one character sequence 256 * 00000-0007F => 00-7F 257 */ 258 c = *rune; 259 if(c <= Rune1) { 260 str[0] = c; 261 return 1; 262 } 263 264 /* 265 * two character sequence 266 * 0080-07FF => T2 Tx 267 */ 268 if(c <= Rune2) { 269 str[0] = T2 | (c >> 1*Bitx); 270 str[1] = Tx | (c & Maskx); 271 return 2; 272 } 273 274 /* 275 * If the Rune is out of range, convert it to the error rune. 276 * Do this test here because the error rune encodes to three bytes. 277 * Doing it earlier would duplicate work, since an out of range 278 * Rune wouldn't have fit in one or two bytes. 279 */ 280 if (c > Runemax) 281 c = Runeerror; 282 283 /* 284 * three character sequence 285 * 0800-FFFF => T3 Tx Tx 286 */ 287 if (c <= Rune3) { 288 str[0] = T3 | (c >> 2*Bitx); 289 str[1] = Tx | ((c >> 1*Bitx) & Maskx); 290 str[2] = Tx | (c & Maskx); 291 return 3; 292 } 293 294 /* 295 * four character sequence (21-bit value) 296 * 10000-1FFFFF => T4 Tx Tx Tx 297 */ 298 str[0] = T4 | (c >> 3*Bitx); 299 str[1] = Tx | ((c >> 2*Bitx) & Maskx); 300 str[2] = Tx | ((c >> 1*Bitx) & Maskx); 301 str[3] = Tx | (c & Maskx); 302 return 4; 303 } 304 305 int 306 runelen(Rune rune) 307 { 308 char str[10]; 309 310 return runetochar(str, &rune); 311 } 312 313 int 314 runenlen(const Rune *r, int nrune) 315 { 316 int nb, c; 317 318 nb = 0; 319 while(nrune--) { 320 c = *r++; 321 if (c <= Rune1) 322 nb++; 323 else if (c <= Rune2) 324 nb += 2; 325 else if (c <= Rune3) 326 nb += 3; 327 else /* assert(c <= Rune4) */ 328 nb += 4; 329 } 330 return nb; 331 } 332 333 int 334 fullrune(const char *str, int n) 335 { 336 if (n > 0) { 337 int c = *(uchar*)str; 338 if (c < Tx) 339 return 1; 340 if (n > 1) { 341 if (c < T3) 342 return 1; 343 if (n > 2) { 344 if (c < T4 || n > 3) 345 return 1; 346 } 347 } 348 } 349 return 0; 350 } 351