1 /* 2 * Copyright 2006 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 9 #include "SkUtils.h" 10 11 /* 0xxxxxxx 1 total 12 10xxxxxx // never a leading byte 13 110xxxxx 2 total 14 1110xxxx 3 total 15 11110xxx 4 total 16 17 11 10 01 01 xx xx xx xx 0... 18 0xE5XX0000 19 0xE5 << 24 20 */ 21 22 static bool utf8_byte_is_valid(uint8_t c) { 23 return c < 0xF5 && (c & 0xFE) != 0xC0; 24 } 25 static bool utf8_byte_is_continuation(uint8_t c) { 26 return (c & 0xC0) == 0x80; 27 } 28 static bool utf8_byte_is_leading_byte(uint8_t c) { 29 return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c); 30 } 31 32 #ifdef SK_DEBUG 33 static void assert_utf8_leadingbyte(unsigned c) { 34 SkASSERT(utf8_byte_is_leading_byte(SkToU8(c))); 35 } 36 37 int SkUTF8_LeadByteToCount(unsigned c) { 38 assert_utf8_leadingbyte(c); 39 return (((0xE5 << 24) >> (c >> 4 << 1)) & 3) + 1; 40 } 41 #else 42 #define assert_utf8_leadingbyte(c) 43 #endif 44 45 /** 46 * @returns -1 iff invalid UTF8 byte, 47 * 0 iff UTF8 continuation byte, 48 * 1 iff ASCII byte, 49 * 2 iff leading byte of 2-byte sequence, 50 * 3 iff leading byte of 3-byte sequence, and 51 * 4 iff leading byte of 4-byte sequence. 52 * 53 * I.e.: if return value > 0, then gives length of sequence. 54 */ 55 static int utf8_byte_type(uint8_t c) { 56 if (c < 0x80) { 57 return 1; 58 } else if (c < 0xC0) { 59 return 0; 60 } else if (c < 0xF5 && (c & 0xFE) != 0xC0) { // "octet values C0, C1, F5 to FF never appear" 61 return (((0xE5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1; 62 } else { 63 return -1; 64 } 65 } 66 static bool utf8_type_is_valid_leading_byte(int type) { return type > 0; } 67 68 int SkUTF8_CountUnichars(const char utf8[]) { 69 SkASSERT(utf8); 70 71 int count = 0; 72 73 for (;;) { 74 int c = *(const uint8_t*)utf8; 75 if (c == 0) { 76 break; 77 } 78 utf8 += SkUTF8_LeadByteToCount(c); 79 count += 1; 80 } 81 return count; 82 } 83 84 // SAFE: returns -1 if invalid UTF-8 85 int SkUTF8_CountUnicharsWithError(const char utf8[], size_t byteLength) { 86 SkASSERT(utf8 || 0 == byteLength); 87 88 int count = 0; 89 const char* stop = utf8 + byteLength; 90 91 while (utf8 < stop) { 92 int type = utf8_byte_type(*(const uint8_t*)utf8); 93 SkASSERT(type >= -1 && type <= 4); 94 if (!utf8_type_is_valid_leading_byte(type) || 95 utf8 + type > stop) { // Sequence extends beyond end. 96 return -1; 97 } 98 while(type-- > 1) { 99 ++utf8; 100 if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) { 101 return -1; 102 } 103 } 104 ++utf8; 105 ++count; 106 } 107 return count; 108 } 109 110 SkUnichar SkUTF8_ToUnichar(const char utf8[]) { 111 SkASSERT(utf8); 112 113 const uint8_t* p = (const uint8_t*)utf8; 114 int c = *p; 115 int hic = c << 24; 116 117 assert_utf8_leadingbyte(c); 118 119 if (hic < 0) { 120 uint32_t mask = (uint32_t)~0x3F; 121 hic = SkLeftShift(hic, 1); 122 do { 123 c = (c << 6) | (*++p & 0x3F); 124 mask <<= 5; 125 } while ((hic = SkLeftShift(hic, 1)) < 0); 126 c &= ~mask; 127 } 128 return c; 129 } 130 131 // SAFE: returns -1 on invalid UTF-8 sequence. 132 SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) { 133 SkASSERT(ptr && *ptr); 134 SkASSERT(*ptr < end); 135 const uint8_t* p = (const uint8_t*)*ptr; 136 int c = *p; 137 int hic = c << 24; 138 139 if (!utf8_byte_is_leading_byte(c)) { 140 return -1; 141 } 142 if (hic < 0) { 143 uint32_t mask = (uint32_t)~0x3F; 144 hic = SkLeftShift(hic, 1); 145 do { 146 ++p; 147 if (p >= (const uint8_t*)end) { 148 return -1; 149 } 150 // check before reading off end of array. 151 uint8_t nextByte = *p; 152 if (!utf8_byte_is_continuation(nextByte)) { 153 return -1; 154 } 155 c = (c << 6) | (nextByte & 0x3F); 156 mask <<= 5; 157 } while ((hic = SkLeftShift(hic, 1)) < 0); 158 c &= ~mask; 159 } 160 *ptr = (char*)p + 1; 161 return c; 162 } 163 164 SkUnichar SkUTF8_NextUnichar(const char** ptr) { 165 SkASSERT(ptr && *ptr); 166 167 const uint8_t* p = (const uint8_t*)*ptr; 168 int c = *p; 169 int hic = c << 24; 170 171 assert_utf8_leadingbyte(c); 172 173 if (hic < 0) { 174 uint32_t mask = (uint32_t)~0x3F; 175 hic = SkLeftShift(hic, 1); 176 do { 177 c = (c << 6) | (*++p & 0x3F); 178 mask <<= 5; 179 } while ((hic = SkLeftShift(hic, 1)) < 0); 180 c &= ~mask; 181 } 182 *ptr = (char*)p + 1; 183 return c; 184 } 185 186 SkUnichar SkUTF8_PrevUnichar(const char** ptr) { 187 SkASSERT(ptr && *ptr); 188 189 const char* p = *ptr; 190 191 if (*--p & 0x80) { 192 while (*--p & 0x40) { 193 ; 194 } 195 } 196 197 *ptr = (char*)p; 198 return SkUTF8_NextUnichar(&p); 199 } 200 201 size_t SkUTF8_FromUnichar(SkUnichar uni, char utf8[]) { 202 if ((uint32_t)uni > 0x10FFFF) { 203 SkDEBUGFAIL("bad unichar"); 204 return 0; 205 } 206 207 if (uni <= 127) { 208 if (utf8) { 209 *utf8 = (char)uni; 210 } 211 return 1; 212 } 213 214 char tmp[4]; 215 char* p = tmp; 216 size_t count = 1; 217 218 SkDEBUGCODE(SkUnichar orig = uni;) 219 220 while (uni > 0x7F >> count) { 221 *p++ = (char)(0x80 | (uni & 0x3F)); 222 uni >>= 6; 223 count += 1; 224 } 225 226 if (utf8) { 227 p = tmp; 228 utf8 += count; 229 while (p < tmp + count - 1) { 230 *--utf8 = *p++; 231 } 232 *--utf8 = (char)(~(0xFF >> count) | uni); 233 } 234 235 SkASSERT(utf8 == nullptr || orig == SkUTF8_ToUnichar(utf8)); 236 return count; 237 } 238 239 /////////////////////////////////////////////////////////////////////////////// 240 241 int SkUTF16_CountUnichars(const uint16_t src[]) { 242 SkASSERT(src); 243 244 int count = 0; 245 unsigned c; 246 while ((c = *src++) != 0) { 247 SkASSERT(!SkUTF16_IsLowSurrogate(c)); 248 if (SkUTF16_IsHighSurrogate(c)) { 249 c = *src++; 250 SkASSERT(SkUTF16_IsLowSurrogate(c)); 251 } 252 count += 1; 253 } 254 return count; 255 } 256 257 int SkUTF16_CountUnichars(const uint16_t src[], int numberOf16BitValues) { 258 SkASSERT(src); 259 260 const uint16_t* stop = src + numberOf16BitValues; 261 int count = 0; 262 while (src < stop) { 263 unsigned c = *src++; 264 SkASSERT(!SkUTF16_IsLowSurrogate(c)); 265 if (SkUTF16_IsHighSurrogate(c)) { 266 SkASSERT(src < stop); 267 c = *src++; 268 SkASSERT(SkUTF16_IsLowSurrogate(c)); 269 } 270 count += 1; 271 } 272 return count; 273 } 274 275 SkUnichar SkUTF16_NextUnichar(const uint16_t** srcPtr) { 276 SkASSERT(srcPtr && *srcPtr); 277 278 const uint16_t* src = *srcPtr; 279 SkUnichar c = *src++; 280 281 SkASSERT(!SkUTF16_IsLowSurrogate(c)); 282 if (SkUTF16_IsHighSurrogate(c)) { 283 unsigned c2 = *src++; 284 SkASSERT(SkUTF16_IsLowSurrogate(c2)); 285 286 // c = ((c & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000 287 // c = (((c & 0x3FF) + 64) << 10) + (c2 & 0x3FF) 288 c = (c << 10) + c2 + (0x10000 - (0xD800 << 10) - 0xDC00); 289 } 290 *srcPtr = src; 291 return c; 292 } 293 294 SkUnichar SkUTF16_PrevUnichar(const uint16_t** srcPtr) { 295 SkASSERT(srcPtr && *srcPtr); 296 297 const uint16_t* src = *srcPtr; 298 SkUnichar c = *--src; 299 300 SkASSERT(!SkUTF16_IsHighSurrogate(c)); 301 if (SkUTF16_IsLowSurrogate(c)) { 302 unsigned c2 = *--src; 303 SkASSERT(SkUTF16_IsHighSurrogate(c2)); 304 c = (c2 << 10) + c + (0x10000 - (0xD800 << 10) - 0xDC00); 305 } 306 *srcPtr = src; 307 return c; 308 } 309 310 size_t SkUTF16_FromUnichar(SkUnichar uni, uint16_t dst[]) { 311 SkASSERT((unsigned)uni <= 0x10FFFF); 312 313 int extra = (uni > 0xFFFF); 314 315 if (dst) { 316 if (extra) { 317 // dst[0] = SkToU16(0xD800 | ((uni - 0x10000) >> 10)); 318 // dst[0] = SkToU16(0xD800 | ((uni >> 10) - 64)); 319 dst[0] = SkToU16((0xD800 - 64) + (uni >> 10)); 320 dst[1] = SkToU16(0xDC00 | (uni & 0x3FF)); 321 322 SkASSERT(SkUTF16_IsHighSurrogate(dst[0])); 323 SkASSERT(SkUTF16_IsLowSurrogate(dst[1])); 324 } else { 325 dst[0] = SkToU16(uni); 326 SkASSERT(!SkUTF16_IsHighSurrogate(dst[0])); 327 SkASSERT(!SkUTF16_IsLowSurrogate(dst[0])); 328 } 329 } 330 return 1 + extra; 331 } 332 333 size_t SkUTF16_ToUTF8(const uint16_t utf16[], int numberOf16BitValues, 334 char utf8[]) { 335 SkASSERT(numberOf16BitValues >= 0); 336 if (numberOf16BitValues <= 0) { 337 return 0; 338 } 339 340 SkASSERT(utf16 != nullptr); 341 342 const uint16_t* stop = utf16 + numberOf16BitValues; 343 size_t size = 0; 344 345 if (utf8 == nullptr) { // just count 346 while (utf16 < stop) { 347 size += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), nullptr); 348 } 349 } else { 350 char* start = utf8; 351 while (utf16 < stop) { 352 utf8 += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), utf8); 353 } 354 size = utf8 - start; 355 } 356 return size; 357 } 358 359 const char SkHexadecimalDigits::gUpper[16] = 360 { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; 361 const char SkHexadecimalDigits::gLower[16] = 362 { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; 363 364