Home | History | Annotate | Download | only in core
      1 /*
      2  * Copyright 2006 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 
      9 #include "SkUtils.h"
     10 
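/*  UTF-8 leading-byte bit patterns and the total sequence length each announces:

    0xxxxxxx    1 total
    10xxxxxx    // never a leading byte
    110xxxxx    2 total
    1110xxxx    3 total
    11110xxx    4 total

    The length is looked up in a table packed into one 32-bit constant: two
    bits per possible value of the byte's high nibble, each field holding
    (length - 1). The field for nibble 0xF occupies the top two bits, the
    fields for nibbles 0x8-0xB (continuation bytes) are don't-cares, and the
    fields for nibbles 0x0-0x7 (ASCII) are zero:

    11 10 01 01 xx xx xx xx 0...
    0xE5XX0000
    0xE5 << 24

    so ((table >> (2 * (byte >> 4))) & 3) + 1 is the sequence length.
*/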
     21 
     22 static bool utf8_byte_is_valid(uint8_t c) {
     23     return c < 0xF5 && (c & 0xFE) != 0xC0;
     24 }
     25 static bool utf8_byte_is_continuation(uint8_t c) {
     26     return  (c & 0xC0) == 0x80;
     27 }
     28 static bool utf8_byte_is_leading_byte(uint8_t c) {
     29     return utf8_byte_is_valid(c) && !utf8_byte_is_continuation(c);
     30 }
     31 
     32 #ifdef SK_DEBUG
     33     static void assert_utf8_leadingbyte(unsigned c) {
     34         SkASSERT(utf8_byte_is_leading_byte(SkToU8(c)));
     35     }
     36 
     37     int SkUTF8_LeadByteToCount(unsigned c) {
     38         assert_utf8_leadingbyte(c);
     39         return (((0xE5 << 24) >> (c >> 4 << 1)) & 3) + 1;
     40     }
     41 #else
     42     #define assert_utf8_leadingbyte(c)
     43 #endif
     44 
     45 /**
     46  * @returns -1  iff invalid UTF8 byte,
     47  *           0  iff UTF8 continuation byte,
     48  *           1  iff ASCII byte,
     49  *           2  iff leading byte of 2-byte sequence,
     50  *           3  iff leading byte of 3-byte sequence, and
     51  *           4  iff leading byte of 4-byte sequence.
     52  *
     53  * I.e.: if return value > 0, then gives length of sequence.
     54 */
     55 static int utf8_byte_type(uint8_t c) {
     56     if (c < 0x80) {
     57         return 1;
     58     } else if (c < 0xC0) {
     59         return 0;
     60     } else if (c < 0xF5 && (c & 0xFE) != 0xC0) { // "octet values C0, C1, F5 to FF never appear"
     61         return (((0xE5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1;
     62     } else {
     63         return -1;
     64     }
     65 }
     66 static bool utf8_type_is_valid_leading_byte(int type) { return type > 0; }
     67 
     68 int SkUTF8_CountUnichars(const char utf8[]) {
     69     SkASSERT(utf8);
     70 
     71     int count = 0;
     72 
     73     for (;;) {
     74         int c = *(const uint8_t*)utf8;
     75         if (c == 0) {
     76             break;
     77         }
     78         utf8 += SkUTF8_LeadByteToCount(c);
     79         count += 1;
     80     }
     81     return count;
     82 }
     83 
     84 // SAFE: returns -1 if invalid UTF-8
     85 int SkUTF8_CountUnichars(const void* text, size_t byteLength) {
     86     SkASSERT(text);
     87     const char* utf8 = static_cast<const char*>(text);
     88     if (byteLength == 0) {
     89         return 0;
     90     }
     91 
     92     int         count = 0;
     93     const char* stop = utf8 + byteLength;
     94 
     95     while (utf8 < stop) {
     96         int type = utf8_byte_type(*(const uint8_t*)utf8);
     97         SkASSERT(type >= -1 && type <= 4);
     98         if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
     99             // Sequence extends beyond end.
    100             return -1;
    101         }
    102         while(type-- > 1) {
    103             ++utf8;
    104             if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
    105                 return -1;
    106             }
    107         }
    108         ++utf8;
    109         ++count;
    110     }
    111     return count;
    112 }
    113 
    114 SkUnichar SkUTF8_ToUnichar(const char utf8[]) {
    115     SkASSERT(utf8);
    116 
    117     const uint8_t*  p = (const uint8_t*)utf8;
    118     int             c = *p;
    119     int             hic = c << 24;
    120 
    121     assert_utf8_leadingbyte(c);
    122 
    123     if (hic < 0) {
    124         uint32_t mask = (uint32_t)~0x3F;
    125         hic = SkLeftShift(hic, 1);
    126         do {
    127             c = (c << 6) | (*++p & 0x3F);
    128             mask <<= 5;
    129         } while ((hic = SkLeftShift(hic, 1)) < 0);
    130         c &= ~mask;
    131     }
    132     return c;
    133 }
    134 
    135 // SAFE: returns -1 on invalid UTF-8 sequence.
    136 SkUnichar SkUTF8_NextUnicharWithError(const char** ptr, const char* end) {
    137     SkASSERT(ptr && *ptr);
    138     SkASSERT(*ptr < end);
    139     const uint8_t*  p = (const uint8_t*)*ptr;
    140     int             c = *p;
    141     int             hic = c << 24;
    142 
    143     if (!utf8_byte_is_leading_byte(c)) {
    144         return -1;
    145     }
    146     if (hic < 0) {
    147         uint32_t mask = (uint32_t)~0x3F;
    148         hic = SkLeftShift(hic, 1);
    149         do {
    150             ++p;
    151             if (p >= (const uint8_t*)end) {
    152                 return -1;
    153             }
    154             // check before reading off end of array.
    155             uint8_t nextByte = *p;
    156             if (!utf8_byte_is_continuation(nextByte)) {
    157                 return -1;
    158             }
    159             c = (c << 6) | (nextByte & 0x3F);
    160             mask <<= 5;
    161         } while ((hic = SkLeftShift(hic, 1)) < 0);
    162         c &= ~mask;
    163     }
    164     *ptr = (char*)p + 1;
    165     return c;
    166 }
    167 
    168 SkUnichar SkUTF8_NextUnichar(const char** ptr) {
    169     SkASSERT(ptr && *ptr);
    170 
    171     const uint8_t*  p = (const uint8_t*)*ptr;
    172     int             c = *p;
    173     int             hic = c << 24;
    174 
    175     assert_utf8_leadingbyte(c);
    176 
    177     if (hic < 0) {
    178         uint32_t mask = (uint32_t)~0x3F;
    179         hic = SkLeftShift(hic, 1);
    180         do {
    181             c = (c << 6) | (*++p & 0x3F);
    182             mask <<= 5;
    183         } while ((hic = SkLeftShift(hic, 1)) < 0);
    184         c &= ~mask;
    185     }
    186     *ptr = (char*)p + 1;
    187     return c;
    188 }
    189 
    190 SkUnichar SkUTF8_PrevUnichar(const char** ptr) {
    191     SkASSERT(ptr && *ptr);
    192 
    193     const char* p = *ptr;
    194 
    195     if (*--p & 0x80) {
    196         while (*--p & 0x40) {
    197             ;
    198         }
    199     }
    200 
    201     *ptr = (char*)p;
    202     return SkUTF8_NextUnichar(&p);
    203 }
    204 
    205 size_t SkUTF8_FromUnichar(SkUnichar uni, char utf8[]) {
    206     if ((uint32_t)uni > 0x10FFFF) {
    207         SkDEBUGFAIL("bad unichar");
    208         return 0;
    209     }
    210 
    211     if (uni <= 127) {
    212         if (utf8) {
    213             *utf8 = (char)uni;
    214         }
    215         return 1;
    216     }
    217 
    218     char    tmp[4];
    219     char*   p = tmp;
    220     size_t  count = 1;
    221 
    222     SkDEBUGCODE(SkUnichar orig = uni;)
    223 
    224     while (uni > 0x7F >> count) {
    225         *p++ = (char)(0x80 | (uni & 0x3F));
    226         uni >>= 6;
    227         count += 1;
    228     }
    229 
    230     if (utf8) {
    231         p = tmp;
    232         utf8 += count;
    233         while (p < tmp + count - 1) {
    234             *--utf8 = *p++;
    235         }
    236         *--utf8 = (char)(~(0xFF >> count) | uni);
    237     }
    238 
    239     SkASSERT(utf8 == nullptr || orig == SkUTF8_ToUnichar(utf8));
    240     return count;
    241 }
    242 
    243 ///////////////////////////////////////////////////////////////////////////////
    244 
    245 int SkUTF16_CountUnichars(const uint16_t src[]) {
    246     SkASSERT(src);
    247 
    248     int count = 0;
    249     unsigned c;
    250     while ((c = *src++) != 0) {
    251         SkASSERT(!SkUTF16_IsLowSurrogate(c));
    252         if (SkUTF16_IsHighSurrogate(c)) {
    253             c = *src++;
    254             SkASSERT(SkUTF16_IsLowSurrogate(c));
    255         }
    256         count += 1;
    257     }
    258     return count;
    259 }
    260 
    261 // returns -1 on error
    262 int SkUTF16_CountUnichars(const void* text, size_t byteLength) {
    263     SkASSERT(text);
    264     if (byteLength == 0) {
    265         return 0;
    266     }
    267     if (!SkIsAlign2(intptr_t(text)) || !SkIsAlign2(byteLength)) {
    268         return -1;
    269     }
    270 
    271     const uint16_t* src = static_cast<const uint16_t*>(text);
    272     const uint16_t* stop = src + (byteLength >> 1);
    273     int count = 0;
    274     while (src < stop) {
    275         unsigned c = *src++;
    276         SkASSERT(!SkUTF16_IsLowSurrogate(c));
    277         if (SkUTF16_IsHighSurrogate(c)) {
    278             if (src >= stop) {
    279                 return -1;
    280             }
    281             c = *src++;
    282             if (!SkUTF16_IsLowSurrogate(c)) {
    283                 return -1;
    284             }
    285         }
    286         count += 1;
    287     }
    288     return count;
    289 }
    290 
    291 SkUnichar SkUTF16_NextUnichar(const uint16_t** srcPtr) {
    292     SkASSERT(srcPtr && *srcPtr);
    293 
    294     const uint16_t* src = *srcPtr;
    295     SkUnichar       c = *src++;
    296 
    297     SkASSERT(!SkUTF16_IsLowSurrogate(c));
    298     if (SkUTF16_IsHighSurrogate(c)) {
    299         unsigned c2 = *src++;
    300         SkASSERT(SkUTF16_IsLowSurrogate(c2));
    301 
    302         // c = ((c & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000
    303         // c = (((c & 0x3FF) + 64) << 10) + (c2 & 0x3FF)
    304         c = (c << 10) + c2 + (0x10000 - (0xD800 << 10) - 0xDC00);
    305     }
    306     *srcPtr = src;
    307     return c;
    308 }
    309 
    310 SkUnichar SkUTF16_PrevUnichar(const uint16_t** srcPtr) {
    311     SkASSERT(srcPtr && *srcPtr);
    312 
    313     const uint16_t* src = *srcPtr;
    314     SkUnichar       c = *--src;
    315 
    316     SkASSERT(!SkUTF16_IsHighSurrogate(c));
    317     if (SkUTF16_IsLowSurrogate(c)) {
    318         unsigned c2 = *--src;
    319         SkASSERT(SkUTF16_IsHighSurrogate(c2));
    320         c = (c2 << 10) + c + (0x10000 - (0xD800 << 10) - 0xDC00);
    321     }
    322     *srcPtr = src;
    323     return c;
    324 }
    325 
    326 size_t SkUTF16_FromUnichar(SkUnichar uni, uint16_t dst[]) {
    327     SkASSERT((unsigned)uni <= 0x10FFFF);
    328 
    329     int extra = (uni > 0xFFFF);
    330 
    331     if (dst) {
    332         if (extra) {
    333             // dst[0] = SkToU16(0xD800 | ((uni - 0x10000) >> 10));
    334             // dst[0] = SkToU16(0xD800 | ((uni >> 10) - 64));
    335             dst[0] = SkToU16((0xD800 - 64) + (uni >> 10));
    336             dst[1] = SkToU16(0xDC00 | (uni & 0x3FF));
    337 
    338             SkASSERT(SkUTF16_IsHighSurrogate(dst[0]));
    339             SkASSERT(SkUTF16_IsLowSurrogate(dst[1]));
    340         } else {
    341             dst[0] = SkToU16(uni);
    342             SkASSERT(!SkUTF16_IsHighSurrogate(dst[0]));
    343             SkASSERT(!SkUTF16_IsLowSurrogate(dst[0]));
    344         }
    345     }
    346     return 1 + extra;
    347 }
    348 
    349 size_t SkUTF16_ToUTF8(const uint16_t utf16[], int numberOf16BitValues,
    350                       char utf8[]) {
    351     SkASSERT(numberOf16BitValues >= 0);
    352     if (numberOf16BitValues <= 0) {
    353         return 0;
    354     }
    355 
    356     SkASSERT(utf16 != nullptr);
    357 
    358     const uint16_t* stop = utf16 + numberOf16BitValues;
    359     size_t          size = 0;
    360 
    361     if (utf8 == nullptr) {    // just count
    362         while (utf16 < stop) {
    363             size += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), nullptr);
    364         }
    365     } else {
    366         char* start = utf8;
    367         while (utf16 < stop) {
    368             utf8 += SkUTF8_FromUnichar(SkUTF16_NextUnichar(&utf16), utf8);
    369         }
    370         size = utf8 - start;
    371     }
    372     return size;
    373 }
    374 
// Lookup tables of hexadecimal digit characters, indexed by value 0..15.
// gUpper uses 'A'..'F' for 10..15; gLower uses 'a'..'f'. The arrays hold
// exactly 16 characters and are not NUL-terminated.
const char SkHexadecimalDigits::gUpper[16] =
           { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
const char SkHexadecimalDigits::gLower[16] =
           { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
    379 
    380 
    381 // returns -1 on error
    382 int SkUTF32_CountUnichars(const void* text, size_t byteLength) {
    383     if (byteLength == 0) {
    384         return 0;
    385     }
    386     if (!SkIsAlign4(intptr_t(text)) || !SkIsAlign4(byteLength)) {
    387         return -1;
    388     }
    389     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
    390     const uint32_t* ptr = static_cast<const uint32_t*>(text);
    391     const uint32_t* stop = ptr + (byteLength >> 2);
    392     while (ptr < stop) {
    393         if (*ptr & kInvalidUnicharMask) {
    394             return -1;
    395         }
    396         ptr += 1;
    397     }
    398     return SkToInt(byteLength >> 2);
    399 }
    400 
    401