Home | History | Annotate | Download | only in runtime
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "utf.h"
     18 
     19 #include "base/logging.h"
     20 #include "mirror/array.h"
     21 #include "mirror/object-inl.h"
     22 #include "utf-inl.h"
     23 
     24 namespace art {
     25 
     26 size_t CountModifiedUtf8Chars(const char* utf8) {
     27   size_t len = 0;
     28   int ic;
     29   while ((ic = *utf8++) != '\0') {
     30     len++;
     31     if ((ic & 0x80) == 0) {
     32       // one-byte encoding
     33       continue;
     34     }
     35     // two- or three-byte encoding
     36     utf8++;
     37     if ((ic & 0x20) == 0) {
     38       // two-byte encoding
     39       continue;
     40     }
     41     utf8++;
     42     if ((ic & 0x10) == 0) {
     43       // three-byte encoding
     44       continue;
     45     }
     46 
     47     // four-byte encoding: needs to be converted into a surrogate
     48     // pair.
     49     utf8++;
     50     len++;
     51   }
     52   return len;
     53 }
     54 
     55 void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
     56   while (*utf8_data_in != '\0') {
     57     const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
     58     const uint16_t leading = GetLeadingUtf16Char(ch);
     59     const uint16_t trailing = GetTrailingUtf16Char(ch);
     60 
     61     *utf16_data_out++ = leading;
     62     if (trailing != 0) {
     63       *utf16_data_out++ = trailing;
     64     }
     65   }
     66 }
     67 
     68 void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count) {
     69   while (char_count--) {
     70     const uint16_t ch = *utf16_in++;
     71     if (ch > 0 && ch <= 0x7f) {
     72       *utf8_out++ = ch;
     73     } else {
     74       // char_count == 0 here implies we've encountered an unpaired
     75       // surrogate and we have no choice but to encode it as 3-byte UTF
     76       // sequence. Note that unpaired surrogates can occur as a part of
     77       // "normal" operation.
     78       if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
     79         const uint16_t ch2 = *utf16_in;
     80 
     81         // Check if the other half of the pair is within the expected
     82         // range. If it isn't, we will have to emit both "halves" as
     83         // separate 3 byte sequences.
     84         if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
     85           utf16_in++;
     86           char_count--;
     87           const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
     88           *utf8_out++ = (code_point >> 18) | 0xf0;
     89           *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
     90           *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
     91           *utf8_out++ = (code_point & 0x3f) | 0x80;
     92           continue;
     93         }
     94       }
     95 
     96       if (ch > 0x07ff) {
     97         // Three byte encoding.
     98         *utf8_out++ = (ch >> 12) | 0xe0;
     99         *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
    100         *utf8_out++ = (ch & 0x3f) | 0x80;
    101       } else /*(ch > 0x7f || ch == 0)*/ {
    102         // Two byte encoding.
    103         *utf8_out++ = (ch >> 6) | 0xc0;
    104         *utf8_out++ = (ch & 0x3f) | 0x80;
    105       }
    106     }
    107   }
    108 }
    109 
    110 int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count) {
    111   uint32_t hash = 0;
    112   while (char_count--) {
    113     hash = hash * 31 + *chars++;
    114   }
    115   return static_cast<int32_t>(hash);
    116 }
    117 
    118 size_t ComputeModifiedUtf8Hash(const char* chars) {
    119   size_t hash = 0;
    120   while (*chars != '\0') {
    121     hash = hash * 31 + *chars++;
    122   }
    123   return static_cast<int32_t>(hash);
    124 }
    125 
    126 int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
    127                                                 size_t utf16_length) {
    128   for (;;) {
    129     if (*utf8 == '\0') {
    130       return (utf16_length == 0) ? 0 : -1;
    131     } else if (utf16_length == 0) {
    132       return 1;
    133     }
    134 
    135     const uint32_t pair = GetUtf16FromUtf8(&utf8);
    136 
    137     // First compare the leading utf16 char.
    138     const uint16_t lhs = GetLeadingUtf16Char(pair);
    139     const uint16_t rhs = *utf16++;
    140     --utf16_length;
    141     if (lhs != rhs) {
    142       return lhs > rhs ? 1 : -1;
    143     }
    144 
    145     // Then compare the trailing utf16 char. First check if there
    146     // are any characters left to consume.
    147     const uint16_t lhs2 = GetTrailingUtf16Char(pair);
    148     if (lhs2 != 0) {
    149       if (utf16_length == 0) {
    150         return 1;
    151       }
    152 
    153       const uint16_t rhs2 = *utf16++;
    154       --utf16_length;
    155       if (lhs2 != rhs2) {
    156         return lhs2 > rhs2 ? 1 : -1;
    157       }
    158     }
    159   }
    160 }
    161 
    162 size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
    163   size_t result = 0;
    164   while (char_count--) {
    165     const uint16_t ch = *chars++;
    166     if (ch > 0 && ch <= 0x7f) {
    167       ++result;
    168     } else if (ch >= 0xd800 && ch <= 0xdbff) {
    169       if (char_count > 0) {
    170         const uint16_t ch2 = *chars;
    171         // If we find a properly paired surrogate, we emit it as a 4 byte
    172         // UTF sequence. If we find an unpaired leading or trailing surrogate,
    173         // we emit it as a 3 byte sequence like would have done earlier.
    174         if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
    175           chars++;
    176           char_count--;
    177 
    178           result += 4;
    179         } else {
    180           result += 3;
    181         }
    182       } else {
    183         // This implies we found an unpaired trailing surrogate at the end
    184         // of a string.
    185         result += 3;
    186       }
    187     } else if (ch > 0x7ff) {
    188       result += 3;
    189     } else {
    190       result += 2;
    191     }
    192   }
    193   return result;
    194 }
    195 
    196 }  // namespace art
    197