Home | History | Annotate | Download | only in dex
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "utf.h"
     18 
     19 #include <map>
     20 #include <vector>
     21 
     22 #include <android-base/stringprintf.h>
     23 
     24 #include "gtest/gtest.h"
     25 #include "utf-inl.h"
     26 
     27 namespace art {
     28 
     29 class UtfTest : public testing::Test {};
     30 
     31 TEST_F(UtfTest, GetLeadingUtf16Char) {
     32   EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));
     33 }
     34 
     35 TEST_F(UtfTest, GetTrailingUtf16Char) {
     36   EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));
     37   EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
     38 }
     39 
     40 #define EXPECT_ARRAY_POSITION(expected, end, start) \
     41   EXPECT_EQ(static_cast<uintptr_t>(expected), \
     42             reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start));
     43 
     44 // A test string containing one, two, three and four byte UTF-8 sequences.
     45 static const uint8_t kAllSequences[] = {
     46     0x24,
     47     0xc2, 0xa2,
     48     0xe2, 0x82, 0xac,
     49     0xf0, 0x9f, 0x8f, 0xa0,
     50     0x00
     51 };
     52 
     53 // A test string that contains a UTF-8 encoding of a surrogate pair
     54 // (code point = U+10400).
     55 static const uint8_t kSurrogateEncoding[] = {
     56     0xed, 0xa0, 0x81,
     57     0xed, 0xb0, 0x80,
     58     0x00
     59 };
     60 
     61 TEST_F(UtfTest, GetUtf16FromUtf8) {
     62   const char* const start = reinterpret_cast<const char*>(kAllSequences);
     63   const char* ptr = start;
     64   uint32_t pair = 0;
     65 
     66   // Single byte sequence.
     67   pair = GetUtf16FromUtf8(&ptr);
     68   EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
     69   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
     70   EXPECT_ARRAY_POSITION(1, ptr, start);
     71 
     72   // Two byte sequence.
     73   pair = GetUtf16FromUtf8(&ptr);
     74   EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
     75   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
     76   EXPECT_ARRAY_POSITION(3, ptr, start);
     77 
     78   // Three byte sequence.
     79   pair = GetUtf16FromUtf8(&ptr);
     80   EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
     81   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
     82   EXPECT_ARRAY_POSITION(6, ptr, start);
     83 
     84   // Four byte sequence
     85   pair = GetUtf16FromUtf8(&ptr);
     86   EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
     87   EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
     88   EXPECT_ARRAY_POSITION(10, ptr, start);
     89 
     90   // Null terminator.
     91   pair = GetUtf16FromUtf8(&ptr);
     92   EXPECT_EQ(0, GetLeadingUtf16Char(pair));
     93   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
     94   EXPECT_ARRAY_POSITION(11, ptr, start);
     95 }
     96 
     97 TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
     98   const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
     99   const char* ptr = start;
    100   uint32_t pair = 0;
    101 
    102   pair = GetUtf16FromUtf8(&ptr);
    103   EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
    104   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
    105   EXPECT_ARRAY_POSITION(3, ptr, start);
    106 
    107   pair = GetUtf16FromUtf8(&ptr);
    108   EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
    109   EXPECT_EQ(0, GetTrailingUtf16Char(pair));
    110   EXPECT_ARRAY_POSITION(6, ptr, start);
    111 }
    112 
    113 TEST_F(UtfTest, CountModifiedUtf8Chars) {
    114   EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));
    115   EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
    116 }
    117 
    118 static void AssertConversion(const std::vector<uint16_t>& input,
    119                              const std::vector<uint8_t>& expected) {
    120   ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
    121 
    122   std::vector<uint8_t> output(expected.size());
    123   ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
    124                              &input[0], input.size());
    125   EXPECT_EQ(expected, output);
    126 }
    127 
    128 TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
    129   // Surrogate pairs will be converted into 4 byte sequences.
    130   AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
    131 
    132   // Three byte encodings that are below & above the leading surrogate
    133   // range respectively.
    134   AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
    135   AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
    136   // Two byte encoding.
    137   AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
    138 
    139   // Two byte special case : 0 must use an overlong encoding.
    140   AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
    141 
    142   // One byte encoding.
    143   AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
    144 
    145   AssertConversion({
    146       0xd802, 0xdc02,  // Surrogate pair.
    147       0xdef0, 0xdcff,  // Three byte encodings.
    148       0x0101, 0x0000,  // Two byte encodings.
    149       'p'   , 'p'      // One byte encoding.
    150     }, {
    151       0xf0, 0x90, 0xa0, 0x82,
    152       0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
    153       0xc4, 0x81, 0xc0, 0x80,
    154       0x70, 0x70
    155     });
    156 }
    157 
    158 TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
    159   // Unpaired trailing surrogate at the end of input.
    160   AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
    161   // Unpaired (or incorrectly paired) surrogates in the middle of the input.
    162   const std::map<std::vector<uint16_t>, std::vector<uint8_t>> prefixes {
    163       {{ 'h' }, { 'h' }},
    164       {{ 0 }, { 0xc0, 0x80 }},
    165       {{ 0x81 }, { 0xc2, 0x81 }},
    166       {{ 0x801 }, { 0xe0, 0xa0, 0x81 }},
    167   };
    168   const std::map<std::vector<uint16_t>, std::vector<uint8_t>> suffixes {
    169       {{ 'e' }, { 'e' }},
    170       {{ 0 }, { 0xc0, 0x80 }},
    171       {{ 0x7ff }, { 0xdf, 0xbf }},
    172       {{ 0xffff }, { 0xef, 0xbf, 0xbf }},
    173   };
    174   const std::map<std::vector<uint16_t>, std::vector<uint8_t>> tests {
    175       {{ 0xd801 }, { 0xed, 0xa0, 0x81 }},
    176       {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }},
    177       {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }},
    178       {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }},
    179   };
    180   for (const auto& prefix : prefixes) {
    181     const std::vector<uint16_t>& prefix_in = prefix.first;
    182     const std::vector<uint8_t>& prefix_out = prefix.second;
    183     for (const auto& test : tests) {
    184       const std::vector<uint16_t>& test_in = test.first;
    185       const std::vector<uint8_t>& test_out = test.second;
    186       for (const auto& suffix : suffixes) {
    187         const std::vector<uint16_t>& suffix_in = suffix.first;
    188         const std::vector<uint8_t>& suffix_out = suffix.second;
    189         std::vector<uint16_t> in = prefix_in;
    190         in.insert(in.end(), test_in.begin(), test_in.end());
    191         in.insert(in.end(), suffix_in.begin(), suffix_in.end());
    192         std::vector<uint8_t> out = prefix_out;
    193         out.insert(out.end(), test_out.begin(), test_out.end());
    194         out.insert(out.end(), suffix_out.begin(), suffix_out.end());
    195         AssertConversion(in, out);
    196       }
    197     }
    198   }
    199 }
    200 
    201 // Old versions of functions, here to compare answers with optimized versions.
    202 
    203 size_t CountModifiedUtf8Chars_reference(const char* utf8) {
    204   size_t len = 0;
    205   int ic;
    206   while ((ic = *utf8++) != '\0') {
    207     len++;
    208     if ((ic & 0x80) == 0) {
    209       // one-byte encoding
    210       continue;
    211     }
    212     // two- or three-byte encoding
    213     utf8++;
    214     if ((ic & 0x20) == 0) {
    215       // two-byte encoding
    216       continue;
    217     }
    218     utf8++;
    219     if ((ic & 0x10) == 0) {
    220       // three-byte encoding
    221       continue;
    222     }
    223 
    224     // four-byte encoding: needs to be converted into a surrogate
    225     // pair.
    226     utf8++;
    227     len++;
    228   }
    229   return len;
    230 }
    231 
    232 static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
    233   size_t result = 0;
    234   while (char_count--) {
    235     const uint16_t ch = *chars++;
    236     if (ch > 0 && ch <= 0x7f) {
    237       ++result;
    238     } else if (ch >= 0xd800 && ch <= 0xdbff) {
    239       if (char_count > 0) {
    240         const uint16_t ch2 = *chars;
    241         // If we find a properly paired surrogate, we emit it as a 4 byte
    242         // UTF sequence. If we find an unpaired leading or trailing surrogate,
    243         // we emit it as a 3 byte sequence like would have done earlier.
    244         if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
    245           chars++;
    246           char_count--;
    247 
    248           result += 4;
    249         } else {
    250           result += 3;
    251         }
    252       } else {
    253         // This implies we found an unpaired trailing surrogate at the end
    254         // of a string.
    255         result += 3;
    256       }
    257     } else if (ch > 0x7ff) {
    258       result += 3;
    259     } else {
    260       result += 2;
    261     }
    262   }
    263   return result;
    264 }
    265 
    266 static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
    267                                                  size_t char_count) {
    268   while (char_count--) {
    269     const uint16_t ch = *utf16_in++;
    270     if (ch > 0 && ch <= 0x7f) {
    271       *utf8_out++ = ch;
    272     } else {
    273       // Char_count == 0 here implies we've encountered an unpaired
    274       // surrogate and we have no choice but to encode it as 3-byte UTF
    275       // sequence. Note that unpaired surrogates can occur as a part of
    276       // "normal" operation.
    277       if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
    278         const uint16_t ch2 = *utf16_in;
    279 
    280         // Check if the other half of the pair is within the expected
    281         // range. If it isn't, we will have to emit both "halves" as
    282         // separate 3 byte sequences.
    283         if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
    284           utf16_in++;
    285           char_count--;
    286           const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
    287           *utf8_out++ = (code_point >> 18) | 0xf0;
    288           *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
    289           *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
    290           *utf8_out++ = (code_point & 0x3f) | 0x80;
    291           continue;
    292         }
    293       }
    294 
    295       if (ch > 0x07ff) {
    296         // Three byte encoding.
    297         *utf8_out++ = (ch >> 12) | 0xe0;
    298         *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
    299         *utf8_out++ = (ch & 0x3f) | 0x80;
    300       } else /*(ch > 0x7f || ch == 0)*/ {
    301         // Two byte encoding.
    302         *utf8_out++ = (ch >> 6) | 0xc0;
    303         *utf8_out++ = (ch & 0x3f) | 0x80;
    304       }
    305     }
    306   }
    307 }
    308 
    309 // Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
    310 
    311 static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
    312   first = (code_point >> 10) + 0xd7c0;
    313   second = (code_point & 0x03ff) + 0xdc00;
    314 }
    315 
    316 static void testConversions(uint16_t *buf, int char_count) {
    317   char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
    318   uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
    319   int byte_count_test, byte_count_reference;
    320   int char_count_test, char_count_reference;
    321 
    322   // Calculate the number of utf-8 bytes for the utf-16 chars.
    323   byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
    324   byte_count_test = CountUtf8Bytes(buf, char_count);
    325   EXPECT_EQ(byte_count_reference, byte_count_test);
    326 
    327   // Convert the utf-16 string to utf-8 bytes.
    328   ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
    329   ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
    330   for (int i = 0; i < byte_count_test; ++i) {
    331     EXPECT_EQ(bytes_reference[i], bytes_test[i]);
    332   }
    333 
    334   // Calculate the number of utf-16 chars from the utf-8 bytes.
    335   bytes_reference[byte_count_reference] = 0;  // Reference function needs null termination.
    336   char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
    337   char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
    338   EXPECT_EQ(char_count, char_count_reference);
    339   EXPECT_EQ(char_count, char_count_test);
    340 
    341   // Convert the utf-8 bytes back to utf-16 chars.
    342   // Does not need copied _reference version of the function because the original
    343   // function with the old API is retained for debug/testing code.
    344   ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
    345   ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
    346   for (int i = 0; i < char_count_test; ++i) {
    347     EXPECT_EQ(buf[i], out_buf_reference[i]);
    348     EXPECT_EQ(buf[i], out_buf_test[i]);
    349   }
    350 }
    351 
    352 TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
    353   for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) {
    354     uint16_t buf[4] = { 0 };
    355     if (codePoint <= 0xffff) {
    356       if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
    357         // According to the Unicode standard, no character will ever
    358         // be assigned to these code points, and they cannot be encoded
    359         // into either utf-16 or utf-8.
    360         continue;
    361       }
    362       buf[0] = 'h';
    363       buf[1] = codePoint;
    364       buf[2] = 'e';
    365       testConversions(buf, 2);
    366       testConversions(buf, 3);
    367       testConversions(buf + 1, 1);
    368       testConversions(buf + 1, 2);
    369     } else {
    370       buf[0] = 'h';
    371       codePointToSurrogatePair(codePoint, buf[1], buf[2]);
    372       buf[3] = 'e';
    373       testConversions(buf, 2);
    374       testConversions(buf, 3);
    375       testConversions(buf, 4);
    376       testConversions(buf + 1, 1);
    377       testConversions(buf + 1, 2);
    378       testConversions(buf + 1, 3);
    379     }
    380   }
    381 }
    382 
    383 TEST_F(UtfTest, NonAscii) {
    384   const char kNonAsciiCharacter = '\x80';
    385   const char input[] = { kNonAsciiCharacter, '\0' };
    386   uint32_t hash = ComputeModifiedUtf8Hash(input);
    387   EXPECT_EQ(static_cast<uint8_t>(kNonAsciiCharacter), hash);
    388 }
    389 
    390 TEST_F(UtfTest, PrintableStringUtf8) {
    391   // Note: This is UTF-8, not Modified-UTF-8.
    392   const uint8_t kTestSequence[] = { 0xf0, 0x90, 0x80, 0x80, 0 };
    393   const char* start = reinterpret_cast<const char*>(kTestSequence);
    394   const char* ptr = start;
    395   uint32_t pair = GetUtf16FromUtf8(&ptr);
    396   ASSERT_EQ(*ptr, '\0');
    397   uint16_t leading = GetLeadingUtf16Char(pair);
    398   uint16_t trailing = GetTrailingUtf16Char(pair);
    399   ASSERT_NE(0u, trailing);
    400 
    401   std::string expected = android::base::StringPrintf("\"\\u%04x\\u%04x\"",
    402                                                      static_cast<unsigned>(leading),
    403                                                      static_cast<unsigned>(trailing));
    404   std::string printable = PrintableString(start);
    405   EXPECT_EQ(expected, printable);
    406 }
    407 
    408 }  // namespace art
    409