1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "utf.h" 18 19 #include <map> 20 #include <vector> 21 22 #include <android-base/stringprintf.h> 23 24 #include "gtest/gtest.h" 25 #include "utf-inl.h" 26 27 namespace art { 28 29 class UtfTest : public testing::Test {}; 30 31 TEST_F(UtfTest, GetLeadingUtf16Char) { 32 EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff)); 33 } 34 35 TEST_F(UtfTest, GetTrailingUtf16Char) { 36 EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee)); 37 EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa)); 38 } 39 40 #define EXPECT_ARRAY_POSITION(expected, end, start) \ 41 EXPECT_EQ(static_cast<uintptr_t>(expected), \ 42 reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start)); 43 44 // A test string containing one, two, three and four byte UTF-8 sequences. 45 static const uint8_t kAllSequences[] = { 46 0x24, 47 0xc2, 0xa2, 48 0xe2, 0x82, 0xac, 49 0xf0, 0x9f, 0x8f, 0xa0, 50 0x00 51 }; 52 53 // A test string that contains a UTF-8 encoding of a surrogate pair 54 // (code point = U+10400). 55 static const uint8_t kSurrogateEncoding[] = { 56 0xed, 0xa0, 0x81, 57 0xed, 0xb0, 0x80, 58 0x00 59 }; 60 61 TEST_F(UtfTest, GetUtf16FromUtf8) { 62 const char* const start = reinterpret_cast<const char*>(kAllSequences); 63 const char* ptr = start; 64 uint32_t pair = 0; 65 66 // Single byte sequence. 67 pair = GetUtf16FromUtf8(&ptr); 68 EXPECT_EQ(0x24, GetLeadingUtf16Char(pair)); 69 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 70 EXPECT_ARRAY_POSITION(1, ptr, start); 71 72 // Two byte sequence. 73 pair = GetUtf16FromUtf8(&ptr); 74 EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair)); 75 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 76 EXPECT_ARRAY_POSITION(3, ptr, start); 77 78 // Three byte sequence. 79 pair = GetUtf16FromUtf8(&ptr); 80 EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair)); 81 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 82 EXPECT_ARRAY_POSITION(6, ptr, start); 83 84 // Four byte sequence 85 pair = GetUtf16FromUtf8(&ptr); 86 EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair)); 87 EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair)); 88 EXPECT_ARRAY_POSITION(10, ptr, start); 89 90 // Null terminator. 91 pair = GetUtf16FromUtf8(&ptr); 92 EXPECT_EQ(0, GetLeadingUtf16Char(pair)); 93 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 94 EXPECT_ARRAY_POSITION(11, ptr, start); 95 } 96 97 TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) { 98 const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding); 99 const char* ptr = start; 100 uint32_t pair = 0; 101 102 pair = GetUtf16FromUtf8(&ptr); 103 EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair)); 104 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 105 EXPECT_ARRAY_POSITION(3, ptr, start); 106 107 pair = GetUtf16FromUtf8(&ptr); 108 EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair)); 109 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 110 EXPECT_ARRAY_POSITION(6, ptr, start); 111 } 112 113 TEST_F(UtfTest, CountModifiedUtf8Chars) { 114 EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences))); 115 EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding))); 116 } 117 118 static void AssertConversion(const std::vector<uint16_t>& input, 119 const std::vector<uint8_t>& expected) { 120 ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); 121 122 std::vector<uint8_t> output(expected.size()); 123 ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(), 124 &input[0], input.size()); 125 EXPECT_EQ(expected, output); 126 } 127 128 TEST_F(UtfTest, CountAndConvertUtf8Bytes) { 129 // Surrogate pairs will be converted into 4 byte sequences. 130 AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 }); 131 132 // Three byte encodings that are below & above the leading surrogate 133 // range respectively. 134 AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 }); 135 AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf }); 136 // Two byte encoding. 137 AssertConversion({ 0x0101 }, { 0xc4, 0x81 }); 138 139 // Two byte special case : 0 must use an overlong encoding. 140 AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 }); 141 142 // One byte encoding. 143 AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f }); 144 145 AssertConversion({ 146 0xd802, 0xdc02, // Surrogate pair. 147 0xdef0, 0xdcff, // Three byte encodings. 148 0x0101, 0x0000, // Two byte encodings. 149 'p' , 'p' // One byte encoding. 150 }, { 151 0xf0, 0x90, 0xa0, 0x82, 152 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf, 153 0xc4, 0x81, 0xc0, 0x80, 154 0x70, 0x70 155 }); 156 } 157 158 TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) { 159 // Unpaired trailing surrogate at the end of input. 160 AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 }); 161 // Unpaired (or incorrectly paired) surrogates in the middle of the input. 162 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> prefixes { 163 {{ 'h' }, { 'h' }}, 164 {{ 0 }, { 0xc0, 0x80 }}, 165 {{ 0x81 }, { 0xc2, 0x81 }}, 166 {{ 0x801 }, { 0xe0, 0xa0, 0x81 }}, 167 }; 168 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> suffixes { 169 {{ 'e' }, { 'e' }}, 170 {{ 0 }, { 0xc0, 0x80 }}, 171 {{ 0x7ff }, { 0xdf, 0xbf }}, 172 {{ 0xffff }, { 0xef, 0xbf, 0xbf }}, 173 }; 174 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> tests { 175 {{ 0xd801 }, { 0xed, 0xa0, 0x81 }}, 176 {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }}, 177 {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }}, 178 {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }}, 179 }; 180 for (const auto& prefix : prefixes) { 181 const std::vector<uint16_t>& prefix_in = prefix.first; 182 const std::vector<uint8_t>& prefix_out = prefix.second; 183 for (const auto& test : tests) { 184 const std::vector<uint16_t>& test_in = test.first; 185 const std::vector<uint8_t>& test_out = test.second; 186 for (const auto& suffix : suffixes) { 187 const std::vector<uint16_t>& suffix_in = suffix.first; 188 const std::vector<uint8_t>& suffix_out = suffix.second; 189 std::vector<uint16_t> in = prefix_in; 190 in.insert(in.end(), test_in.begin(), test_in.end()); 191 in.insert(in.end(), suffix_in.begin(), suffix_in.end()); 192 std::vector<uint8_t> out = prefix_out; 193 out.insert(out.end(), test_out.begin(), test_out.end()); 194 out.insert(out.end(), suffix_out.begin(), suffix_out.end()); 195 AssertConversion(in, out); 196 } 197 } 198 } 199 } 200 201 // Old versions of functions, here to compare answers with optimized versions. 202 203 size_t CountModifiedUtf8Chars_reference(const char* utf8) { 204 size_t len = 0; 205 int ic; 206 while ((ic = *utf8++) != '\0') { 207 len++; 208 if ((ic & 0x80) == 0) { 209 // one-byte encoding 210 continue; 211 } 212 // two- or three-byte encoding 213 utf8++; 214 if ((ic & 0x20) == 0) { 215 // two-byte encoding 216 continue; 217 } 218 utf8++; 219 if ((ic & 0x10) == 0) { 220 // three-byte encoding 221 continue; 222 } 223 224 // four-byte encoding: needs to be converted into a surrogate 225 // pair. 226 utf8++; 227 len++; 228 } 229 return len; 230 } 231 232 static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) { 233 size_t result = 0; 234 while (char_count--) { 235 const uint16_t ch = *chars++; 236 if (ch > 0 && ch <= 0x7f) { 237 ++result; 238 } else if (ch >= 0xd800 && ch <= 0xdbff) { 239 if (char_count > 0) { 240 const uint16_t ch2 = *chars; 241 // If we find a properly paired surrogate, we emit it as a 4 byte 242 // UTF sequence. If we find an unpaired leading or trailing surrogate, 243 // we emit it as a 3 byte sequence like would have done earlier. 244 if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { 245 chars++; 246 char_count--; 247 248 result += 4; 249 } else { 250 result += 3; 251 } 252 } else { 253 // This implies we found an unpaired trailing surrogate at the end 254 // of a string. 255 result += 3; 256 } 257 } else if (ch > 0x7ff) { 258 result += 3; 259 } else { 260 result += 2; 261 } 262 } 263 return result; 264 } 265 266 static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in, 267 size_t char_count) { 268 while (char_count--) { 269 const uint16_t ch = *utf16_in++; 270 if (ch > 0 && ch <= 0x7f) { 271 *utf8_out++ = ch; 272 } else { 273 // Char_count == 0 here implies we've encountered an unpaired 274 // surrogate and we have no choice but to encode it as 3-byte UTF 275 // sequence. Note that unpaired surrogates can occur as a part of 276 // "normal" operation. 277 if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { 278 const uint16_t ch2 = *utf16_in; 279 280 // Check if the other half of the pair is within the expected 281 // range. If it isn't, we will have to emit both "halves" as 282 // separate 3 byte sequences. 283 if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { 284 utf16_in++; 285 char_count--; 286 const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; 287 *utf8_out++ = (code_point >> 18) | 0xf0; 288 *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; 289 *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; 290 *utf8_out++ = (code_point & 0x3f) | 0x80; 291 continue; 292 } 293 } 294 295 if (ch > 0x07ff) { 296 // Three byte encoding. 297 *utf8_out++ = (ch >> 12) | 0xe0; 298 *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; 299 *utf8_out++ = (ch & 0x3f) | 0x80; 300 } else /*(ch > 0x7f || ch == 0)*/ { 301 // Two byte encoding. 302 *utf8_out++ = (ch >> 6) | 0xc0; 303 *utf8_out++ = (ch & 0x3f) | 0x80; 304 } 305 } 306 } 307 } 308 309 // Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again. 310 311 static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) { 312 first = (code_point >> 10) + 0xd7c0; 313 second = (code_point & 0x03ff) + 0xdc00; 314 } 315 316 static void testConversions(uint16_t *buf, int char_count) { 317 char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 }; 318 uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 }; 319 int byte_count_test, byte_count_reference; 320 int char_count_test, char_count_reference; 321 322 // Calculate the number of utf-8 bytes for the utf-16 chars. 323 byte_count_reference = CountUtf8Bytes_reference(buf, char_count); 324 byte_count_test = CountUtf8Bytes(buf, char_count); 325 EXPECT_EQ(byte_count_reference, byte_count_test); 326 327 // Convert the utf-16 string to utf-8 bytes. 328 ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count); 329 ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count); 330 for (int i = 0; i < byte_count_test; ++i) { 331 EXPECT_EQ(bytes_reference[i], bytes_test[i]); 332 } 333 334 // Calculate the number of utf-16 chars from the utf-8 bytes. 335 bytes_reference[byte_count_reference] = 0; // Reference function needs null termination. 336 char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference); 337 char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test); 338 EXPECT_EQ(char_count, char_count_reference); 339 EXPECT_EQ(char_count, char_count_test); 340 341 // Convert the utf-8 bytes back to utf-16 chars. 342 // Does not need copied _reference version of the function because the original 343 // function with the old API is retained for debug/testing code. 344 ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference); 345 ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test); 346 for (int i = 0; i < char_count_test; ++i) { 347 EXPECT_EQ(buf[i], out_buf_reference[i]); 348 EXPECT_EQ(buf[i], out_buf_test[i]); 349 } 350 } 351 352 TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) { 353 for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) { 354 uint16_t buf[4] = { 0 }; 355 if (codePoint <= 0xffff) { 356 if (codePoint >= 0xd800 && codePoint <= 0xdfff) { 357 // According to the Unicode standard, no character will ever 358 // be assigned to these code points, and they cannot be encoded 359 // into either utf-16 or utf-8. 360 continue; 361 } 362 buf[0] = 'h'; 363 buf[1] = codePoint; 364 buf[2] = 'e'; 365 testConversions(buf, 2); 366 testConversions(buf, 3); 367 testConversions(buf + 1, 1); 368 testConversions(buf + 1, 2); 369 } else { 370 buf[0] = 'h'; 371 codePointToSurrogatePair(codePoint, buf[1], buf[2]); 372 buf[3] = 'e'; 373 testConversions(buf, 2); 374 testConversions(buf, 3); 375 testConversions(buf, 4); 376 testConversions(buf + 1, 1); 377 testConversions(buf + 1, 2); 378 testConversions(buf + 1, 3); 379 } 380 } 381 } 382 383 TEST_F(UtfTest, NonAscii) { 384 const char kNonAsciiCharacter = '\x80'; 385 const char input[] = { kNonAsciiCharacter, '\0' }; 386 uint32_t hash = ComputeModifiedUtf8Hash(input); 387 EXPECT_EQ(static_cast<uint8_t>(kNonAsciiCharacter), hash); 388 } 389 390 TEST_F(UtfTest, PrintableStringUtf8) { 391 // Note: This is UTF-8, not Modified-UTF-8. 392 const uint8_t kTestSequence[] = { 0xf0, 0x90, 0x80, 0x80, 0 }; 393 const char* start = reinterpret_cast<const char*>(kTestSequence); 394 const char* ptr = start; 395 uint32_t pair = GetUtf16FromUtf8(&ptr); 396 ASSERT_EQ(*ptr, '\0'); 397 uint16_t leading = GetLeadingUtf16Char(pair); 398 uint16_t trailing = GetTrailingUtf16Char(pair); 399 ASSERT_NE(0u, trailing); 400 401 std::string expected = android::base::StringPrintf("\"\\u%04x\\u%04x\"", 402 static_cast<unsigned>(leading), 403 static_cast<unsigned>(trailing)); 404 std::string printable = PrintableString(start); 405 EXPECT_EQ(expected, printable); 406 } 407 408 } // namespace art 409