1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/Support/ConvertUTF.h" 11 #include "llvm/Support/Format.h" 12 #include "gtest/gtest.h" 13 #include <string> 14 #include <utility> 15 #include <vector> 16 17 using namespace llvm; 18 19 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) { 20 // Src is the look of disapproval. 21 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 22 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 23 std::string Result; 24 bool Success = convertUTF16ToUTF8String(Ref, Result); 25 EXPECT_TRUE(Success); 26 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 27 EXPECT_EQ(Expected, Result); 28 } 29 30 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) { 31 // Src is the look of disapproval. 32 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0"; 33 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 34 std::string Result; 35 bool Success = convertUTF16ToUTF8String(Ref, Result); 36 EXPECT_TRUE(Success); 37 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 38 EXPECT_EQ(Expected, Result); 39 } 40 41 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) { 42 // Src is the look of disapproval. 
43 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; 44 StringRef Ref(Src, sizeof(Src) - 1); 45 SmallVector<UTF16, 5> Result; 46 bool Success = convertUTF8ToUTF16String(Ref, Result); 47 EXPECT_TRUE(Success); 48 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0}; 49 ASSERT_EQ(3u, Result.size()); 50 for (int I = 0, E = 3; I != E; ++I) 51 EXPECT_EQ(Expected[I], Result[I]); 52 } 53 54 TEST(ConvertUTFTest, OddLengthInput) { 55 std::string Result; 56 bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result); 57 EXPECT_FALSE(Success); 58 } 59 60 TEST(ConvertUTFTest, Empty) { 61 std::string Result; 62 bool Success = convertUTF16ToUTF8String(None, Result); 63 EXPECT_TRUE(Success); 64 EXPECT_TRUE(Result.empty()); 65 } 66 67 TEST(ConvertUTFTest, HasUTF16BOM) { 68 bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2)); 69 EXPECT_TRUE(HasBOM); 70 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2)); 71 EXPECT_TRUE(HasBOM); 72 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3)); 73 EXPECT_TRUE(HasBOM); // Don't care about odd lengths. 
74 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6)); 75 EXPECT_TRUE(HasBOM); 76 77 HasBOM = hasUTF16ByteOrderMark(None); 78 EXPECT_FALSE(HasBOM); 79 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1)); 80 EXPECT_FALSE(HasBOM); 81 } 82 83 struct ConvertUTFResultContainer { 84 ConversionResult ErrorCode; 85 std::vector<unsigned> UnicodeScalars; 86 87 ConvertUTFResultContainer(ConversionResult ErrorCode) 88 : ErrorCode(ErrorCode) {} 89 90 ConvertUTFResultContainer 91 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000, 92 unsigned US2 = 0x110000, unsigned US3 = 0x110000, 93 unsigned US4 = 0x110000, unsigned US5 = 0x110000, 94 unsigned US6 = 0x110000, unsigned US7 = 0x110000) { 95 ConvertUTFResultContainer Result(*this); 96 if (US0 != 0x110000) 97 Result.UnicodeScalars.push_back(US0); 98 if (US1 != 0x110000) 99 Result.UnicodeScalars.push_back(US1); 100 if (US2 != 0x110000) 101 Result.UnicodeScalars.push_back(US2); 102 if (US3 != 0x110000) 103 Result.UnicodeScalars.push_back(US3); 104 if (US4 != 0x110000) 105 Result.UnicodeScalars.push_back(US4); 106 if (US5 != 0x110000) 107 Result.UnicodeScalars.push_back(US5); 108 if (US6 != 0x110000) 109 Result.UnicodeScalars.push_back(US6); 110 if (US7 != 0x110000) 111 Result.UnicodeScalars.push_back(US7); 112 return Result; 113 } 114 }; 115 116 std::pair<ConversionResult, std::vector<unsigned>> 117 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) { 118 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 119 120 const UTF8 *SourceNext = SourceStart; 121 std::vector<UTF32> Decoded(S.size(), 0); 122 UTF32 *TargetStart = Decoded.data(); 123 124 auto ErrorCode = 125 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, 126 Decoded.data() + Decoded.size(), lenientConversion); 127 128 Decoded.resize(TargetStart - Decoded.data()); 129 130 return std::make_pair(ErrorCode, Decoded); 131 } 132 133 std::pair<ConversionResult, std::vector<unsigned>> 134 
ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { 135 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 136 137 const UTF8 *SourceNext = SourceStart; 138 std::vector<UTF32> Decoded(S.size(), 0); 139 UTF32 *TargetStart = Decoded.data(); 140 141 auto ErrorCode = ConvertUTF8toUTF32Partial( 142 &SourceNext, SourceStart + S.size(), &TargetStart, 143 Decoded.data() + Decoded.size(), lenientConversion); 144 145 Decoded.resize(TargetStart - Decoded.data()); 146 147 return std::make_pair(ErrorCode, Decoded); 148 } 149 150 ::testing::AssertionResult 151 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected, 152 StringRef S, bool Partial = false) { 153 ConversionResult ErrorCode; 154 std::vector<unsigned> Decoded; 155 if (!Partial) 156 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S); 157 else 158 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); 159 160 if (Expected.ErrorCode != ErrorCode) 161 return ::testing::AssertionFailure() << "Expected error code " 162 << Expected.ErrorCode << ", actual " 163 << ErrorCode; 164 165 if (Expected.UnicodeScalars != Decoded) 166 return ::testing::AssertionFailure() 167 << "Expected lenient decoded result:\n" 168 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n" 169 << "Actual result:\n" << ::testing::PrintToString(Decoded); 170 171 return ::testing::AssertionSuccess(); 172 } 173 174 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) { 175 176 // 177 // 1-byte sequences 178 // 179 180 // U+0041 LATIN CAPITAL LETTER A 181 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 182 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41")); 183 184 // 185 // 2-byte sequences 186 // 187 188 // U+0283 LATIN SMALL LETTER ESH 189 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 190 ConvertUTFResultContainer(conversionOK).withScalars(0x0283), 191 "\xca\x83")); 192 193 // U+03BA GREEK SMALL LETTER KAPPA 194 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA 195 // 
U+03C3 GREEK SMALL LETTER SIGMA 196 // U+03BC GREEK SMALL LETTER MU 197 // U+03B5 GREEK SMALL LETTER EPSILON 198 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 199 ConvertUTFResultContainer(conversionOK) 200 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5), 201 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5")); 202 203 // 204 // 3-byte sequences 205 // 206 207 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B 208 // U+6587 CJK UNIFIED IDEOGRAPH-6587 209 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 210 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587), 211 "\xe4\xbe\x8b\xe6\x96\x87")); 212 213 // U+D55C HANGUL SYLLABLE HAN 214 // U+AE00 HANGUL SYLLABLE GEUL 215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 216 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00), 217 "\xed\x95\x9c\xea\xb8\x80")); 218 219 // U+1112 HANGUL CHOSEONG HIEUH 220 // U+1161 HANGUL JUNGSEONG A 221 // U+11AB HANGUL JONGSEONG NIEUN 222 // U+1100 HANGUL CHOSEONG KIYEOK 223 // U+1173 HANGUL JUNGSEONG EU 224 // U+11AF HANGUL JONGSEONG RIEUL 225 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 226 ConvertUTFResultContainer(conversionOK) 227 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af), 228 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3" 229 "\xe1\x86\xaf")); 230 231 // 232 // 4-byte sequences 233 // 234 235 // U+E0100 VARIATION SELECTOR-17 236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 237 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100), 238 "\xf3\xa0\x84\x80")); 239 240 // 241 // First possible sequence of a certain length 242 // 243 244 // U+0000 NULL 245 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 246 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 247 StringRef("\x00", 1))); 248 249 // U+0080 PADDING CHARACTER 250 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 251 ConvertUTFResultContainer(conversionOK).withScalars(0x0080), 252 "\xc2\x80")); 253 254 // U+0800 SAMARITAN LETTER ALAF 255 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 256 ConvertUTFResultContainer(conversionOK).withScalars(0x0800), 257 "\xe0\xa0\x80")); 258 259 // U+10000 LINEAR B SYLLABLE B008 A 260 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 261 ConvertUTFResultContainer(conversionOK).withScalars(0x10000), 262 "\xf0\x90\x80\x80")); 263 264 // U+200000 (invalid) 265 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 266 ConvertUTFResultContainer(sourceIllegal) 267 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 268 "\xf8\x88\x80\x80\x80")); 269 270 // U+4000000 (invalid) 271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 272 ConvertUTFResultContainer(sourceIllegal) 273 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 274 "\xfc\x84\x80\x80\x80\x80")); 275 276 // 277 // Last possible sequence of a certain length 278 // 279 280 // U+007F DELETE 281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 282 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f")); 283 284 // U+07FF (unassigned) 285 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 286 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff), 287 "\xdf\xbf")); 288 289 // U+FFFF (noncharacter) 290 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 291 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 292 "\xef\xbf\xbf")); 293 294 // U+1FFFFF (invalid) 295 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 296 ConvertUTFResultContainer(sourceIllegal) 297 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 298 "\xf7\xbf\xbf\xbf")); 299 300 // U+3FFFFFF (invalid) 301 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 302 ConvertUTFResultContainer(sourceIllegal) 303 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 304 "\xfb\xbf\xbf\xbf\xbf")); 305 306 // U+7FFFFFFF (invalid) 307 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 308 ConvertUTFResultContainer(sourceIllegal) 309 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 310 "\xfd\xbf\xbf\xbf\xbf\xbf")); 311 312 // 313 // Other boundary conditions 314 
// 315 316 // U+D7FF (unassigned) 317 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 318 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff), 319 "\xed\x9f\xbf")); 320 321 // U+E000 (private use) 322 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 323 ConvertUTFResultContainer(conversionOK).withScalars(0xe000), 324 "\xee\x80\x80")); 325 326 // U+FFFD REPLACEMENT CHARACTER 327 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 328 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd), 329 "\xef\xbf\xbd")); 330 331 // U+10FFFF (noncharacter) 332 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 333 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 334 "\xf4\x8f\xbf\xbf")); 335 336 // U+110000 (invalid) 337 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 338 ConvertUTFResultContainer(sourceIllegal) 339 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 340 "\xf4\x90\x80\x80")); 341 342 // 343 // Unexpected continuation bytes 344 // 345 346 // A sequence of unexpected continuation bytes that don't follow a first 347 // byte, every byte is a maximal subpart. 
348 349 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 350 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80")); 351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 352 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf")); 353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 354 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 355 "\x80\x80")); 356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 357 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 358 "\x80\xbf")); 359 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 360 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 361 "\xbf\x80")); 362 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 363 ConvertUTFResultContainer(sourceIllegal) 364 .withScalars(0xfffd, 0xfffd, 0xfffd), 365 "\x80\xbf\x80")); 366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 367 ConvertUTFResultContainer(sourceIllegal) 368 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 369 "\x80\xbf\x80\xbf")); 370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 371 ConvertUTFResultContainer(sourceIllegal) 372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 373 "\x80\xbf\x82\xbf\xaa")); 374 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 375 ConvertUTFResultContainer(sourceIllegal) 376 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 377 "\xaa\xb0\xbb\xbf\xaa\xa0")); 378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 379 ConvertUTFResultContainer(sourceIllegal) 380 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 381 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f")); 382 383 // All continuation bytes (0x80--0xbf). 
384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 385 ConvertUTFResultContainer(sourceIllegal) 386 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 387 0xfffd, 0xfffd, 0xfffd, 0xfffd) 388 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 389 0xfffd, 0xfffd, 0xfffd, 0xfffd) 390 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 391 0xfffd, 0xfffd, 0xfffd, 0xfffd) 392 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 393 0xfffd, 0xfffd, 0xfffd, 0xfffd) 394 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 395 0xfffd, 0xfffd, 0xfffd, 0xfffd) 396 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 397 0xfffd, 0xfffd, 0xfffd, 0xfffd) 398 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 399 0xfffd, 0xfffd, 0xfffd, 0xfffd) 400 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 401 0xfffd, 0xfffd, 0xfffd, 0xfffd), 402 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" 403 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" 404 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" 405 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf")); 406 407 // 408 // Lonely start bytes 409 // 410 411 // Start bytes of 2-byte sequences (0xc0--0xdf). 
412 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 413 ConvertUTFResultContainer(sourceIllegal) 414 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 415 0xfffd, 0xfffd, 0xfffd, 0xfffd) 416 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 417 0xfffd, 0xfffd, 0xfffd, 0xfffd) 418 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 419 0xfffd, 0xfffd, 0xfffd, 0xfffd) 420 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 421 0xfffd, 0xfffd, 0xfffd, 0xfffd), 422 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" 423 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf")); 424 425 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 426 ConvertUTFResultContainer(sourceIllegal) 427 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 428 0xfffd, 0x0020, 0xfffd, 0x0020) 429 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 430 0xfffd, 0x0020, 0xfffd, 0x0020) 431 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 432 0xfffd, 0x0020, 0xfffd, 0x0020) 433 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 434 0xfffd, 0x0020, 0xfffd, 0x0020) 435 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 436 0xfffd, 0x0020, 0xfffd, 0x0020) 437 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 438 0xfffd, 0x0020, 0xfffd, 0x0020) 439 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 440 0xfffd, 0x0020, 0xfffd, 0x0020) 441 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 442 0xfffd, 0x0020, 0xfffd, 0x0020), 443 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20" 444 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20" 445 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20" 446 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20")); 447 448 // Start bytes of 3-byte sequences (0xe0--0xef). 
449 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 450 ConvertUTFResultContainer(sourceIllegal) 451 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 452 0xfffd, 0xfffd, 0xfffd, 0xfffd) 453 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 454 0xfffd, 0xfffd, 0xfffd, 0xfffd), 455 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef")); 456 457 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 458 ConvertUTFResultContainer(sourceIllegal) 459 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 460 0xfffd, 0x0020, 0xfffd, 0x0020) 461 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 462 0xfffd, 0x0020, 0xfffd, 0x0020) 463 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 464 0xfffd, 0x0020, 0xfffd, 0x0020) 465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 466 0xfffd, 0x0020, 0xfffd, 0x0020), 467 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20" 468 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20")); 469 470 // Start bytes of 4-byte sequences (0xf0--0xf7). 471 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 472 ConvertUTFResultContainer(sourceIllegal) 473 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 474 0xfffd, 0xfffd, 0xfffd, 0xfffd), 475 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7")); 476 477 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 478 ConvertUTFResultContainer(sourceIllegal) 479 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 480 0xfffd, 0x0020, 0xfffd, 0x0020) 481 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 482 0xfffd, 0x0020, 0xfffd, 0x0020), 483 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20")); 484 485 // Start bytes of 5-byte sequences (0xf8--0xfb). 
486 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 487 ConvertUTFResultContainer(sourceIllegal) 488 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 489 "\xf8\xf9\xfa\xfb")); 490 491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 492 ConvertUTFResultContainer(sourceIllegal) 493 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 494 0xfffd, 0x0020, 0xfffd, 0x0020), 495 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20")); 496 497 // Start bytes of 6-byte sequences (0xfc--0xfd). 498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 499 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 500 "\xfc\xfd")); 501 502 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 503 ConvertUTFResultContainer(sourceIllegal) 504 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020), 505 "\xfc\x20\xfd\x20")); 506 507 // 508 // Other bytes (0xc0--0xc1, 0xfe--0xff). 509 // 510 511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 512 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0")); 513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 514 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1")); 515 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 516 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe")); 517 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 518 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff")); 519 520 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 521 ConvertUTFResultContainer(sourceIllegal) 522 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 523 "\xc0\xc1\xfe\xff")); 524 525 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 526 ConvertUTFResultContainer(sourceIllegal) 527 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 528 "\xfe\xfe\xff\xff")); 529 530 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 531 ConvertUTFResultContainer(sourceIllegal) 532 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 533 "\xfe\x80\x80\x80\x80\x80")); 534 535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 536 ConvertUTFResultContainer(sourceIllegal) 537 
.withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 538 "\xff\x80\x80\x80\x80\x80")); 539 540 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 541 ConvertUTFResultContainer(sourceIllegal) 542 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 543 0xfffd, 0x0020, 0xfffd, 0x0020), 544 "\xc0\x20\xc1\x20\xfe\x20\xff\x20")); 545 546 // 547 // Sequences with one continuation byte missing 548 // 549 550 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 551 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2")); 552 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 553 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf")); 554 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 555 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 556 "\xe0\xa0")); 557 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 558 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 559 "\xe0\xbf")); 560 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 561 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 562 "\xe1\x80")); 563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 564 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 565 "\xec\xbf")); 566 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 567 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 568 "\xed\x80")); 569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 570 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 571 "\xed\x9f")); 572 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 573 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 574 "\xee\x80")); 575 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 576 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 577 "\xef\xbf")); 578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 579 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 580 "\xf0\x90\x80")); 581 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 582 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 583 "\xf0\xbf\xbf")); 
584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 585 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 586 "\xf1\x80\x80")); 587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 588 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 589 "\xf3\xbf\xbf")); 590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 592 "\xf4\x80\x80")); 593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 595 "\xf4\x8f\xbf")); 596 597 // Overlong sequences with one trailing byte missing. 598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 599 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 600 "\xc0")); 601 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 602 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 603 "\xc1")); 604 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 605 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 606 "\xe0\x80")); 607 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 608 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 609 "\xe0\x9f")); 610 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 611 ConvertUTFResultContainer(sourceIllegal) 612 .withScalars(0xfffd, 0xfffd, 0xfffd), 613 "\xf0\x80\x80")); 614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 615 ConvertUTFResultContainer(sourceIllegal) 616 .withScalars(0xfffd, 0xfffd, 0xfffd), 617 "\xf0\x8f\x80")); 618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 619 ConvertUTFResultContainer(sourceIllegal) 620 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 621 "\xf8\x80\x80\x80")); 622 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 623 ConvertUTFResultContainer(sourceIllegal) 624 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 625 "\xfc\x80\x80\x80\x80")); 626 627 // Sequences that represent surrogates with one trailing byte missing. 
628 // High surrogates 629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 630 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 631 "\xed\xa0")); 632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 633 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 634 "\xed\xac")); 635 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 636 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 637 "\xed\xaf")); 638 // Low surrogates 639 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 640 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 641 "\xed\xb0")); 642 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 643 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 644 "\xed\xb4")); 645 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 646 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 647 "\xed\xbf")); 648 649 // Ill-formed 4-byte sequences. 650 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 651 // U+1100xx (invalid) 652 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 653 ConvertUTFResultContainer(sourceIllegal) 654 .withScalars(0xfffd, 0xfffd, 0xfffd), 655 "\xf4\x90\x80")); 656 // U+13FBxx (invalid) 657 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 658 ConvertUTFResultContainer(sourceIllegal) 659 .withScalars(0xfffd, 0xfffd, 0xfffd), 660 "\xf4\xbf\xbf")); 661 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 662 ConvertUTFResultContainer(sourceIllegal) 663 .withScalars(0xfffd, 0xfffd, 0xfffd), 664 "\xf5\x80\x80")); 665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 666 ConvertUTFResultContainer(sourceIllegal) 667 .withScalars(0xfffd, 0xfffd, 0xfffd), 668 "\xf6\x80\x80")); 669 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 670 ConvertUTFResultContainer(sourceIllegal) 671 .withScalars(0xfffd, 0xfffd, 0xfffd), 672 "\xf7\x80\x80")); 673 // U+1FFBxx (invalid) 674 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 675 ConvertUTFResultContainer(sourceIllegal) 676 .withScalars(0xfffd, 0xfffd, 0xfffd), 677 
"\xf7\xbf\xbf")); 678 679 // Ill-formed 5-byte sequences. 680 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 681 // U+2000xx (invalid) 682 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 683 ConvertUTFResultContainer(sourceIllegal) 684 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 685 "\xf8\x88\x80\x80")); 686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 687 ConvertUTFResultContainer(sourceIllegal) 688 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 689 "\xf8\xbf\xbf\xbf")); 690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 691 ConvertUTFResultContainer(sourceIllegal) 692 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 693 "\xf9\x80\x80\x80")); 694 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 695 ConvertUTFResultContainer(sourceIllegal) 696 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 697 "\xfa\x80\x80\x80")); 698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 699 ConvertUTFResultContainer(sourceIllegal) 700 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 701 "\xfb\x80\x80\x80")); 702 // U+3FFFFxx (invalid) 703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 704 ConvertUTFResultContainer(sourceIllegal) 705 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 706 "\xfb\xbf\xbf\xbf")); 707 708 // Ill-formed 6-byte sequences. 
709 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx 710 // U+40000xx (invalid) 711 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 712 ConvertUTFResultContainer(sourceIllegal) 713 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 714 "\xfc\x84\x80\x80\x80")); 715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 716 ConvertUTFResultContainer(sourceIllegal) 717 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 718 "\xfc\xbf\xbf\xbf\xbf")); 719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 720 ConvertUTFResultContainer(sourceIllegal) 721 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 722 "\xfd\x80\x80\x80\x80")); 723 // U+7FFFFFxx (invalid) 724 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 725 ConvertUTFResultContainer(sourceIllegal) 726 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 727 "\xfd\xbf\xbf\xbf\xbf")); 728 729 // 730 // Sequences with two continuation bytes missing 731 // 732 733 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 734 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 735 "\xf0\x90")); 736 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 737 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 738 "\xf0\xbf")); 739 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 740 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 741 "\xf1\x80")); 742 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 743 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 744 "\xf3\xbf")); 745 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 746 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 747 "\xf4\x80")); 748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 749 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 750 "\xf4\x8f")); 751 752 // Overlong sequences with two trailing byte missing. 
753 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 754 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0")); 755 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 756 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 757 "\xf0\x80")); 758 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 759 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 760 "\xf0\x8f")); 761 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 762 ConvertUTFResultContainer(sourceIllegal) 763 .withScalars(0xfffd, 0xfffd, 0xfffd), 764 "\xf8\x80\x80")); 765 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 766 ConvertUTFResultContainer(sourceIllegal) 767 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 768 "\xfc\x80\x80\x80")); 769 770 // Sequences that represent surrogates with two trailing bytes missing. 771 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 772 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed")); 773 774 // Ill-formed 4-byte sequences. 775 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 776 // U+110yxx (invalid) 777 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 778 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 779 "\xf4\x90")); 780 // U+13Fyxx (invalid) 781 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 782 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 783 "\xf4\xbf")); 784 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 785 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 786 "\xf5\x80")); 787 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 788 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 789 "\xf6\x80")); 790 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 791 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 792 "\xf7\x80")); 793 // U+1FFyxx (invalid) 794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 795 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 796 "\xf7\xbf")); 797 798 // Ill-formed 5-byte sequences. 
799 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 800 // U+200yxx (invalid) 801 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 802 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 803 "\xf8\x88\x80")); 804 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 805 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 806 "\xf8\xbf\xbf")); 807 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 808 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 809 "\xf9\x80\x80")); 810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 812 "\xfa\x80\x80")); 813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 815 "\xfb\x80\x80")); 816 // U+3FFFyxx (invalid) 817 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 818 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 819 "\xfb\xbf\xbf")); 820 821 // Ill-formed 6-byte sequences. 
822 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 823 // U+4000yxx (invalid) 824 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 825 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 826 "\xfc\x84\x80\x80")); 827 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 828 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 829 "\xfc\xbf\xbf\xbf")); 830 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 831 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 832 "\xfd\x80\x80\x80")); 833 // U+7FFFFyxx (invalid) 834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 836 "\xfd\xbf\xbf\xbf")); 837 838 // 839 // Sequences with three continuation bytes missing 840 // 841 842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 843 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 844 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 845 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1")); 846 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 847 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2")); 848 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 849 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3")); 850 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 851 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4")); 852 853 // Broken overlong sequences. 854 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 855 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 856 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 857 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 858 "\xf8\x80")); 859 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 860 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 861 "\xfc\x80\x80")); 862 863 // Ill-formed 4-byte sequences. 
864 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 865 // U+14yyxx (invalid) 866 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 867 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5")); 868 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 869 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6")); 870 // U+1Cyyxx (invalid) 871 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 872 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7")); 873 874 // Ill-formed 5-byte sequences. 875 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 876 // U+20yyxx (invalid) 877 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 878 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 879 "\xf8\x88")); 880 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 881 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 882 "\xf8\xbf")); 883 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 884 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 885 "\xf9\x80")); 886 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 887 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 888 "\xfa\x80")); 889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 890 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 891 "\xfb\x80")); 892 // U+3FCyyxx (invalid) 893 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 894 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 895 "\xfb\xbf")); 896 897 // Ill-formed 6-byte sequences. 
898 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 899 // U+400yyxx (invalid) 900 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 901 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 902 "\xfc\x84\x80")); 903 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 904 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 905 "\xfc\xbf\xbf")); 906 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 907 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 908 "\xfd\x80\x80")); 909 // U+7FFCyyxx (invalid) 910 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 911 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 912 "\xfd\xbf\xbf")); 913 914 // 915 // Sequences with four continuation bytes missing 916 // 917 918 // Ill-formed 5-byte sequences. 919 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 920 // U+uzyyxx (invalid) 921 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 922 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 923 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 924 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9")); 925 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 926 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa")); 927 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 928 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 929 // U+3zyyxx (invalid) 930 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 931 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 932 933 // Broken overlong sequences. 934 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 935 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 936 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 937 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 938 "\xfc\x80")); 939 940 // Ill-formed 6-byte sequences. 
941 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 942 // U+uzzyyxx (invalid) 943 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 944 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 945 "\xfc\x84")); 946 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 947 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 948 "\xfc\xbf")); 949 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 950 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 951 "\xfd\x80")); 952 // U+7Fzzyyxx (invalid) 953 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 954 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 955 "\xfd\xbf")); 956 957 // 958 // Sequences with five continuation bytes missing 959 // 960 961 // Ill-formed 6-byte sequences. 962 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 963 // U+uzzyyxx (invalid) 964 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 965 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc")); 966 // U+uuzzyyxx (invalid) 967 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 968 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd")); 969 970 // 971 // Consecutive sequences with trailing bytes missing 972 // 973 974 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 975 ConvertUTFResultContainer(sourceIllegal) 976 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 977 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 978 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd) 979 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 980 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 981 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 982 "\xc0" "\xe0\x80" "\xf0\x80\x80" 983 "\xf8\x80\x80\x80" 984 "\xfc\x80\x80\x80\x80" 985 "\xdf" "\xef\xbf" "\xf7\xbf\xbf" 986 "\xfb\xbf\xbf\xbf" 987 "\xfd\xbf\xbf\xbf\xbf")); 988 989 // 990 // Overlong UTF-8 sequences 991 // 992 993 // U+002F SOLIDUS 994 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 995 
ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f")); 996 997 // Overlong sequences of the above. 998 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 999 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1000 "\xc0\xaf")); 1001 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1002 ConvertUTFResultContainer(sourceIllegal) 1003 .withScalars(0xfffd, 0xfffd, 0xfffd), 1004 "\xe0\x80\xaf")); 1005 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1006 ConvertUTFResultContainer(sourceIllegal) 1007 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1008 "\xf0\x80\x80\xaf")); 1009 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1010 ConvertUTFResultContainer(sourceIllegal) 1011 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1012 "\xf8\x80\x80\x80\xaf")); 1013 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1014 ConvertUTFResultContainer(sourceIllegal) 1015 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1016 "\xfc\x80\x80\x80\x80\xaf")); 1017 1018 // U+0000 NULL 1019 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1020 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 1021 StringRef("\x00", 1))); 1022 1023 // Overlong sequences of the above. 
1024 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1025 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1026 "\xc0\x80")); 1027 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1028 ConvertUTFResultContainer(sourceIllegal) 1029 .withScalars(0xfffd, 0xfffd, 0xfffd), 1030 "\xe0\x80\x80")); 1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1032 ConvertUTFResultContainer(sourceIllegal) 1033 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1034 "\xf0\x80\x80\x80")); 1035 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1036 ConvertUTFResultContainer(sourceIllegal) 1037 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1038 "\xf8\x80\x80\x80\x80")); 1039 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1040 ConvertUTFResultContainer(sourceIllegal) 1041 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1042 "\xfc\x80\x80\x80\x80\x80")); 1043 1044 // Other overlong sequences. 1045 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1046 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1047 "\xc0\xbf")); 1048 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1049 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1050 "\xc1\x80")); 1051 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1052 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1053 "\xc1\xbf")); 1054 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1055 ConvertUTFResultContainer(sourceIllegal) 1056 .withScalars(0xfffd, 0xfffd, 0xfffd), 1057 "\xe0\x9f\xbf")); 1058 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1059 ConvertUTFResultContainer(sourceIllegal) 1060 .withScalars(0xfffd, 0xfffd, 0xfffd), 1061 "\xed\xa0\x80")); 1062 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1063 ConvertUTFResultContainer(sourceIllegal) 1064 .withScalars(0xfffd, 0xfffd, 0xfffd), 1065 "\xed\xbf\xbf")); 1066 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1067 ConvertUTFResultContainer(sourceIllegal) 1068 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1069 
"\xf0\x8f\x80\x80")); 1070 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1071 ConvertUTFResultContainer(sourceIllegal) 1072 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1073 "\xf0\x8f\xbf\xbf")); 1074 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1075 ConvertUTFResultContainer(sourceIllegal) 1076 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1077 "\xf8\x87\xbf\xbf\xbf")); 1078 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1079 ConvertUTFResultContainer(sourceIllegal) 1080 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1081 "\xfc\x83\xbf\xbf\xbf\xbf")); 1082 1083 // 1084 // Isolated surrogates 1085 // 1086 1087 // Unicode 6.3.0: 1088 // 1089 // D71. High-surrogate code point: A Unicode code point in the range 1090 // U+D800 to U+DBFF. 1091 // 1092 // D73. Low-surrogate code point: A Unicode code point in the range 1093 // U+DC00 to U+DFFF. 1094 1095 // Note: U+E0100 is <DB40 DD00> in UTF16. 1096 1097 // High surrogates 1098 1099 // U+D800 1100 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1101 ConvertUTFResultContainer(sourceIllegal) 1102 .withScalars(0xfffd, 0xfffd, 0xfffd), 1103 "\xed\xa0\x80")); 1104 1105 // U+DB40 1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1107 ConvertUTFResultContainer(sourceIllegal) 1108 .withScalars(0xfffd, 0xfffd, 0xfffd), 1109 "\xed\xac\xa0")); 1110 1111 // U+DBFF 1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1113 ConvertUTFResultContainer(sourceIllegal) 1114 .withScalars(0xfffd, 0xfffd, 0xfffd), 1115 "\xed\xaf\xbf")); 1116 1117 // Low surrogates 1118 1119 // U+DC00 1120 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1121 ConvertUTFResultContainer(sourceIllegal) 1122 .withScalars(0xfffd, 0xfffd, 0xfffd), 1123 "\xed\xb0\x80")); 1124 1125 // U+DD00 1126 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1127 ConvertUTFResultContainer(sourceIllegal) 1128 .withScalars(0xfffd, 0xfffd, 0xfffd), 1129 "\xed\xb4\x80")); 1130 1131 // U+DFFF 1132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1133 
ConvertUTFResultContainer(sourceIllegal) 1134 .withScalars(0xfffd, 0xfffd, 0xfffd), 1135 "\xed\xbf\xbf")); 1136 1137 // Surrogate pairs 1138 1139 // U+D800 U+DC00 1140 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1141 ConvertUTFResultContainer(sourceIllegal) 1142 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1143 "\xed\xa0\x80\xed\xb0\x80")); 1144 1145 // U+D800 U+DD00 1146 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1147 ConvertUTFResultContainer(sourceIllegal) 1148 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1149 "\xed\xa0\x80\xed\xb4\x80")); 1150 1151 // U+D800 U+DFFF 1152 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1153 ConvertUTFResultContainer(sourceIllegal) 1154 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1155 "\xed\xa0\x80\xed\xbf\xbf")); 1156 1157 // U+DB40 U+DC00 1158 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1159 ConvertUTFResultContainer(sourceIllegal) 1160 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1161 "\xed\xac\xa0\xed\xb0\x80")); 1162 1163 // U+DB40 U+DD00 1164 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1165 ConvertUTFResultContainer(sourceIllegal) 1166 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1167 "\xed\xac\xa0\xed\xb4\x80")); 1168 1169 // U+DB40 U+DFFF 1170 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1171 ConvertUTFResultContainer(sourceIllegal) 1172 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1173 "\xed\xac\xa0\xed\xbf\xbf")); 1174 1175 // U+DBFF U+DC00 1176 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1177 ConvertUTFResultContainer(sourceIllegal) 1178 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1179 "\xed\xaf\xbf\xed\xb0\x80")); 1180 1181 // U+DBFF U+DD00 1182 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1183 ConvertUTFResultContainer(sourceIllegal) 1184 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1185 "\xed\xaf\xbf\xed\xb4\x80")); 1186 1187 // U+DBFF U+DFFF 1188 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1189 ConvertUTFResultContainer(sourceIllegal) 1190 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1191 "\xed\xaf\xbf\xed\xbf\xbf")); 1192 1193 // 1194 // Noncharacters 1195 // 1196 1197 // Unicode 6.3.0: 1198 // 1199 // D14. Noncharacter: A code point that is permanently reserved for 1200 // internal use and that should never be interchanged. Noncharacters 1201 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10) 1202 // and the values U+FDD0..U+FDEF. 1203 1204 // U+FFFE 1205 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1206 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe), 1207 "\xef\xbf\xbe")); 1208 1209 // U+FFFF 1210 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1211 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 1212 "\xef\xbf\xbf")); 1213 1214 // U+1FFFE 1215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1216 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe), 1217 "\xf0\x9f\xbf\xbe")); 1218 1219 // U+1FFFF 1220 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1221 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff), 1222 "\xf0\x9f\xbf\xbf")); 1223 1224 // U+2FFFE 1225 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1226 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe), 1227 "\xf0\xaf\xbf\xbe")); 1228 1229 // U+2FFFF 1230 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1231 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff), 1232 "\xf0\xaf\xbf\xbf")); 1233 1234 // U+3FFFE 1235 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1236 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe), 1237 "\xf0\xbf\xbf\xbe")); 1238 1239 // U+3FFFF 1240 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1241 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff), 1242 "\xf0\xbf\xbf\xbf")); 1243 1244 // U+4FFFE 1245 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1246 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe), 1247
"\xf1\x8f\xbf\xbe")); 1248 1249 // U+4FFFF 1250 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1251 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff), 1252 "\xf1\x8f\xbf\xbf")); 1253 1254 // U+5FFFE 1255 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1256 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe), 1257 "\xf1\x9f\xbf\xbe")); 1258 1259 // U+5FFFF 1260 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1261 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff), 1262 "\xf1\x9f\xbf\xbf")); 1263 1264 // U+6FFFE 1265 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1266 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe), 1267 "\xf1\xaf\xbf\xbe")); 1268 1269 // U+6FFFF 1270 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1271 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff), 1272 "\xf1\xaf\xbf\xbf")); 1273 1274 // U+7FFFE 1275 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1276 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe), 1277 "\xf1\xbf\xbf\xbe")); 1278 1279 // U+7FFFF 1280 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1281 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff), 1282 "\xf1\xbf\xbf\xbf")); 1283 1284 // U+8FFFE 1285 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1286 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe), 1287 "\xf2\x8f\xbf\xbe")); 1288 1289 // U+8FFFF 1290 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1291 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff), 1292 "\xf2\x8f\xbf\xbf")); 1293 1294 // U+9FFFE 1295 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1296 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe), 1297 "\xf2\x9f\xbf\xbe")); 1298 1299 // U+9FFFF 1300 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1301 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff), 1302 "\xf2\x9f\xbf\xbf")); 1303 1304 // U+AFFFE 1305 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1306 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe), 1307 
"\xf2\xaf\xbf\xbe")); 1308 1309 // U+AFFFF 1310 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1311 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff), 1312 "\xf2\xaf\xbf\xbf")); 1313 1314 // U+BFFFE 1315 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1316 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe), 1317 "\xf2\xbf\xbf\xbe")); 1318 1319 // U+BFFFF 1320 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1321 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff), 1322 "\xf2\xbf\xbf\xbf")); 1323 1324 // U+CFFFE 1325 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1326 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe), 1327 "\xf3\x8f\xbf\xbe")); 1328 1329 // U+CFFFF 1330 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1331 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF), 1332 "\xf3\x8f\xbf\xbf")); 1333 1334 // U+DFFFE 1335 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1336 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe), 1337 "\xf3\x9f\xbf\xbe")); 1338 1339 // U+DFFFF 1340 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1341 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff), 1342 "\xf3\x9f\xbf\xbf")); 1343 1344 // U+EFFFE 1345 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1346 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe), 1347 "\xf3\xaf\xbf\xbe")); 1348 1349 // U+EFFFF 1350 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1351 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff), 1352 "\xf3\xaf\xbf\xbf")); 1353 1354 // U+FFFFE 1355 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1356 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe), 1357 "\xf3\xbf\xbf\xbe")); 1358 1359 // U+FFFFF 1360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1361 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff), 1362 "\xf3\xbf\xbf\xbf")); 1363 1364 // U+10FFFE 1365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1366 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe), 1367 
"\xf4\x8f\xbf\xbe")); 1368 1369 // U+10FFFF 1370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1371 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 1372 "\xf4\x8f\xbf\xbf")); 1373 1374 // U+FDD0 1375 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1376 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0), 1377 "\xef\xb7\x90")); 1378 1379 // U+FDD1 1380 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1381 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1), 1382 "\xef\xb7\x91")); 1383 1384 // U+FDD2 1385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1386 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2), 1387 "\xef\xb7\x92")); 1388 1389 // U+FDD3 1390 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1391 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3), 1392 "\xef\xb7\x93")); 1393 1394 // U+FDD4 1395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1396 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4), 1397 "\xef\xb7\x94")); 1398 1399 // U+FDD5 1400 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1401 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5), 1402 "\xef\xb7\x95")); 1403 1404 // U+FDD6 1405 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1406 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6), 1407 "\xef\xb7\x96")); 1408 1409 // U+FDD7 1410 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1411 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7), 1412 "\xef\xb7\x97")); 1413 1414 // U+FDD8 1415 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1416 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8), 1417 "\xef\xb7\x98")); 1418 1419 // U+FDD9 1420 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1421 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9), 1422 "\xef\xb7\x99")); 1423 1424 // U+FDDA 1425 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1426 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda), 1427 "\xef\xb7\x9a")); 1428 1429 // U+FDDB 1430 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1431 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb), 1432 "\xef\xb7\x9b")); 1433 1434 // U+FDDC 1435 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1436 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc), 1437 "\xef\xb7\x9c")); 1438 1439 // U+FDDD 1440 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1441 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd), 1442 "\xef\xb7\x9d")); 1443 1444 // U+FDDE 1445 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1446 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde), 1447 "\xef\xb7\x9e")); 1448 1449 // U+FDDF 1450 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1451 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf), 1452 "\xef\xb7\x9f")); 1453 1454 // U+FDE0 1455 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1456 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0), 1457 "\xef\xb7\xa0")); 1458 1459 // U+FDE1 1460 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1461 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1), 1462 "\xef\xb7\xa1")); 1463 1464 // U+FDE2 1465 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1466 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2), 1467 "\xef\xb7\xa2")); 1468 1469 // U+FDE3 1470 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1471 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3), 1472 "\xef\xb7\xa3")); 1473 1474 // U+FDE4 1475 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1476 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4), 1477 "\xef\xb7\xa4")); 1478 1479 // U+FDE5 1480 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1481 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5), 1482 "\xef\xb7\xa5")); 1483 1484 // U+FDE6 1485 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1486 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6), 1487 "\xef\xb7\xa6")); 1488 1489 // U+FDE7 1490 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1491 
ConvertUTFResultContainer(conversionOK).withScalars(0xfde7), 1492 "\xef\xb7\xa7")); 1493 1494 // U+FDE8 1495 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1496 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8), 1497 "\xef\xb7\xa8")); 1498 1499 // U+FDE9 1500 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1501 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9), 1502 "\xef\xb7\xa9")); 1503 1504 // U+FDEA 1505 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1506 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea), 1507 "\xef\xb7\xaa")); 1508 1509 // U+FDEB 1510 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1511 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb), 1512 "\xef\xb7\xab")); 1513 1514 // U+FDEC 1515 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1516 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec), 1517 "\xef\xb7\xac")); 1518 1519 // U+FDED 1520 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1521 ConvertUTFResultContainer(conversionOK).withScalars(0xfded), 1522 "\xef\xb7\xad")); 1523 1524 // U+FDEE 1525 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1526 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee), 1527 "\xef\xb7\xae")); 1528 1529 // U+FDEF 1530 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1531 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef), 1532 "\xef\xb7\xaf")); 1533 1534 // U+FDF0 1535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1536 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0), 1537 "\xef\xb7\xb0")); 1538 1539 // U+FDF1 1540 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1541 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1), 1542 "\xef\xb7\xb1")); 1543 1544 // U+FDF2 1545 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1546 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2), 1547 "\xef\xb7\xb2")); 1548 1549 // U+FDF3 1550 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1551 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3), 1552 
"\xef\xb7\xb3")); 1553 1554 // U+FDF4 1555 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1556 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4), 1557 "\xef\xb7\xb4")); 1558 1559 // U+FDF5 1560 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1561 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5), 1562 "\xef\xb7\xb5")); 1563 1564 // U+FDF6 1565 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1566 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6), 1567 "\xef\xb7\xb6")); 1568 1569 // U+FDF7 1570 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1571 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7), 1572 "\xef\xb7\xb7")); 1573 1574 // U+FDF8 1575 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1576 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8), 1577 "\xef\xb7\xb8")); 1578 1579 // U+FDF9 1580 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1581 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9), 1582 "\xef\xb7\xb9")); 1583 1584 // U+FDFA 1585 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1586 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa), 1587 "\xef\xb7\xba")); 1588 1589 // U+FDFB 1590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1591 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb), 1592 "\xef\xb7\xbb")); 1593 1594 // U+FDFC 1595 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1596 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc), 1597 "\xef\xb7\xbc")); 1598 1599 // U+FDFD 1600 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1601 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd), 1602 "\xef\xb7\xbd")); 1603 1604 // U+FDFE 1605 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1606 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe), 1607 "\xef\xb7\xbe")); 1608 1609 // U+FDFF 1610 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1611 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff), 1612 "\xef\xb7\xbf")); 1613 } 1614 1615 TEST(ConvertUTFTest, 
UTF8ToUTF32PartialLenient) {
  // Every call below passes `true` as the third argument of
  // CheckConvertUTF8ToUnicodeScalars. NOTE(review): going by the test name
  // this presumably selects partial-input conversion -- confirm against the
  // helper's definition.

  // Checks Input against an expected result of sourceExhausted with no
  // scalars attached.
  auto CheckTruncated = [](const char *Input) {
    return CheckConvertUTF8ToUnicodeScalars(
        ConvertUTFResultContainer(sourceExhausted), Input, true);
  };

  // Complete sequence: U+0041 LATIN CAPITAL LETTER A is expected to convert
  // with conversionOK.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41",
      true));

  //
  // Sequences with one continuation byte missing
  //

  // Two-byte sequences: only the lead byte is present.
  EXPECT_TRUE(CheckTruncated("\xc2"));
  EXPECT_TRUE(CheckTruncated("\xdf"));

  // Three-byte sequences: lead byte plus one continuation byte.
  EXPECT_TRUE(CheckTruncated("\xe0\xa0"));
  EXPECT_TRUE(CheckTruncated("\xe0\xbf"));
  EXPECT_TRUE(CheckTruncated("\xe1\x80"));
  EXPECT_TRUE(CheckTruncated("\xec\xbf"));
  EXPECT_TRUE(CheckTruncated("\xed\x80"));
  EXPECT_TRUE(CheckTruncated("\xed\x9f"));
  EXPECT_TRUE(CheckTruncated("\xee\x80"));
  EXPECT_TRUE(CheckTruncated("\xef\xbf"));

  // Four-byte sequences: lead byte plus two continuation bytes.
  EXPECT_TRUE(CheckTruncated("\xf0\x90\x80"));
  EXPECT_TRUE(CheckTruncated("\xf0\xbf\xbf"));
  EXPECT_TRUE(CheckTruncated("\xf1\x80\x80"));
  EXPECT_TRUE(CheckTruncated("\xf3\xbf\xbf"));
  EXPECT_TRUE(CheckTruncated("\xf4\x80\x80"));
  EXPECT_TRUE(CheckTruncated("\xf4\x8f\xbf"));

  // A complete scalar followed by a truncated sequence: the expected result
  // is sourceExhausted with the leading U+0041 still among the scalars.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
      "\x41\xc2", true));
}
