1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/Support/ConvertUTF.h" 11 #include "gtest/gtest.h" 12 #include <string> 13 #include <vector> 14 #include <utility> 15 16 using namespace llvm; 17 18 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) { 19 // Src is the look of disapproval. 20 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c"; 21 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 22 std::string Result; 23 bool Success = convertUTF16ToUTF8String(Ref, Result); 24 EXPECT_TRUE(Success); 25 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 26 EXPECT_EQ(Expected, Result); 27 } 28 29 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) { 30 // Src is the look of disapproval. 
31 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0"; 32 ArrayRef<char> Ref(Src, sizeof(Src) - 1); 33 std::string Result; 34 bool Success = convertUTF16ToUTF8String(Ref, Result); 35 EXPECT_TRUE(Success); 36 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0"); 37 EXPECT_EQ(Expected, Result); 38 } 39 40 TEST(ConvertUTFTest, OddLengthInput) { 41 std::string Result; 42 bool Success = convertUTF16ToUTF8String(ArrayRef<char>("xxxxx", 5), Result); 43 EXPECT_FALSE(Success); 44 } 45 46 TEST(ConvertUTFTest, Empty) { 47 std::string Result; 48 bool Success = convertUTF16ToUTF8String(ArrayRef<char>(), Result); 49 EXPECT_TRUE(Success); 50 EXPECT_TRUE(Result.empty()); 51 } 52 53 TEST(ConvertUTFTest, HasUTF16BOM) { 54 bool HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xff\xfe", 2)); 55 EXPECT_TRUE(HasBOM); 56 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff", 2)); 57 EXPECT_TRUE(HasBOM); 58 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff ", 3)); 59 EXPECT_TRUE(HasBOM); // Don't care about odd lengths. 
60 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff\x00asdf", 6)); 61 EXPECT_TRUE(HasBOM); 62 63 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>()); 64 EXPECT_FALSE(HasBOM); 65 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe", 1)); 66 EXPECT_FALSE(HasBOM); 67 } 68 69 struct ConvertUTFResultContainer { 70 ConversionResult ErrorCode; 71 std::vector<unsigned> UnicodeScalars; 72 73 ConvertUTFResultContainer(ConversionResult ErrorCode) 74 : ErrorCode(ErrorCode) {} 75 76 ConvertUTFResultContainer 77 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000, 78 unsigned US2 = 0x110000, unsigned US3 = 0x110000, 79 unsigned US4 = 0x110000, unsigned US5 = 0x110000, 80 unsigned US6 = 0x110000, unsigned US7 = 0x110000) { 81 ConvertUTFResultContainer Result(*this); 82 if (US0 != 0x110000) 83 Result.UnicodeScalars.push_back(US0); 84 if (US1 != 0x110000) 85 Result.UnicodeScalars.push_back(US1); 86 if (US2 != 0x110000) 87 Result.UnicodeScalars.push_back(US2); 88 if (US3 != 0x110000) 89 Result.UnicodeScalars.push_back(US3); 90 if (US4 != 0x110000) 91 Result.UnicodeScalars.push_back(US4); 92 if (US5 != 0x110000) 93 Result.UnicodeScalars.push_back(US5); 94 if (US6 != 0x110000) 95 Result.UnicodeScalars.push_back(US6); 96 if (US7 != 0x110000) 97 Result.UnicodeScalars.push_back(US7); 98 return Result; 99 } 100 }; 101 102 std::pair<ConversionResult, std::vector<unsigned>> 103 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) { 104 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 105 106 const UTF8 *SourceNext = SourceStart; 107 std::vector<UTF32> Decoded(S.size(), 0); 108 UTF32 *TargetStart = Decoded.data(); 109 110 auto ErrorCode = 111 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, 112 Decoded.data() + Decoded.size(), lenientConversion); 113 114 Decoded.resize(TargetStart - Decoded.data()); 115 116 return std::make_pair(ErrorCode, Decoded); 117 } 118 119 std::pair<ConversionResult, std::vector<unsigned>> 120 
ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { 121 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); 122 123 const UTF8 *SourceNext = SourceStart; 124 std::vector<UTF32> Decoded(S.size(), 0); 125 UTF32 *TargetStart = Decoded.data(); 126 127 auto ErrorCode = ConvertUTF8toUTF32Partial( 128 &SourceNext, SourceStart + S.size(), &TargetStart, 129 Decoded.data() + Decoded.size(), lenientConversion); 130 131 Decoded.resize(TargetStart - Decoded.data()); 132 133 return std::make_pair(ErrorCode, Decoded); 134 } 135 136 ::testing::AssertionResult 137 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected, 138 StringRef S, bool Partial = false) { 139 ConversionResult ErrorCode; 140 std::vector<unsigned> Decoded; 141 if (!Partial) 142 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S); 143 else 144 145 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); 146 if (Expected.ErrorCode != ErrorCode) 147 return ::testing::AssertionFailure() << "Expected error code " 148 << Expected.ErrorCode << ", actual " 149 << ErrorCode; 150 151 if (Expected.UnicodeScalars != Decoded) 152 return ::testing::AssertionFailure() 153 << "Expected lenient decoded result:\n" 154 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n" 155 << "Actual result:\n" << ::testing::PrintToString(Decoded); 156 157 return ::testing::AssertionSuccess(); 158 } 159 160 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) { 161 162 // 163 // 1-byte sequences 164 // 165 166 // U+0041 LATIN CAPITAL LETTER A 167 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 168 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41")); 169 170 // 171 // 2-byte sequences 172 // 173 174 // U+0283 LATIN SMALL LETTER ESH 175 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 176 ConvertUTFResultContainer(conversionOK).withScalars(0x0283), 177 "\xca\x83")); 178 179 // U+03BA GREEK SMALL LETTER KAPPA 180 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA 181 // 
U+03C3 GREEK SMALL LETTER SIGMA 182 // U+03BC GREEK SMALL LETTER MU 183 // U+03B5 GREEK SMALL LETTER EPSILON 184 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 185 ConvertUTFResultContainer(conversionOK) 186 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5), 187 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5")); 188 189 // 190 // 3-byte sequences 191 // 192 193 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B 194 // U+6587 CJK UNIFIED IDEOGRAPH-6587 195 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 196 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587), 197 "\xe4\xbe\x8b\xe6\x96\x87")); 198 199 // U+D55C HANGUL SYLLABLE HAN 200 // U+AE00 HANGUL SYLLABLE GEUL 201 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 202 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00), 203 "\xed\x95\x9c\xea\xb8\x80")); 204 205 // U+1112 HANGUL CHOSEONG HIEUH 206 // U+1161 HANGUL JUNGSEONG A 207 // U+11AB HANGUL JONGSEONG NIEUN 208 // U+1100 HANGUL CHOSEONG KIYEOK 209 // U+1173 HANGUL JUNGSEONG EU 210 // U+11AF HANGUL JONGSEONG RIEUL 211 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 212 ConvertUTFResultContainer(conversionOK) 213 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af), 214 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3" 215 "\xe1\x86\xaf")); 216 217 // 218 // 4-byte sequences 219 // 220 221 // U+E0100 VARIATION SELECTOR-17 222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 223 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100), 224 "\xf3\xa0\x84\x80")); 225 226 // 227 // First possible sequence of a certain length 228 // 229 230 // U+0000 NULL 231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 232 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 233 StringRef("\x00", 1))); 234 235 // U+0080 PADDING CHARACTER 236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 237 ConvertUTFResultContainer(conversionOK).withScalars(0x0080), 238 "\xc2\x80")); 239 240 // U+0800 SAMARITAN LETTER ALAF 241 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 242 ConvertUTFResultContainer(conversionOK).withScalars(0x0800), 243 "\xe0\xa0\x80")); 244 245 // U+10000 LINEAR B SYLLABLE B008 A 246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 247 ConvertUTFResultContainer(conversionOK).withScalars(0x10000), 248 "\xf0\x90\x80\x80")); 249 250 // U+200000 (invalid) 251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 252 ConvertUTFResultContainer(sourceIllegal) 253 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 254 "\xf8\x88\x80\x80\x80")); 255 256 // U+4000000 (invalid) 257 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 258 ConvertUTFResultContainer(sourceIllegal) 259 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 260 "\xfc\x84\x80\x80\x80\x80")); 261 262 // 263 // Last possible sequence of a certain length 264 // 265 266 // U+007F DELETE 267 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 268 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f")); 269 270 // U+07FF (unassigned) 271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 272 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff), 273 "\xdf\xbf")); 274 275 // U+FFFF (noncharacter) 276 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 277 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 278 "\xef\xbf\xbf")); 279 280 // U+1FFFFF (invalid) 281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 282 ConvertUTFResultContainer(sourceIllegal) 283 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 284 "\xf7\xbf\xbf\xbf")); 285 286 // U+3FFFFFF (invalid) 287 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 288 ConvertUTFResultContainer(sourceIllegal) 289 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 290 "\xfb\xbf\xbf\xbf\xbf")); 291 292 // U+7FFFFFFF (invalid) 293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 294 ConvertUTFResultContainer(sourceIllegal) 295 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 296 "\xfd\xbf\xbf\xbf\xbf\xbf")); 297 298 // 299 // Other boundary conditions 300 
// 301 302 // U+D7FF (unassigned) 303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 304 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff), 305 "\xed\x9f\xbf")); 306 307 // U+E000 (private use) 308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 309 ConvertUTFResultContainer(conversionOK).withScalars(0xe000), 310 "\xee\x80\x80")); 311 312 // U+FFFD REPLACEMENT CHARACTER 313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 314 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd), 315 "\xef\xbf\xbd")); 316 317 // U+10FFFF (noncharacter) 318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 319 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 320 "\xf4\x8f\xbf\xbf")); 321 322 // U+110000 (invalid) 323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 324 ConvertUTFResultContainer(sourceIllegal) 325 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 326 "\xf4\x90\x80\x80")); 327 328 // 329 // Unexpected continuation bytes 330 // 331 332 // A sequence of unexpected continuation bytes that don't follow a first 333 // byte, every byte is a maximal subpart. 
334 335 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 336 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80")); 337 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 338 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf")); 339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 340 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 341 "\x80\x80")); 342 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 343 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 344 "\x80\xbf")); 345 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 346 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 347 "\xbf\x80")); 348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 349 ConvertUTFResultContainer(sourceIllegal) 350 .withScalars(0xfffd, 0xfffd, 0xfffd), 351 "\x80\xbf\x80")); 352 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 353 ConvertUTFResultContainer(sourceIllegal) 354 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 355 "\x80\xbf\x80\xbf")); 356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 357 ConvertUTFResultContainer(sourceIllegal) 358 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 359 "\x80\xbf\x82\xbf\xaa")); 360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 361 ConvertUTFResultContainer(sourceIllegal) 362 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 363 "\xaa\xb0\xbb\xbf\xaa\xa0")); 364 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 365 ConvertUTFResultContainer(sourceIllegal) 366 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 367 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f")); 368 369 // All continuation bytes (0x80--0xbf). 
370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 371 ConvertUTFResultContainer(sourceIllegal) 372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 373 0xfffd, 0xfffd, 0xfffd, 0xfffd) 374 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 375 0xfffd, 0xfffd, 0xfffd, 0xfffd) 376 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 377 0xfffd, 0xfffd, 0xfffd, 0xfffd) 378 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 379 0xfffd, 0xfffd, 0xfffd, 0xfffd) 380 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 381 0xfffd, 0xfffd, 0xfffd, 0xfffd) 382 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 383 0xfffd, 0xfffd, 0xfffd, 0xfffd) 384 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 385 0xfffd, 0xfffd, 0xfffd, 0xfffd) 386 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 387 0xfffd, 0xfffd, 0xfffd, 0xfffd), 388 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" 389 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" 390 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" 391 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf")); 392 393 // 394 // Lonely start bytes 395 // 396 397 // Start bytes of 2-byte sequences (0xc0--0xdf). 
398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 399 ConvertUTFResultContainer(sourceIllegal) 400 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 401 0xfffd, 0xfffd, 0xfffd, 0xfffd) 402 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 403 0xfffd, 0xfffd, 0xfffd, 0xfffd) 404 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 405 0xfffd, 0xfffd, 0xfffd, 0xfffd) 406 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 407 0xfffd, 0xfffd, 0xfffd, 0xfffd), 408 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" 409 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf")); 410 411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 412 ConvertUTFResultContainer(sourceIllegal) 413 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 414 0xfffd, 0x0020, 0xfffd, 0x0020) 415 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 416 0xfffd, 0x0020, 0xfffd, 0x0020) 417 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 418 0xfffd, 0x0020, 0xfffd, 0x0020) 419 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 420 0xfffd, 0x0020, 0xfffd, 0x0020) 421 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 422 0xfffd, 0x0020, 0xfffd, 0x0020) 423 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 424 0xfffd, 0x0020, 0xfffd, 0x0020) 425 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 426 0xfffd, 0x0020, 0xfffd, 0x0020) 427 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 428 0xfffd, 0x0020, 0xfffd, 0x0020), 429 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20" 430 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20" 431 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20" 432 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20")); 433 434 // Start bytes of 3-byte sequences (0xe0--0xef). 
435 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 436 ConvertUTFResultContainer(sourceIllegal) 437 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 438 0xfffd, 0xfffd, 0xfffd, 0xfffd) 439 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 440 0xfffd, 0xfffd, 0xfffd, 0xfffd), 441 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef")); 442 443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 444 ConvertUTFResultContainer(sourceIllegal) 445 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 446 0xfffd, 0x0020, 0xfffd, 0x0020) 447 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 448 0xfffd, 0x0020, 0xfffd, 0x0020) 449 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 450 0xfffd, 0x0020, 0xfffd, 0x0020) 451 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 452 0xfffd, 0x0020, 0xfffd, 0x0020), 453 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20" 454 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20")); 455 456 // Start bytes of 4-byte sequences (0xf0--0xf7). 457 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 458 ConvertUTFResultContainer(sourceIllegal) 459 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 460 0xfffd, 0xfffd, 0xfffd, 0xfffd), 461 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7")); 462 463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 464 ConvertUTFResultContainer(sourceIllegal) 465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 466 0xfffd, 0x0020, 0xfffd, 0x0020) 467 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 468 0xfffd, 0x0020, 0xfffd, 0x0020), 469 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20")); 470 471 // Start bytes of 5-byte sequences (0xf8--0xfb). 
472 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 473 ConvertUTFResultContainer(sourceIllegal) 474 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 475 "\xf8\xf9\xfa\xfb")); 476 477 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 478 ConvertUTFResultContainer(sourceIllegal) 479 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 480 0xfffd, 0x0020, 0xfffd, 0x0020), 481 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20")); 482 483 // Start bytes of 6-byte sequences (0xfc--0xfd). 484 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 485 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 486 "\xfc\xfd")); 487 488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 489 ConvertUTFResultContainer(sourceIllegal) 490 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020), 491 "\xfc\x20\xfd\x20")); 492 493 // 494 // Other bytes (0xc0--0xc1, 0xfe--0xff). 495 // 496 497 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 498 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0")); 499 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 500 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1")); 501 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 502 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe")); 503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 504 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff")); 505 506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 507 ConvertUTFResultContainer(sourceIllegal) 508 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 509 "\xc0\xc1\xfe\xff")); 510 511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 512 ConvertUTFResultContainer(sourceIllegal) 513 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 514 "\xfe\xfe\xff\xff")); 515 516 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 517 ConvertUTFResultContainer(sourceIllegal) 518 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 519 "\xfe\x80\x80\x80\x80\x80")); 520 521 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 522 ConvertUTFResultContainer(sourceIllegal) 523 
.withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 524 "\xff\x80\x80\x80\x80\x80")); 525 526 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 527 ConvertUTFResultContainer(sourceIllegal) 528 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, 529 0xfffd, 0x0020, 0xfffd, 0x0020), 530 "\xc0\x20\xc1\x20\xfe\x20\xff\x20")); 531 532 // 533 // Sequences with one continuation byte missing 534 // 535 536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 537 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2")); 538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 539 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf")); 540 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 541 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 542 "\xe0\xa0")); 543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 544 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 545 "\xe0\xbf")); 546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 547 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 548 "\xe1\x80")); 549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 550 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 551 "\xec\xbf")); 552 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 553 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 554 "\xed\x80")); 555 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 556 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 557 "\xed\x9f")); 558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 559 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 560 "\xee\x80")); 561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 562 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 563 "\xef\xbf")); 564 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 565 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 566 "\xf0\x90\x80")); 567 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 568 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 569 "\xf0\xbf\xbf")); 
570 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 571 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 572 "\xf1\x80\x80")); 573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 574 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 575 "\xf3\xbf\xbf")); 576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 577 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 578 "\xf4\x80\x80")); 579 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 580 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 581 "\xf4\x8f\xbf")); 582 583 // Overlong sequences with one trailing byte missing. 584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 585 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 586 "\xc0")); 587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 588 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 589 "\xc1")); 590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 592 "\xe0\x80")); 593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 595 "\xe0\x9f")); 596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 597 ConvertUTFResultContainer(sourceIllegal) 598 .withScalars(0xfffd, 0xfffd, 0xfffd), 599 "\xf0\x80\x80")); 600 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 601 ConvertUTFResultContainer(sourceIllegal) 602 .withScalars(0xfffd, 0xfffd, 0xfffd), 603 "\xf0\x8f\x80")); 604 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 605 ConvertUTFResultContainer(sourceIllegal) 606 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 607 "\xf8\x80\x80\x80")); 608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 609 ConvertUTFResultContainer(sourceIllegal) 610 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 611 "\xfc\x80\x80\x80\x80")); 612 613 // Sequences that represent surrogates with one trailing byte missing. 
614 // High surrogates 615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 616 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 617 "\xed\xa0")); 618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 619 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 620 "\xed\xac")); 621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 622 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 623 "\xed\xaf")); 624 // Low surrogates 625 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 626 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 627 "\xed\xb0")); 628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 629 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 630 "\xed\xb4")); 631 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 632 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 633 "\xed\xbf")); 634 635 // Ill-formed 4-byte sequences. 636 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 637 // U+1100xx (invalid) 638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 639 ConvertUTFResultContainer(sourceIllegal) 640 .withScalars(0xfffd, 0xfffd, 0xfffd), 641 "\xf4\x90\x80")); 642 // U+13FBxx (invalid) 643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 644 ConvertUTFResultContainer(sourceIllegal) 645 .withScalars(0xfffd, 0xfffd, 0xfffd), 646 "\xf4\xbf\xbf")); 647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 648 ConvertUTFResultContainer(sourceIllegal) 649 .withScalars(0xfffd, 0xfffd, 0xfffd), 650 "\xf5\x80\x80")); 651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 652 ConvertUTFResultContainer(sourceIllegal) 653 .withScalars(0xfffd, 0xfffd, 0xfffd), 654 "\xf6\x80\x80")); 655 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 656 ConvertUTFResultContainer(sourceIllegal) 657 .withScalars(0xfffd, 0xfffd, 0xfffd), 658 "\xf7\x80\x80")); 659 // U+1FFBxx (invalid) 660 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 661 ConvertUTFResultContainer(sourceIllegal) 662 .withScalars(0xfffd, 0xfffd, 0xfffd), 663 
"\xf7\xbf\xbf")); 664 665 // Ill-formed 5-byte sequences. 666 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 667 // U+2000xx (invalid) 668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 669 ConvertUTFResultContainer(sourceIllegal) 670 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 671 "\xf8\x88\x80\x80")); 672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 673 ConvertUTFResultContainer(sourceIllegal) 674 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 675 "\xf8\xbf\xbf\xbf")); 676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 677 ConvertUTFResultContainer(sourceIllegal) 678 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 679 "\xf9\x80\x80\x80")); 680 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 681 ConvertUTFResultContainer(sourceIllegal) 682 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 683 "\xfa\x80\x80\x80")); 684 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 685 ConvertUTFResultContainer(sourceIllegal) 686 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 687 "\xfb\x80\x80\x80")); 688 // U+3FFFFxx (invalid) 689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 690 ConvertUTFResultContainer(sourceIllegal) 691 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 692 "\xfb\xbf\xbf\xbf")); 693 694 // Ill-formed 6-byte sequences. 
695 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx 696 // U+40000xx (invalid) 697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 698 ConvertUTFResultContainer(sourceIllegal) 699 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 700 "\xfc\x84\x80\x80\x80")); 701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 702 ConvertUTFResultContainer(sourceIllegal) 703 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 704 "\xfc\xbf\xbf\xbf\xbf")); 705 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 706 ConvertUTFResultContainer(sourceIllegal) 707 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 708 "\xfd\x80\x80\x80\x80")); 709 // U+7FFFFFxx (invalid) 710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 711 ConvertUTFResultContainer(sourceIllegal) 712 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 713 "\xfd\xbf\xbf\xbf\xbf")); 714 715 // 716 // Sequences with two continuation bytes missing 717 // 718 719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 720 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 721 "\xf0\x90")); 722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 723 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 724 "\xf0\xbf")); 725 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 726 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 727 "\xf1\x80")); 728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 729 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 730 "\xf3\xbf")); 731 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 732 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 733 "\xf4\x80")); 734 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 735 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), 736 "\xf4\x8f")); 737 738 // Overlong sequences with two trailing byte missing. 
739 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 740 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0")); 741 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 742 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 743 "\xf0\x80")); 744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 745 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 746 "\xf0\x8f")); 747 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 748 ConvertUTFResultContainer(sourceIllegal) 749 .withScalars(0xfffd, 0xfffd, 0xfffd), 750 "\xf8\x80\x80")); 751 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 752 ConvertUTFResultContainer(sourceIllegal) 753 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 754 "\xfc\x80\x80\x80")); 755 756 // Sequences that represent surrogates with two trailing bytes missing. 757 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 758 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed")); 759 760 // Ill-formed 4-byte sequences. 761 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 762 // U+110yxx (invalid) 763 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 764 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 765 "\xf4\x90")); 766 // U+13Fyxx (invalid) 767 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 768 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 769 "\xf4\xbf")); 770 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 771 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 772 "\xf5\x80")); 773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 774 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 775 "\xf6\x80")); 776 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 777 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 778 "\xf7\x80")); 779 // U+1FFyxx (invalid) 780 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 781 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 782 "\xf7\xbf")); 783 784 // Ill-formed 5-byte sequences. 
785 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 786 // U+200yxx (invalid) 787 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 788 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 789 "\xf8\x88\x80")); 790 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 791 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 792 "\xf8\xbf\xbf")); 793 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 794 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 795 "\xf9\x80\x80")); 796 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 797 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 798 "\xfa\x80\x80")); 799 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 800 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 801 "\xfb\x80\x80")); 802 // U+3FFFyxx (invalid) 803 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 804 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 805 "\xfb\xbf\xbf")); 806 807 // Ill-formed 6-byte sequences. 
808 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 809 // U+4000yxx (invalid) 810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 812 "\xfc\x84\x80\x80")); 813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 815 "\xfc\xbf\xbf\xbf")); 816 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 817 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 818 "\xfd\x80\x80\x80")); 819 // U+7FFFFyxx (invalid) 820 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 821 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 822 "\xfd\xbf\xbf\xbf")); 823 824 // 825 // Sequences with three continuation bytes missing 826 // 827 828 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 829 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 830 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 831 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1")); 832 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 833 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2")); 834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3")); 836 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 837 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4")); 838 839 // Broken overlong sequences. 840 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 841 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0")); 842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 843 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 844 "\xf8\x80")); 845 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 846 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 847 "\xfc\x80\x80")); 848 849 // Ill-formed 4-byte sequences. 
850 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 851 // U+14yyxx (invalid) 852 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 853 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5")); 854 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 855 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6")); 856 // U+1Cyyxx (invalid) 857 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 858 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7")); 859 860 // Ill-formed 5-byte sequences. 861 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 862 // U+20yyxx (invalid) 863 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 864 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 865 "\xf8\x88")); 866 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 867 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 868 "\xf8\xbf")); 869 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 870 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 871 "\xf9\x80")); 872 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 873 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 874 "\xfa\x80")); 875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 876 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 877 "\xfb\x80")); 878 // U+3FCyyxx (invalid) 879 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 880 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 881 "\xfb\xbf")); 882 883 // Ill-formed 6-byte sequences. 
884 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 885 // U+400yyxx (invalid) 886 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 887 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 888 "\xfc\x84\x80")); 889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 890 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 891 "\xfc\xbf\xbf")); 892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 894 "\xfd\x80\x80")); 895 // U+7FFCyyxx (invalid) 896 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 897 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), 898 "\xfd\xbf\xbf")); 899 900 // 901 // Sequences with four continuation bytes missing 902 // 903 904 // Ill-formed 5-byte sequences. 905 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 906 // U+uzyyxx (invalid) 907 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 908 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 909 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 910 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9")); 911 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 912 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa")); 913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 914 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 915 // U+3zyyxx (invalid) 916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 917 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb")); 918 919 // Broken overlong sequences. 920 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 921 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8")); 922 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 923 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 924 "\xfc\x80")); 925 926 // Ill-formed 6-byte sequences. 
927 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 928 // U+uzzyyxx (invalid) 929 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 930 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 931 "\xfc\x84")); 932 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 933 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 934 "\xfc\xbf")); 935 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 936 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 937 "\xfd\x80")); 938 // U+7Fzzyyxx (invalid) 939 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 940 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 941 "\xfd\xbf")); 942 943 // 944 // Sequences with five continuation bytes missing 945 // 946 947 // Ill-formed 6-byte sequences. 948 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx 949 // U+uzzyyxx (invalid) 950 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 951 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc")); 952 // U+uuzzyyxx (invalid) 953 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 954 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd")); 955 956 // 957 // Consecutive sequences with trailing bytes missing 958 // 959 960 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 961 ConvertUTFResultContainer(sourceIllegal) 962 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 963 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 964 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd) 965 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) 966 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) 967 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 968 "\xc0" "\xe0\x80" "\xf0\x80\x80" 969 "\xf8\x80\x80\x80" 970 "\xfc\x80\x80\x80\x80" 971 "\xdf" "\xef\xbf" "\xf7\xbf\xbf" 972 "\xfb\xbf\xbf\xbf" 973 "\xfd\xbf\xbf\xbf\xbf")); 974 975 // 976 // Overlong UTF-8 sequences 977 // 978 979 // U+002F SOLIDUS 980 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 981 
ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f")); 982 983 // Overlong sequences of the above. 984 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 985 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 986 "\xc0\xaf")); 987 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 988 ConvertUTFResultContainer(sourceIllegal) 989 .withScalars(0xfffd, 0xfffd, 0xfffd), 990 "\xe0\x80\xaf")); 991 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 992 ConvertUTFResultContainer(sourceIllegal) 993 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 994 "\xf0\x80\x80\xaf")); 995 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 996 ConvertUTFResultContainer(sourceIllegal) 997 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 998 "\xf8\x80\x80\x80\xaf")); 999 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1000 ConvertUTFResultContainer(sourceIllegal) 1001 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1002 "\xfc\x80\x80\x80\x80\xaf")); 1003 1004 // U+0000 NULL 1005 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1006 ConvertUTFResultContainer(conversionOK).withScalars(0x0000), 1007 StringRef("\x00", 1))); 1008 1009 // Overlong sequences of the above. 
1010 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1011 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1012 "\xc0\x80")); 1013 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1014 ConvertUTFResultContainer(sourceIllegal) 1015 .withScalars(0xfffd, 0xfffd, 0xfffd), 1016 "\xe0\x80\x80")); 1017 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1018 ConvertUTFResultContainer(sourceIllegal) 1019 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1020 "\xf0\x80\x80\x80")); 1021 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1022 ConvertUTFResultContainer(sourceIllegal) 1023 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1024 "\xf8\x80\x80\x80\x80")); 1025 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1026 ConvertUTFResultContainer(sourceIllegal) 1027 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1028 "\xfc\x80\x80\x80\x80\x80")); 1029 1030 // Other overlong sequences. 1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1032 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1033 "\xc0\xbf")); 1034 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1035 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1036 "\xc1\x80")); 1037 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1038 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), 1039 "\xc1\xbf")); 1040 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1041 ConvertUTFResultContainer(sourceIllegal) 1042 .withScalars(0xfffd, 0xfffd, 0xfffd), 1043 "\xe0\x9f\xbf")); 1044 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1045 ConvertUTFResultContainer(sourceIllegal) 1046 .withScalars(0xfffd, 0xfffd, 0xfffd), 1047 "\xed\xa0\x80")); 1048 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1049 ConvertUTFResultContainer(sourceIllegal) 1050 .withScalars(0xfffd, 0xfffd, 0xfffd), 1051 "\xed\xbf\xbf")); 1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1053 ConvertUTFResultContainer(sourceIllegal) 1054 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1055 
"\xf0\x8f\x80\x80")); 1056 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1057 ConvertUTFResultContainer(sourceIllegal) 1058 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), 1059 "\xf0\x8f\xbf\xbf")); 1060 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1061 ConvertUTFResultContainer(sourceIllegal) 1062 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1063 "\xf8\x87\xbf\xbf\xbf")); 1064 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1065 ConvertUTFResultContainer(sourceIllegal) 1066 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1067 "\xfc\x83\xbf\xbf\xbf\xbf")); 1068 1069 // 1070 // Isolated surrogates 1071 // 1072 1073 // Unicode 6.3.0: 1074 // 1075 // D71. High-surrogate code point: A Unicode code point in the range 1076 // U+D800 to U+DBFF. 1077 // 1078 // D73. Low-surrogate code point: A Unicode code point in the range 1079 // U+DC00 to U+DFFF. 1080 1081 // Note: U+E0100 is <DB40 DD00> in UTF16. 1082 1083 // High surrogates 1084 1085 // U+D800 1086 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1087 ConvertUTFResultContainer(sourceIllegal) 1088 .withScalars(0xfffd, 0xfffd, 0xfffd), 1089 "\xed\xa0\x80")); 1090 1091 // U+DB40 1092 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1093 ConvertUTFResultContainer(sourceIllegal) 1094 .withScalars(0xfffd, 0xfffd, 0xfffd), 1095 "\xed\xac\xa0")); 1096 1097 // U+DBFF 1098 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1099 ConvertUTFResultContainer(sourceIllegal) 1100 .withScalars(0xfffd, 0xfffd, 0xfffd), 1101 "\xed\xaf\xbf")); 1102 1103 // Low surrogates 1104 1105 // U+DC00 1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1107 ConvertUTFResultContainer(sourceIllegal) 1108 .withScalars(0xfffd, 0xfffd, 0xfffd), 1109 "\xed\xb0\x80")); 1110 1111 // U+DD00 1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1113 ConvertUTFResultContainer(sourceIllegal) 1114 .withScalars(0xfffd, 0xfffd, 0xfffd), 1115 "\xed\xb4\x80")); 1116 1117 // U+DFFF 1118 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1119 
ConvertUTFResultContainer(sourceIllegal) 1120 .withScalars(0xfffd, 0xfffd, 0xfffd), 1121 "\xed\xbf\xbf")); 1122 1123 // Surrogate pairs 1124 1125 // U+D800 U+DC00 1126 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1127 ConvertUTFResultContainer(sourceIllegal) 1128 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1129 "\xed\xa0\x80\xed\xb0\x80")); 1130 1131 // U+D800 U+DD00 1132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1133 ConvertUTFResultContainer(sourceIllegal) 1134 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1135 "\xed\xa0\x80\xed\xb4\x80")); 1136 1137 // U+D800 U+DFFF 1138 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1139 ConvertUTFResultContainer(sourceIllegal) 1140 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1141 "\xed\xa0\x80\xed\xbf\xbf")); 1142 1143 // U+DB40 U+DC00 1144 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1145 ConvertUTFResultContainer(sourceIllegal) 1146 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1147 "\xed\xac\xa0\xed\xb0\x80")); 1148 1149 // U+DB40 U+DD00 1150 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1151 ConvertUTFResultContainer(sourceIllegal) 1152 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1153 "\xed\xac\xa0\xed\xb4\x80")); 1154 1155 // U+DB40 U+DFFF 1156 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1157 ConvertUTFResultContainer(sourceIllegal) 1158 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1159 "\xed\xac\xa0\xed\xbf\xbf")); 1160 1161 // U+DBFF U+DC00 1162 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1163 ConvertUTFResultContainer(sourceIllegal) 1164 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1165 "\xed\xaf\xbf\xed\xb0\x80")); 1166 1167 // U+DBFF U+DD00 1168 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1169 ConvertUTFResultContainer(sourceIllegal) 1170 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1171 "\xed\xaf\xbf\xed\xb4\x80")); 1172 1173 // U+DBFF U+DFFF 1174 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1175 ConvertUTFResultContainer(sourceIllegal) 1176 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), 1177 "\xed\xaf\xbf\xed\xbf\xbf")); 1178 1179 // 1180 // Noncharacters 1181 // 1182 1183 // Unicode 6.3.0: 1184 // 1185 // D14. Noncharacter: A code point that is permanently reserved for 1186 // internal use and that should never be interchanged. Noncharacters 1187 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10) 1188 // and the values U+FDD0..U+FDEF. 1189 1190 // U+FFFE 1191 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1192 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe), 1193 "\xef\xbf\xbe")); 1194 1195 // U+FFFF 1196 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1197 ConvertUTFResultContainer(conversionOK).withScalars(0xffff), 1198 "\xef\xbf\xbf")); 1199 1200 // U+1FFFE 1201 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1202 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe), 1203 "\xf0\x9f\xbf\xbe")); 1204 1205 // U+1FFFF 1206 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1207 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff), 1208 "\xf0\x9f\xbf\xbf")); 1209 1210 // U+2FFFE 1211 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1212 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe), 1213 "\xf0\xaf\xbf\xbe")); 1214 1215 // U+2FFFF 1216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1217 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff), 1218 "\xf0\xaf\xbf\xbf")); 1219 1220 // U+3FFFE 1221 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1222 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe), 1223 "\xf0\xbf\xbf\xbe")); 1224 1225 // U+3FFFF 1226 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1227 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff), 1228 "\xf0\xbf\xbf\xbf")); 1229 1230 // U+4FFFE 1231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1232 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe), 1233 
"\xf1\x8f\xbf\xbe")); 1234 1235 // U+4FFFF 1236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1237 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff), 1238 "\xf1\x8f\xbf\xbf")); 1239 1240 // U+5FFFE 1241 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1242 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe), 1243 "\xf1\x9f\xbf\xbe")); 1244 1245 // U+5FFFF 1246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1247 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff), 1248 "\xf1\x9f\xbf\xbf")); 1249 1250 // U+6FFFE 1251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1252 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe), 1253 "\xf1\xaf\xbf\xbe")); 1254 1255 // U+6FFFF 1256 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1257 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff), 1258 "\xf1\xaf\xbf\xbf")); 1259 1260 // U+7FFFE 1261 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1262 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe), 1263 "\xf1\xbf\xbf\xbe")); 1264 1265 // U+7FFFF 1266 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1267 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff), 1268 "\xf1\xbf\xbf\xbf")); 1269 1270 // U+8FFFE 1271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1272 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe), 1273 "\xf2\x8f\xbf\xbe")); 1274 1275 // U+8FFFF 1276 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1277 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff), 1278 "\xf2\x8f\xbf\xbf")); 1279 1280 // U+9FFFE 1281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1282 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe), 1283 "\xf2\x9f\xbf\xbe")); 1284 1285 // U+9FFFF 1286 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1287 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff), 1288 "\xf2\x9f\xbf\xbf")); 1289 1290 // U+AFFFE 1291 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1292 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe), 1293 
"\xf2\xaf\xbf\xbe")); 1294 1295 // U+AFFFF 1296 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1297 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff), 1298 "\xf2\xaf\xbf\xbf")); 1299 1300 // U+BFFFE 1301 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1302 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe), 1303 "\xf2\xbf\xbf\xbe")); 1304 1305 // U+BFFFF 1306 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1307 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff), 1308 "\xf2\xbf\xbf\xbf")); 1309 1310 // U+CFFFE 1311 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1312 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe), 1313 "\xf3\x8f\xbf\xbe")); 1314 1315 // U+CFFFF 1316 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1317 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF), 1318 "\xf3\x8f\xbf\xbf")); 1319 1320 // U+DFFFE 1321 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1322 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe), 1323 "\xf3\x9f\xbf\xbe")); 1324 1325 // U+DFFFF 1326 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1327 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff), 1328 "\xf3\x9f\xbf\xbf")); 1329 1330 // U+EFFFE 1331 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1332 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe), 1333 "\xf3\xaf\xbf\xbe")); 1334 1335 // U+EFFFF 1336 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1337 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff), 1338 "\xf3\xaf\xbf\xbf")); 1339 1340 // U+FFFFE 1341 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1342 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe), 1343 "\xf3\xbf\xbf\xbe")); 1344 1345 // U+FFFFF 1346 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1347 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff), 1348 "\xf3\xbf\xbf\xbf")); 1349 1350 // U+10FFFE 1351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1352 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe), 1353 
"\xf4\x8f\xbf\xbe")); 1354 1355 // U+10FFFF 1356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1357 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), 1358 "\xf4\x8f\xbf\xbf")); 1359 1360 // U+FDD0 1361 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1362 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0), 1363 "\xef\xb7\x90")); 1364 1365 // U+FDD1 1366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1367 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1), 1368 "\xef\xb7\x91")); 1369 1370 // U+FDD2 1371 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1372 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2), 1373 "\xef\xb7\x92")); 1374 1375 // U+FDD3 1376 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1377 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3), 1378 "\xef\xb7\x93")); 1379 1380 // U+FDD4 1381 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1382 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4), 1383 "\xef\xb7\x94")); 1384 1385 // U+FDD5 1386 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1387 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5), 1388 "\xef\xb7\x95")); 1389 1390 // U+FDD6 1391 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1392 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6), 1393 "\xef\xb7\x96")); 1394 1395 // U+FDD7 1396 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1397 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7), 1398 "\xef\xb7\x97")); 1399 1400 // U+FDD8 1401 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1402 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8), 1403 "\xef\xb7\x98")); 1404 1405 // U+FDD9 1406 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1407 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9), 1408 "\xef\xb7\x99")); 1409 1410 // U+FDDA 1411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1412 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda), 1413 "\xef\xb7\x9a")); 1414 1415 // U+FDDB 1416 
EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1417 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb), 1418 "\xef\xb7\x9b")); 1419 1420 // U+FDDC 1421 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1422 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc), 1423 "\xef\xb7\x9c")); 1424 1425 // U+FDDD 1426 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1427 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd), 1428 "\xef\xb7\x9d")); 1429 1430 // U+FDDE 1431 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1432 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde), 1433 "\xef\xb7\x9e")); 1434 1435 // U+FDDF 1436 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1437 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf), 1438 "\xef\xb7\x9f")); 1439 1440 // U+FDE0 1441 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1442 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0), 1443 "\xef\xb7\xa0")); 1444 1445 // U+FDE1 1446 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1447 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1), 1448 "\xef\xb7\xa1")); 1449 1450 // U+FDE2 1451 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1452 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2), 1453 "\xef\xb7\xa2")); 1454 1455 // U+FDE3 1456 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1457 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3), 1458 "\xef\xb7\xa3")); 1459 1460 // U+FDE4 1461 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1462 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4), 1463 "\xef\xb7\xa4")); 1464 1465 // U+FDE5 1466 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1467 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5), 1468 "\xef\xb7\xa5")); 1469 1470 // U+FDE6 1471 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1472 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6), 1473 "\xef\xb7\xa6")); 1474 1475 // U+FDE7 1476 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1477 
ConvertUTFResultContainer(conversionOK).withScalars(0xfde7), 1478 "\xef\xb7\xa7")); 1479 1480 // U+FDE8 1481 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1482 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8), 1483 "\xef\xb7\xa8")); 1484 1485 // U+FDE9 1486 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1487 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9), 1488 "\xef\xb7\xa9")); 1489 1490 // U+FDEA 1491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1492 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea), 1493 "\xef\xb7\xaa")); 1494 1495 // U+FDEB 1496 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1497 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb), 1498 "\xef\xb7\xab")); 1499 1500 // U+FDEC 1501 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1502 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec), 1503 "\xef\xb7\xac")); 1504 1505 // U+FDED 1506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1507 ConvertUTFResultContainer(conversionOK).withScalars(0xfded), 1508 "\xef\xb7\xad")); 1509 1510 // U+FDEE 1511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1512 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee), 1513 "\xef\xb7\xae")); 1514 1515 // U+FDEF 1516 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1517 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef), 1518 "\xef\xb7\xaf")); 1519 1520 // U+FDF0 1521 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1522 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0), 1523 "\xef\xb7\xb0")); 1524 1525 // U+FDF1 1526 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1527 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1), 1528 "\xef\xb7\xb1")); 1529 1530 // U+FDF2 1531 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1532 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2), 1533 "\xef\xb7\xb2")); 1534 1535 // U+FDF3 1536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1537 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3), 1538 
"\xef\xb7\xb3")); 1539 1540 // U+FDF4 1541 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1542 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4), 1543 "\xef\xb7\xb4")); 1544 1545 // U+FDF5 1546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1547 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5), 1548 "\xef\xb7\xb5")); 1549 1550 // U+FDF6 1551 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1552 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6), 1553 "\xef\xb7\xb6")); 1554 1555 // U+FDF7 1556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1557 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7), 1558 "\xef\xb7\xb7")); 1559 1560 // U+FDF8 1561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1562 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8), 1563 "\xef\xb7\xb8")); 1564 1565 // U+FDF9 1566 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1567 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9), 1568 "\xef\xb7\xb9")); 1569 1570 // U+FDFA 1571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1572 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa), 1573 "\xef\xb7\xba")); 1574 1575 // U+FDFB 1576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1577 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb), 1578 "\xef\xb7\xbb")); 1579 1580 // U+FDFC 1581 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1582 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc), 1583 "\xef\xb7\xbc")); 1584 1585 // U+FDFD 1586 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1587 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd), 1588 "\xef\xb7\xbd")); 1589 1590 // U+FDFE 1591 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1592 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe), 1593 "\xef\xb7\xbe")); 1594 1595 // U+FDFF 1596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( 1597 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff), 1598 "\xef\xb7\xbf")); 1599 } 1600 1601 TEST(ConvertUTFTest, 
UTF8ToUTF32PartialLenient) {
  // Every check in this test passes `true` as the trailing argument of
  // CheckConvertUTF8ToUnicodeScalars. NOTE(review): judging by the test name
  // this presumably selects partial-input conversion -- confirm against the
  // helper's declaration.

  // A complete one-byte sequence converts normally.
  // U+0041 LATIN CAPITAL LETTER A
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
      "\x41", true));

  //
  // Sequences with one continuation byte missing
  //
  // Each input below is expected to report sourceExhausted and to produce
  // no scalars at all.
  static const char *const TruncatedInputs[] = {
      // Lead byte of a 2-byte sequence on its own.
      "\xc2",
      "\xdf",
      // 3-byte sequences cut off after the first continuation byte.
      "\xe0\xa0",
      "\xe0\xbf",
      "\xe1\x80",
      "\xec\xbf",
      "\xed\x80",
      "\xed\x9f",
      "\xee\x80",
      "\xef\xbf",
      // 4-byte sequences cut off after the second continuation byte.
      "\xf0\x90\x80",
      "\xf0\xbf\xbf",
      "\xf1\x80\x80",
      "\xf3\xbf\xbf",
      "\xf4\x80\x80",
      "\xf4\x8f\xbf",
  };
  for (const char *Input : TruncatedInputs) {
    EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
        ConvertUTFResultContainer(sourceExhausted), Input, true));
  }

  // A complete scalar followed by a truncated sequence: U+0041 is expected
  // in the output alongside the sourceExhausted result.
  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
      "\x41\xc2", true));
}