1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <stddef.h> 6 7 #include <string> 8 9 #include "base/macros.h" 10 #include "base/strings/string_piece.h" 11 #include "base/strings/sys_string_conversions.h" 12 #include "base/strings/utf_string_conversions.h" 13 #include "base/test/scoped_locale.h" 14 #include "build/build_config.h" 15 #include "testing/gtest/include/gtest/gtest.h" 16 17 #ifdef WCHAR_T_IS_UTF32 18 static const std::wstring kSysWideOldItalicLetterA = L"\x10300"; 19 #else 20 static const std::wstring kSysWideOldItalicLetterA = L"\xd800\xdf00"; 21 #endif 22 23 namespace base { 24 25 TEST(SysStrings, SysWideToUTF8) { 26 EXPECT_EQ("Hello, world", SysWideToUTF8(L"Hello, world")); 27 EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToUTF8(L"\x4f60\x597d")); 28 29 // >16 bits 30 EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToUTF8(kSysWideOldItalicLetterA)); 31 32 // Error case. When Windows finds a UTF-16 character going off the end of 33 // a string, it just converts that literal value to UTF-8, even though this 34 // is invalid. 35 // 36 // This is what XP does, but Vista has different behavior, so we don't bother 37 // verifying it: 38 // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw", 39 // SysWideToUTF8(L"\x4f60\xd800zyxw")); 40 41 // Test embedded NULLs. 42 std::wstring wide_null(L"a"); 43 wide_null.push_back(0); 44 wide_null.push_back('b'); 45 46 std::string expected_null("a"); 47 expected_null.push_back(0); 48 expected_null.push_back('b'); 49 50 EXPECT_EQ(expected_null, SysWideToUTF8(wide_null)); 51 } 52 53 TEST(SysStrings, SysUTF8ToWide) { 54 EXPECT_EQ(L"Hello, world", SysUTF8ToWide("Hello, world")); 55 EXPECT_EQ(L"\x4f60\x597d", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5\xbd")); 56 // >16 bits 57 EXPECT_EQ(kSysWideOldItalicLetterA, SysUTF8ToWide("\xF0\x90\x8C\x80")); 58 59 // Error case. When Windows finds an invalid UTF-8 character, it just skips 60 // it. This seems weird because it's inconsistent with the reverse conversion. 61 // 62 // This is what XP does, but Vista has different behavior, so we don't bother 63 // verifying it: 64 // EXPECT_EQ(L"\x4f60zyxw", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5zyxw")); 65 66 // Test embedded NULLs. 67 std::string utf8_null("a"); 68 utf8_null.push_back(0); 69 utf8_null.push_back('b'); 70 71 std::wstring expected_null(L"a"); 72 expected_null.push_back(0); 73 expected_null.push_back('b'); 74 75 EXPECT_EQ(expected_null, SysUTF8ToWide(utf8_null)); 76 } 77 78 #if defined(OS_LINUX) // Tests depend on setting a specific Linux locale. 79 80 TEST(SysStrings, SysWideToNativeMB) { 81 #if !defined(SYSTEM_NATIVE_UTF8) 82 ScopedLocale locale("en_US.UTF-8"); 83 #endif 84 EXPECT_EQ("Hello, world", SysWideToNativeMB(L"Hello, world")); 85 EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToNativeMB(L"\x4f60\x597d")); 86 87 // >16 bits 88 EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToNativeMB(kSysWideOldItalicLetterA)); 89 90 // Error case. When Windows finds a UTF-16 character going off the end of 91 // a string, it just converts that literal value to UTF-8, even though this 92 // is invalid. 93 // 94 // This is what XP does, but Vista has different behavior, so we don't bother 95 // verifying it: 96 // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw", 97 // SysWideToNativeMB(L"\x4f60\xd800zyxw")); 98 99 // Test embedded NULLs. 100 std::wstring wide_null(L"a"); 101 wide_null.push_back(0); 102 wide_null.push_back('b'); 103 104 std::string expected_null("a"); 105 expected_null.push_back(0); 106 expected_null.push_back('b'); 107 108 EXPECT_EQ(expected_null, SysWideToNativeMB(wide_null)); 109 } 110 111 // We assume the test is running in a UTF8 locale. 112 TEST(SysStrings, SysNativeMBToWide) { 113 #if !defined(SYSTEM_NATIVE_UTF8) 114 ScopedLocale locale("en_US.UTF-8"); 115 #endif 116 EXPECT_EQ(L"Hello, world", SysNativeMBToWide("Hello, world")); 117 EXPECT_EQ(L"\x4f60\x597d", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5\xbd")); 118 // >16 bits 119 EXPECT_EQ(kSysWideOldItalicLetterA, SysNativeMBToWide("\xF0\x90\x8C\x80")); 120 121 // Error case. When Windows finds an invalid UTF-8 character, it just skips 122 // it. This seems weird because it's inconsistent with the reverse conversion. 123 // 124 // This is what XP does, but Vista has different behavior, so we don't bother 125 // verifying it: 126 // EXPECT_EQ(L"\x4f60zyxw", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5zyxw")); 127 128 // Test embedded NULLs. 129 std::string utf8_null("a"); 130 utf8_null.push_back(0); 131 utf8_null.push_back('b'); 132 133 std::wstring expected_null(L"a"); 134 expected_null.push_back(0); 135 expected_null.push_back('b'); 136 137 EXPECT_EQ(expected_null, SysNativeMBToWide(utf8_null)); 138 } 139 140 static const wchar_t* const kConvertRoundtripCases[] = { 141 L"Google Video", 142 // " " 143 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", 144 // " " 145 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" 146 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", 147 // " " 148 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" 149 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" 150 L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", 151 // "" 152 L"\xc804\xccb4\xc11c\xbe44\xc2a4", 153 154 // Test characters that take more than 16 bits. This will depend on whether 155 // wchar_t is 16 or 32 bits. 156 #if defined(WCHAR_T_IS_UTF16) 157 L"\xd800\xdf00", 158 // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) 159 L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", 160 #elif defined(WCHAR_T_IS_UTF32) 161 L"\x10300", 162 // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) 163 L"\x11d40\x11d41\x11d42\x11d43\x11d44", 164 #endif 165 }; 166 167 168 TEST(SysStrings, SysNativeMBAndWide) { 169 #if !defined(SYSTEM_NATIVE_UTF8) 170 ScopedLocale locale("en_US.UTF-8"); 171 #endif 172 for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { 173 std::wstring wide = kConvertRoundtripCases[i]; 174 std::wstring trip = SysNativeMBToWide(SysWideToNativeMB(wide)); 175 EXPECT_EQ(wide.size(), trip.size()); 176 EXPECT_EQ(wide, trip); 177 } 178 179 // We assume our test is running in UTF-8, so double check through ICU. 180 for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { 181 std::wstring wide = kConvertRoundtripCases[i]; 182 std::wstring trip = SysNativeMBToWide(WideToUTF8(wide)); 183 EXPECT_EQ(wide.size(), trip.size()); 184 EXPECT_EQ(wide, trip); 185 } 186 187 for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { 188 std::wstring wide = kConvertRoundtripCases[i]; 189 std::wstring trip = UTF8ToWide(SysWideToNativeMB(wide)); 190 EXPECT_EQ(wide.size(), trip.size()); 191 EXPECT_EQ(wide, trip); 192 } 193 } 194 #endif // OS_LINUX 195 196 } // namespace base 197