Home | History | Annotate | Download | only in strings
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <string>
      6 
      7 #include "base/basictypes.h"
      8 #include "base/strings/string_piece.h"
      9 #include "base/strings/sys_string_conversions.h"
     10 #include "base/strings/utf_string_conversions.h"
     11 #include "base/test/scoped_locale.h"
     12 #include "testing/gtest/include/gtest/gtest.h"
     13 
     14 #ifdef WCHAR_T_IS_UTF32
     15 static const std::wstring kSysWideOldItalicLetterA = L"\x10300";
     16 #else
     17 static const std::wstring kSysWideOldItalicLetterA = L"\xd800\xdf00";
     18 #endif
     19 
     20 namespace base {
     21 
     22 TEST(SysStrings, SysWideToUTF8) {
     23   EXPECT_EQ("Hello, world", SysWideToUTF8(L"Hello, world"));
     24   EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToUTF8(L"\x4f60\x597d"));
     25 
     26   // >16 bits
     27   EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToUTF8(kSysWideOldItalicLetterA));
     28 
     29   // Error case. When Windows finds a UTF-16 character going off the end of
     30   // a string, it just converts that literal value to UTF-8, even though this
     31   // is invalid.
     32   //
     33   // This is what XP does, but Vista has different behavior, so we don't bother
     34   // verifying it:
     35   // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
     36   //           SysWideToUTF8(L"\x4f60\xd800zyxw"));
     37 
     38   // Test embedded NULLs.
     39   std::wstring wide_null(L"a");
     40   wide_null.push_back(0);
     41   wide_null.push_back('b');
     42 
     43   std::string expected_null("a");
     44   expected_null.push_back(0);
     45   expected_null.push_back('b');
     46 
     47   EXPECT_EQ(expected_null, SysWideToUTF8(wide_null));
     48 }
     49 
     50 TEST(SysStrings, SysUTF8ToWide) {
     51   EXPECT_EQ(L"Hello, world", SysUTF8ToWide("Hello, world"));
     52   EXPECT_EQ(L"\x4f60\x597d", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
     53   // >16 bits
     54   EXPECT_EQ(kSysWideOldItalicLetterA, SysUTF8ToWide("\xF0\x90\x8C\x80"));
     55 
     56   // Error case. When Windows finds an invalid UTF-8 character, it just skips
     57   // it. This seems weird because it's inconsistent with the reverse conversion.
     58   //
     59   // This is what XP does, but Vista has different behavior, so we don't bother
     60   // verifying it:
     61   // EXPECT_EQ(L"\x4f60zyxw", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));
     62 
     63   // Test embedded NULLs.
     64   std::string utf8_null("a");
     65   utf8_null.push_back(0);
     66   utf8_null.push_back('b');
     67 
     68   std::wstring expected_null(L"a");
     69   expected_null.push_back(0);
     70   expected_null.push_back('b');
     71 
     72   EXPECT_EQ(expected_null, SysUTF8ToWide(utf8_null));
     73 }
     74 
     75 #if defined(OS_LINUX)  // Tests depend on setting a specific Linux locale.
     76 
     77 TEST(SysStrings, SysWideToNativeMB) {
     78   ScopedLocale locale("en_US.utf-8");
     79   EXPECT_EQ("Hello, world", SysWideToNativeMB(L"Hello, world"));
     80   EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToNativeMB(L"\x4f60\x597d"));
     81 
     82   // >16 bits
     83   EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToNativeMB(kSysWideOldItalicLetterA));
     84 
     85   // Error case. When Windows finds a UTF-16 character going off the end of
     86   // a string, it just converts that literal value to UTF-8, even though this
     87   // is invalid.
     88   //
     89   // This is what XP does, but Vista has different behavior, so we don't bother
     90   // verifying it:
     91   // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
     92   //           SysWideToNativeMB(L"\x4f60\xd800zyxw"));
     93 
     94   // Test embedded NULLs.
     95   std::wstring wide_null(L"a");
     96   wide_null.push_back(0);
     97   wide_null.push_back('b');
     98 
     99   std::string expected_null("a");
    100   expected_null.push_back(0);
    101   expected_null.push_back('b');
    102 
    103   EXPECT_EQ(expected_null, SysWideToNativeMB(wide_null));
    104 }
    105 
    106 // We assume the test is running in a UTF8 locale.
    107 TEST(SysStrings, SysNativeMBToWide) {
    108   ScopedLocale locale("en_US.utf-8");
    109   EXPECT_EQ(L"Hello, world", SysNativeMBToWide("Hello, world"));
    110   EXPECT_EQ(L"\x4f60\x597d", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
    111   // >16 bits
    112   EXPECT_EQ(kSysWideOldItalicLetterA, SysNativeMBToWide("\xF0\x90\x8C\x80"));
    113 
    114   // Error case. When Windows finds an invalid UTF-8 character, it just skips
    115   // it. This seems weird because it's inconsistent with the reverse conversion.
    116   //
    117   // This is what XP does, but Vista has different behavior, so we don't bother
    118   // verifying it:
    119   // EXPECT_EQ(L"\x4f60zyxw", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));
    120 
    121   // Test embedded NULLs.
    122   std::string utf8_null("a");
    123   utf8_null.push_back(0);
    124   utf8_null.push_back('b');
    125 
    126   std::wstring expected_null(L"a");
    127   expected_null.push_back(0);
    128   expected_null.push_back('b');
    129 
    130   EXPECT_EQ(expected_null, SysNativeMBToWide(utf8_null));
    131 }
    132 
    133 static const wchar_t* const kConvertRoundtripCases[] = {
    134   L"Google Video",
    135   // "   "
    136   L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
    137   //  " "
    138   L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
    139   L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
    140   // "   "
    141   L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
    142   L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
    143   L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c",
    144   // ""
    145   L"\xc804\xccb4\xc11c\xbe44\xc2a4",
    146 
    147   // Test characters that take more than 16 bits. This will depend on whether
    148   // wchar_t is 16 or 32 bits.
    149 #if defined(WCHAR_T_IS_UTF16)
    150   L"\xd800\xdf00",
    151   // ?????  (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
    152   L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
    153 #elif defined(WCHAR_T_IS_UTF32)
    154   L"\x10300",
    155   // ?????  (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
    156   L"\x11d40\x11d41\x11d42\x11d43\x11d44",
    157 #endif
    158 };
    159 
    160 
    161 TEST(SysStrings, SysNativeMBAndWide) {
    162   ScopedLocale locale("en_US.utf-8");
    163   for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) {
    164     std::wstring wide = kConvertRoundtripCases[i];
    165     std::wstring trip = SysNativeMBToWide(SysWideToNativeMB(wide));
    166     EXPECT_EQ(wide.size(), trip.size());
    167     EXPECT_EQ(wide, trip);
    168   }
    169 
    170   // We assume our test is running in UTF-8, so double check through ICU.
    171   for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) {
    172     std::wstring wide = kConvertRoundtripCases[i];
    173     std::wstring trip = SysNativeMBToWide(WideToUTF8(wide));
    174     EXPECT_EQ(wide.size(), trip.size());
    175     EXPECT_EQ(wide, trip);
    176   }
    177 
    178   for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) {
    179     std::wstring wide = kConvertRoundtripCases[i];
    180     std::wstring trip = UTF8ToWide(SysWideToNativeMB(wide));
    181     EXPECT_EQ(wide.size(), trip.size());
    182     EXPECT_EQ(wide, trip);
    183   }
    184 }
    185 #endif  // OS_LINUX
    186 
    187 }  // namespace base
    188