Home | History | Annotate | Download | only in strings
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <stddef.h>
      6 
      7 #include <string>
      8 
      9 #include "base/macros.h"
     10 #include "base/strings/string_piece.h"
     11 #include "base/strings/sys_string_conversions.h"
     12 #include "base/strings/utf_string_conversions.h"
     13 #include "base/test/scoped_locale.h"
     14 #include "build/build_config.h"
     15 #include "testing/gtest/include/gtest/gtest.h"
     16 
     17 #ifdef WCHAR_T_IS_UTF32
     18 static const std::wstring kSysWideOldItalicLetterA = L"\x10300";
     19 #else
     20 static const std::wstring kSysWideOldItalicLetterA = L"\xd800\xdf00";
     21 #endif
     22 
     23 namespace base {
     24 
     25 TEST(SysStrings, SysWideToUTF8) {
     26   EXPECT_EQ("Hello, world", SysWideToUTF8(L"Hello, world"));
     27   EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToUTF8(L"\x4f60\x597d"));
     28 
     29   // >16 bits
     30   EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToUTF8(kSysWideOldItalicLetterA));
     31 
     32   // Error case. When Windows finds a UTF-16 character going off the end of
     33   // a string, it just converts that literal value to UTF-8, even though this
     34   // is invalid.
     35   //
     36   // This is what XP does, but Vista has different behavior, so we don't bother
     37   // verifying it:
     38   // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
     39   //           SysWideToUTF8(L"\x4f60\xd800zyxw"));
     40 
     41   // Test embedded NULLs.
     42   std::wstring wide_null(L"a");
     43   wide_null.push_back(0);
     44   wide_null.push_back('b');
     45 
     46   std::string expected_null("a");
     47   expected_null.push_back(0);
     48   expected_null.push_back('b');
     49 
     50   EXPECT_EQ(expected_null, SysWideToUTF8(wide_null));
     51 }
     52 
     53 TEST(SysStrings, SysUTF8ToWide) {
     54   EXPECT_EQ(L"Hello, world", SysUTF8ToWide("Hello, world"));
     55   EXPECT_EQ(L"\x4f60\x597d", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
     56   // >16 bits
     57   EXPECT_EQ(kSysWideOldItalicLetterA, SysUTF8ToWide("\xF0\x90\x8C\x80"));
     58 
     59   // Error case. When Windows finds an invalid UTF-8 character, it just skips
     60   // it. This seems weird because it's inconsistent with the reverse conversion.
     61   //
     62   // This is what XP does, but Vista has different behavior, so we don't bother
     63   // verifying it:
     64   // EXPECT_EQ(L"\x4f60zyxw", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));
     65 
     66   // Test embedded NULLs.
     67   std::string utf8_null("a");
     68   utf8_null.push_back(0);
     69   utf8_null.push_back('b');
     70 
     71   std::wstring expected_null(L"a");
     72   expected_null.push_back(0);
     73   expected_null.push_back('b');
     74 
     75   EXPECT_EQ(expected_null, SysUTF8ToWide(utf8_null));
     76 }
     77 
     78 #if defined(OS_LINUX)  // Tests depend on setting a specific Linux locale.
     79 
     80 TEST(SysStrings, SysWideToNativeMB) {
     81 #if !defined(SYSTEM_NATIVE_UTF8)
     82   ScopedLocale locale("en_US.UTF-8");
     83 #endif
     84   EXPECT_EQ("Hello, world", SysWideToNativeMB(L"Hello, world"));
     85   EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToNativeMB(L"\x4f60\x597d"));
     86 
     87   // >16 bits
     88   EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToNativeMB(kSysWideOldItalicLetterA));
     89 
     90   // Error case. When Windows finds a UTF-16 character going off the end of
     91   // a string, it just converts that literal value to UTF-8, even though this
     92   // is invalid.
     93   //
     94   // This is what XP does, but Vista has different behavior, so we don't bother
     95   // verifying it:
     96   // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
     97   //           SysWideToNativeMB(L"\x4f60\xd800zyxw"));
     98 
     99   // Test embedded NULLs.
    100   std::wstring wide_null(L"a");
    101   wide_null.push_back(0);
    102   wide_null.push_back('b');
    103 
    104   std::string expected_null("a");
    105   expected_null.push_back(0);
    106   expected_null.push_back('b');
    107 
    108   EXPECT_EQ(expected_null, SysWideToNativeMB(wide_null));
    109 }
    110 
    111 // We assume the test is running in a UTF8 locale.
    112 TEST(SysStrings, SysNativeMBToWide) {
    113 #if !defined(SYSTEM_NATIVE_UTF8)
    114   ScopedLocale locale("en_US.UTF-8");
    115 #endif
    116   EXPECT_EQ(L"Hello, world", SysNativeMBToWide("Hello, world"));
    117   EXPECT_EQ(L"\x4f60\x597d", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
    118   // >16 bits
    119   EXPECT_EQ(kSysWideOldItalicLetterA, SysNativeMBToWide("\xF0\x90\x8C\x80"));
    120 
    121   // Error case. When Windows finds an invalid UTF-8 character, it just skips
    122   // it. This seems weird because it's inconsistent with the reverse conversion.
    123   //
    124   // This is what XP does, but Vista has different behavior, so we don't bother
    125   // verifying it:
    126   // EXPECT_EQ(L"\x4f60zyxw", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));
    127 
    128   // Test embedded NULLs.
    129   std::string utf8_null("a");
    130   utf8_null.push_back(0);
    131   utf8_null.push_back('b');
    132 
    133   std::wstring expected_null(L"a");
    134   expected_null.push_back(0);
    135   expected_null.push_back('b');
    136 
    137   EXPECT_EQ(expected_null, SysNativeMBToWide(utf8_null));
    138 }
    139 
    140 static const wchar_t* const kConvertRoundtripCases[] = {
    141   L"Google Video",
    142   // "   "
    143   L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
    144   //  " "
    145   L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
    146   L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
    147   // "   "
    148   L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
    149   L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
    150   L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c",
    151   // ""
    152   L"\xc804\xccb4\xc11c\xbe44\xc2a4",
    153 
    154   // Test characters that take more than 16 bits. This will depend on whether
    155   // wchar_t is 16 or 32 bits.
    156 #if defined(WCHAR_T_IS_UTF16)
    157   L"\xd800\xdf00",
    158   // ?????  (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
    159   L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
    160 #elif defined(WCHAR_T_IS_UTF32)
    161   L"\x10300",
    162   // ?????  (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
    163   L"\x11d40\x11d41\x11d42\x11d43\x11d44",
    164 #endif
    165 };
    166 
    167 
    168 TEST(SysStrings, SysNativeMBAndWide) {
    169 #if !defined(SYSTEM_NATIVE_UTF8)
    170   ScopedLocale locale("en_US.UTF-8");
    171 #endif
    172   for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) {
    173     std::wstring wide = kConvertRoundtripCases[i];
    174     std::wstring trip = SysNativeMBToWide(SysWideToNativeMB(wide));
    175     EXPECT_EQ(wide.size(), trip.size());
    176     EXPECT_EQ(wide, trip);
    177   }
    178 
    179   // We assume our test is running in UTF-8, so double check through ICU.
    180   for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) {
    181     std::wstring wide = kConvertRoundtripCases[i];
    182     std::wstring trip = SysNativeMBToWide(WideToUTF8(wide));
    183     EXPECT_EQ(wide.size(), trip.size());
    184     EXPECT_EQ(wide, trip);
    185   }
    186 
    187   for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) {
    188     std::wstring wide = kConvertRoundtripCases[i];
    189     std::wstring trip = UTF8ToWide(SysWideToNativeMB(wide));
    190     EXPECT_EQ(wide.size(), trip.size());
    191     EXPECT_EQ(wide, trip);
    192   }
    193 }
    194 #endif  // OS_LINUX
    195 
    196 }  // namespace base
    197