Home | History | Annotate | Download | only in url
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/macros.h"
      6 #include "testing/gtest/include/gtest/gtest.h"
      7 #include "third_party/icu/source/common/unicode/ucnv.h"
      8 #include "url/url_canon.h"
      9 #include "url/url_canon_icu.h"
     10 #include "url/url_canon_stdstring.h"
     11 #include "url/url_test_utils.h"
     12 
     13 // Some implementations of base/basictypes.h may define ARRAYSIZE.
     14 // If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
     15 // which is in our version of basictypes.h.
     16 #ifndef ARRAYSIZE
     17 #define ARRAYSIZE ARRAYSIZE_UNSAFE
     18 #endif
     19 
     20 namespace url {
     21 
     22 using test_utils::WStringToUTF16;
     23 
     24 namespace {
     25 
     26 // Wrapper around a UConverter object that managers creation and destruction.
     27 class UConvScoper {
     28  public:
     29   explicit UConvScoper(const char* charset_name) {
     30     UErrorCode err = U_ZERO_ERROR;
     31     converter_ = ucnv_open(charset_name, &err);
     32   }
     33 
     34   ~UConvScoper() {
     35     if (converter_)
     36       ucnv_close(converter_);
     37   }
     38 
     39   // Returns the converter object, may be NULL.
     40   UConverter* converter() const { return converter_; }
     41 
     42  private:
     43   UConverter* converter_;
     44 };
     45 
     46 TEST(URLCanonIcuTest, ICUCharsetConverter) {
     47   struct ICUCase {
     48     const wchar_t* input;
     49     const char* encoding;
     50     const char* expected;
     51   } icu_cases[] = {
     52       // UTF-8.
     53     {L"Hello, world", "utf-8", "Hello, world"},
     54     {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
     55       // Non-BMP UTF-8.
     56     {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
     57       // Big5
     58     {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
     59       // Unrepresentable character in the destination set.
     60     {L"hello\x4f60\x06de\x597dworld", "big5",
     61       "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
     62   };
     63 
     64   for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) {
     65     UConvScoper conv(icu_cases[i].encoding);
     66     ASSERT_TRUE(conv.converter() != NULL);
     67     ICUCharsetConverter converter(conv.converter());
     68 
     69     std::string str;
     70     StdStringCanonOutput output(&str);
     71 
     72     base::string16 input_str(WStringToUTF16(icu_cases[i].input));
     73     int input_len = static_cast<int>(input_str.length());
     74     converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
     75     output.Complete();
     76 
     77     EXPECT_STREQ(icu_cases[i].expected, str.c_str());
     78   }
     79 
     80   // Test string sizes around the resize boundary for the output to make sure
     81   // the converter resizes as needed.
     82   const int static_size = 16;
     83   UConvScoper conv("utf-8");
     84   ASSERT_TRUE(conv.converter());
     85   ICUCharsetConverter converter(conv.converter());
     86   for (int i = static_size - 2; i <= static_size + 2; i++) {
     87     // Make a string with the appropriate length.
     88     base::string16 input;
     89     for (int ch = 0; ch < i; ch++)
     90       input.push_back('a');
     91 
     92     RawCanonOutput<static_size> output;
     93     converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
     94                                &output);
     95     EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
     96   }
     97 }
     98 
     99 TEST(URLCanonIcuTest, QueryWithConverter) {
    100   struct QueryCase {
    101     const char* input8;
    102     const wchar_t* input16;
    103     const char* encoding;
    104     const char* expected;
    105   } query_cases[] = {
    106       // Regular ASCII case in some different encodings.
    107     {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
    108     {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
    109     {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
    110       // Chinese input/output
    111     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
    112       "?q=%C4%E3%BA%C3"},
    113     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
    114       // Unencodable character in the destination character set should be
    115       // escaped. The escape sequence unescapes to be the entity name:
    116       // "?q=&#20320;"
    117     {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
    118       "?q=Chinese%26%2365319%3B"},
    119   };
    120 
    121   for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) {
    122     Component out_comp;
    123 
    124     UConvScoper conv(query_cases[i].encoding);
    125     ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
    126     ICUCharsetConverter converter(conv.converter());
    127 
    128     if (query_cases[i].input8) {
    129       int len = static_cast<int>(strlen(query_cases[i].input8));
    130       Component in_comp(0, len);
    131       std::string out_str;
    132 
    133       StdStringCanonOutput output(&out_str);
    134       CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
    135                         &out_comp);
    136       output.Complete();
    137 
    138       EXPECT_EQ(query_cases[i].expected, out_str);
    139     }
    140 
    141     if (query_cases[i].input16) {
    142       base::string16 input16(WStringToUTF16(query_cases[i].input16));
    143       int len = static_cast<int>(input16.length());
    144       Component in_comp(0, len);
    145       std::string out_str;
    146 
    147       StdStringCanonOutput output(&out_str);
    148       CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
    149                         &out_comp);
    150       output.Complete();
    151 
    152       EXPECT_EQ(query_cases[i].expected, out_str);
    153     }
    154   }
    155 
    156   // Extra test for input with embedded NULL;
    157   std::string out_str;
    158   StdStringCanonOutput output(&out_str);
    159   Component out_comp;
    160   CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
    161   output.Complete();
    162   EXPECT_EQ("?a%20%00z%01", out_str);
    163 }
    164 
    165 }  // namespace
    166 
    167 }  // namespace url
    168