Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      #include <algorithm>
      #include <string>
      #include <vector>

      #include "net/base/escape.h"

      #include "base/basictypes.h"
      #include "base/i18n/icu_string_conversions.h"
      #include "base/strings/string_util.h"
      #include "base/strings/stringprintf.h"
      #include "base/strings/utf_string_conversions.h"
      #include "testing/gtest/include/gtest/gtest.h"
     16 
     17 namespace net {
     18 namespace {
     19 
     20 const size_t kNpos = base::string16::npos;  // string16 "no position" value; used in the expected offsets of the AdjustEncodingOffset test.
     21 
     22 struct EscapeCase {  // One row of an escaping table: raw text and its expected escaped form.
     23   const char* input;  // Text the test loop passes to EscapeQueryParamValue().
     24   const char* output;  // Value the test expects EscapeQueryParamValue() to return.
     25 };
     26 
     27 struct UnescapeURLCase {  // Wide-string row for the string16 UnescapeURLComponent() test.
     28   const wchar_t* input;  // Escaped text; the test converts it with WideToUTF16() before the call.
     29   UnescapeRule::Type rules;  // Rule flags passed as the second argument of the call.
     30   const wchar_t* output;  // Expected result; compared after WideToUTF16() conversion.
     31 };
     32 
     33 struct UnescapeURLCaseASCII {  // Narrow-string row for the std::string UnescapeURLComponent() test.
     34   const char* input;  // Escaped text; wrapped in a std::string before the call.
     35   UnescapeRule::Type rules;  // Rule flags passed as the second argument of the call.
     36   const char* output;  // Expected result; wrapped in a std::string for the comparison.
     37 };
     38 
     39 struct UnescapeAndDecodeCase {  // Row for the UnescapeAndDecodeUTF8URLComponent test.
     40   const char* input;  // Escaped text fed to every call made for this row.
     41 
     42   // Expected output of UnescapeURLComponent() with UnescapeRule::NORMAL.
     43   const char* url_unescaped;
     44 
     45   // Expected output of UnescapeURLComponent() with REPLACE_PLUS_WITH_SPACE.
     46   const char* query_unescaped;
     47 
     48   // Expected output of UnescapeAndDecodeUTF8URLComponent() with NORMAL rules.
     49   const wchar_t* decoded;
     50 };
     51 
     52 struct AdjustOffsetCase {  // Row for the AdjustOffset test.
     53   const char* input;  // Escaped text passed to UnescapeAndDecodeUTF8URLComponent().
     54   size_t input_offset;  // Offset value handed to the call through its pointer argument.
     55   size_t output_offset;  // Value the offset is expected to hold after the call.
     56 };
     57 
     58 struct EscapeForHTMLCase {  // Row shared by the EscapeForHTML and UnescapeForHTML tests.
     59   const char* input;  // Text passed to the function under test.
     60   const char* expected_output;  // Result the test expects back.
     61 };
     62 
     63 TEST(EscapeTest, EscapeTextForFormSubmission) {
     64   const EscapeCase escape_cases[] = {
     65     {"foo", "foo"},
     66     {"foo bar", "foo+bar"},
     67     {"foo++", "foo%2B%2B"}
     68   };
     69   for (size_t i = 0; i < arraysize(escape_cases); ++i) {
     70     EscapeCase value = escape_cases[i];
     71     EXPECT_EQ(value.output, EscapeQueryParamValue(value.input, true));
     72   }
     73 
     74   const EscapeCase escape_cases_no_plus[] = {
     75     {"foo", "foo"},
     76     {"foo bar", "foo%20bar"},
     77     {"foo++", "foo%2B%2B"}
     78   };
     79   for (size_t i = 0; i < arraysize(escape_cases_no_plus); ++i) {
     80     EscapeCase value = escape_cases_no_plus[i];
     81     EXPECT_EQ(value.output, EscapeQueryParamValue(value.input, false));
     82   }
     83 
     84   // Test all the values in we're supposed to be escaping.
     85   const std::string no_escape(
     86     "abcdefghijklmnopqrstuvwxyz"
     87     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     88     "0123456789"
     89     "!'()*-._~");
     90   for (int i = 0; i < 256; ++i) {
     91     std::string in;
     92     in.push_back(i);
     93     std::string out = EscapeQueryParamValue(in, true);
     94     if (0 == i) {
     95       EXPECT_EQ(out, std::string("%00"));
     96     } else if (32 == i) {
     97       // Spaces are plus escaped like web forms.
     98       EXPECT_EQ(out, std::string("+"));
     99     } else if (no_escape.find(in) == std::string::npos) {
    100       // Check %hex escaping
    101       std::string expected = base::StringPrintf("%%%02X", i);
    102       EXPECT_EQ(expected, out);
    103     } else {
    104       // No change for things in the no_escape list.
    105       EXPECT_EQ(out, in);
    106     }
    107   }
    108 }
    109 
    110 TEST(EscapeTest, EscapePath) {
    111   ASSERT_EQ(
    112     // Most of the character space we care about, un-escaped
    113     EscapePath(
    114       "\x02\n\x1d !\"#$%&'()*+,-./0123456789:;"
    115       "<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    116       "[\\]^_`abcdefghijklmnopqrstuvwxyz"
    117       "{|}~\x7f\x80\xff"),
    118     // Escaped
    119     "%02%0A%1D%20!%22%23$%25&'()*+,-./0123456789%3A;"
    120     "%3C=%3E%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    121     "%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz"
    122     "%7B%7C%7D~%7F%80%FF");
    123 }
    124 
    125 TEST(EscapeTest, EscapeUrlEncodedData) {
    126   ASSERT_EQ(
    127     // Most of the character space we care about, un-escaped
    128     EscapeUrlEncodedData(
    129       "\x02\n\x1d !\"#$%&'()*+,-./0123456789:;"
    130       "<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    131       "[\\]^_`abcdefghijklmnopqrstuvwxyz"
    132       "{|}~\x7f\x80\xff", true),
    133     // Escaped
    134     "%02%0A%1D+!%22%23%24%25%26%27()*%2B,-./0123456789:%3B"
    135     "%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    136     "%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz"
    137     "%7B%7C%7D~%7F%80%FF");
    138 }
    139 
    140 TEST(EscapeTest, EscapeUrlEncodedDataSpace) {
    141   ASSERT_EQ(EscapeUrlEncodedData("a b", true), "a+b");
    142   ASSERT_EQ(EscapeUrlEncodedData("a b", false), "a%20b");
    143 }
    144 
    145 TEST(EscapeTest, UnescapeURLComponentASCII) {
    146   const UnescapeURLCaseASCII unescape_cases[] = {
    147     {"", UnescapeRule::NORMAL, ""},
    148     {"%2", UnescapeRule::NORMAL, "%2"},
    149     {"%%%%%%", UnescapeRule::NORMAL, "%%%%%%"},
    150     {"Don't escape anything", UnescapeRule::NORMAL, "Don't escape anything"},
    151     {"Invalid %escape %2", UnescapeRule::NORMAL, "Invalid %escape %2"},
    152     {"Some%20random text %25%2dOK", UnescapeRule::NONE,
    153      "Some%20random text %25%2dOK"},
    154     {"Some%20random text %25%2dOK", UnescapeRule::NORMAL,
    155      "Some%20random text %25-OK"},
    156     {"Some%20random text %25%2dOK", UnescapeRule::SPACES,
    157      "Some random text %25-OK"},
    158     {"Some%20random text %25%2dOK", UnescapeRule::URL_SPECIAL_CHARS,
    159      "Some%20random text %-OK"},
    160     {"Some%20random text %25%2dOK",
    161      UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
    162      "Some random text %-OK"},
    163     {"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, "\xA0\xB1\xC2\xD3\xE4\xF5"},
    164     {"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, "\xAa\xBb\xCc\xDd\xEe\xFf"},
    165     // Certain URL-sensitive characters should not be unescaped unless asked.
    166     {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::SPACES,
    167      "Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"},
    168     {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+",
    169      UnescapeRule::URL_SPECIAL_CHARS,
    170      "Hello%20%13%10world ## ?? == && %% ++"},
    171     // We can neither escape nor unescape '@' since some websites expect it to
    172     // be preserved as either '@' or "%40".
    173     // See http://b/996720 and http://crbug.com/23933 .
    174     {"me@my%40example", UnescapeRule::NORMAL, "me@my%40example"},
    175     // Control characters.
    176     {"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::URL_SPECIAL_CHARS,
    177      "%01%02%03%04%05%06%07%08%09 %"},
    178     {"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::CONTROL_CHARS,
    179      "\x01\x02\x03\x04\x05\x06\x07\x08\x09 %25"},
    180     {"Hello%20%13%10%02", UnescapeRule::SPACES, "Hello %13%10%02"},
    181     {"Hello%20%13%10%02", UnescapeRule::CONTROL_CHARS, "Hello%20\x13\x10\x02"},
    182   };
    183 
    184   for (size_t i = 0; i < arraysize(unescape_cases); i++) {
    185     std::string str(unescape_cases[i].input);
    186     EXPECT_EQ(std::string(unescape_cases[i].output),
    187               UnescapeURLComponent(str, unescape_cases[i].rules));
    188   }
    189 
    190   // Test the NULL character unescaping (which wouldn't work above since those
    191   // are just char pointers).
    192   std::string input("Null");
    193   input.push_back(0);  // Also have a NULL in the input.
    194   input.append("%00%39Test");
    195 
    196   // When we're unescaping NULLs
    197   std::string expected("Null");
    198   expected.push_back(0);
    199   expected.push_back(0);
    200   expected.append("9Test");
    201   EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::CONTROL_CHARS));
    202 
    203   // When we're not unescaping NULLs.
    204   expected = "Null";
    205   expected.push_back(0);
    206   expected.append("%009Test");
    207   EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL));
    208 }
    209 
    210 TEST(EscapeTest, UnescapeURLComponent) {
    211   const UnescapeURLCase unescape_cases[] = {
    212     {L"", UnescapeRule::NORMAL, L""},
    213     {L"%2", UnescapeRule::NORMAL, L"%2"},
    214     {L"%%%%%%", UnescapeRule::NORMAL, L"%%%%%%"},
    215     {L"Don't escape anything", UnescapeRule::NORMAL, L"Don't escape anything"},
    216     {L"Invalid %escape %2", UnescapeRule::NORMAL, L"Invalid %escape %2"},
    217     {L"Some%20random text %25%2dOK", UnescapeRule::NONE,
    218      L"Some%20random text %25%2dOK"},
    219     {L"Some%20random text %25%2dOK", UnescapeRule::NORMAL,
    220      L"Some%20random text %25-OK"},
    221     {L"Some%20random text %25%2dOK", UnescapeRule::SPACES,
    222      L"Some random text %25-OK"},
    223     {L"Some%20random text %25%2dOK", UnescapeRule::URL_SPECIAL_CHARS,
    224      L"Some%20random text %-OK"},
    225     {L"Some%20random text %25%2dOK",
    226      UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
    227      L"Some random text %-OK"},
    228     {L"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, L"\xA0\xB1\xC2\xD3\xE4\xF5"},
    229     {L"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, L"\xAa\xBb\xCc\xDd\xEe\xFf"},
    230     // Certain URL-sensitive characters should not be unescaped unless asked.
    231     {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::SPACES,
    232      L"Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"},
    233     {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+",
    234      UnescapeRule::URL_SPECIAL_CHARS,
    235      L"Hello%20%13%10world ## ?? == && %% ++"},
    236     // We can neither escape nor unescape '@' since some websites expect it to
    237     // be preserved as either '@' or "%40".
    238     // See http://b/996720 and http://crbug.com/23933 .
    239     {L"me@my%40example", UnescapeRule::NORMAL, L"me@my%40example"},
    240     // Control characters.
    241     {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::URL_SPECIAL_CHARS,
    242      L"%01%02%03%04%05%06%07%08%09 %"},
    243     {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::CONTROL_CHARS,
    244      L"\x01\x02\x03\x04\x05\x06\x07\x08\x09 %25"},
    245     {L"Hello%20%13%10%02", UnescapeRule::SPACES, L"Hello %13%10%02"},
    246     {L"Hello%20%13%10%02", UnescapeRule::CONTROL_CHARS,
    247      L"Hello%20\x13\x10\x02"},
    248     {L"Hello\x9824\x9827", UnescapeRule::CONTROL_CHARS,
    249      L"Hello\x9824\x9827"},
    250   };
    251 
    252   for (size_t i = 0; i < arraysize(unescape_cases); i++) {
    253     base::string16 str(WideToUTF16(unescape_cases[i].input));
    254     EXPECT_EQ(WideToUTF16(unescape_cases[i].output),
    255               UnescapeURLComponent(str, unescape_cases[i].rules));
    256   }
    257 
    258   // Test the NULL character unescaping (which wouldn't work above since those
    259   // are just char pointers).
    260   base::string16 input(WideToUTF16(L"Null"));
    261   input.push_back(0);  // Also have a NULL in the input.
    262   input.append(WideToUTF16(L"%00%39Test"));
    263 
    264   // When we're unescaping NULLs
    265   base::string16 expected(WideToUTF16(L"Null"));
    266   expected.push_back(0);
    267   expected.push_back(0);
    268   expected.append(ASCIIToUTF16("9Test"));
    269   EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::CONTROL_CHARS));
    270 
    271   // When we're not unescaping NULLs.
    272   expected = WideToUTF16(L"Null");
    273   expected.push_back(0);
    274   expected.append(WideToUTF16(L"%009Test"));
    275   EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL));
    276 }
    277 
    278 TEST(EscapeTest, UnescapeAndDecodeUTF8URLComponent) {
    279   const UnescapeAndDecodeCase unescape_cases[] = {
    280     { "%",
    281       "%",
    282       "%",
    283      L"%"},
    284     { "+",
    285       "+",
    286       " ",
    287      L"+"},
    288     { "%2+",
    289       "%2+",
    290       "%2 ",
    291      L"%2+"},
    292     { "+%%%+%%%",
    293       "+%%%+%%%",
    294       " %%% %%%",
    295      L"+%%%+%%%"},
    296     { "Don't escape anything",
    297       "Don't escape anything",
    298       "Don't escape anything",
    299      L"Don't escape anything"},
    300     { "+Invalid %escape %2+",
    301       "+Invalid %escape %2+",
    302       " Invalid %escape %2 ",
    303      L"+Invalid %escape %2+"},
    304     { "Some random text %25%2dOK",
    305       "Some random text %25-OK",
    306       "Some random text %25-OK",
    307      L"Some random text %25-OK"},
    308     { "%01%02%03%04%05%06%07%08%09",
    309       "%01%02%03%04%05%06%07%08%09",
    310       "%01%02%03%04%05%06%07%08%09",
    311      L"%01%02%03%04%05%06%07%08%09"},
    312     { "%E4%BD%A0+%E5%A5%BD",
    313       "\xE4\xBD\xA0+\xE5\xA5\xBD",
    314       "\xE4\xBD\xA0 \xE5\xA5\xBD",
    315      L"\x4f60+\x597d"},
    316     { "%ED%ED",  // Invalid UTF-8.
    317       "\xED\xED",
    318       "\xED\xED",
    319      L"%ED%ED"},  // Invalid UTF-8 -> kept unescaped.
    320   };
    321 
    322   for (size_t i = 0; i < arraysize(unescape_cases); i++) {
    323     std::string unescaped = UnescapeURLComponent(unescape_cases[i].input,
    324                                                  UnescapeRule::NORMAL);
    325     EXPECT_EQ(std::string(unescape_cases[i].url_unescaped), unescaped);
    326 
    327     unescaped = UnescapeURLComponent(unescape_cases[i].input,
    328                                      UnescapeRule::REPLACE_PLUS_WITH_SPACE);
    329     EXPECT_EQ(std::string(unescape_cases[i].query_unescaped), unescaped);
    330 
    331     // TODO: Need to test unescape_spaces and unescape_percent.
    332     base::string16 decoded = UnescapeAndDecodeUTF8URLComponent(
    333         unescape_cases[i].input, UnescapeRule::NORMAL, NULL);
    334     EXPECT_EQ(WideToUTF16(unescape_cases[i].decoded), decoded);
    335   }
    336 }
    337 
    338 TEST(EscapeTest, AdjustOffset) {
    339   const AdjustOffsetCase adjust_cases[] = {
    340     {"", 0, std::string::npos},
    341     {"test", 0, 0},
    342     {"test", 2, 2},
    343     {"test", 4, std::string::npos},
    344     {"test", std::string::npos, std::string::npos},
    345     {"%2dtest", 6, 4},
    346     {"%2dtest", 2, std::string::npos},
    347     {"test%2d", 2, 2},
    348     {"%E4%BD%A0+%E5%A5%BD", 9, 1},
    349     {"%E4%BD%A0+%E5%A5%BD", 6, std::string::npos},
    350     {"%ED%B0%80+%E5%A5%BD", 6, 6},
    351   };
    352 
    353   for (size_t i = 0; i < arraysize(adjust_cases); i++) {
    354     size_t offset = adjust_cases[i].input_offset;
    355     UnescapeAndDecodeUTF8URLComponent(adjust_cases[i].input,
    356                                       UnescapeRule::NORMAL, &offset);
    357     EXPECT_EQ(adjust_cases[i].output_offset, offset);
    358   }
    359 }
    360 
    361 TEST(EscapeTest, EscapeForHTML) {
    362   const EscapeForHTMLCase tests[] = {
    363     { "hello", "hello" },
    364     { "<hello>", "&lt;hello&gt;" },
    365     { "don\'t mess with me", "don&#39;t mess with me" },
    366   };
    367   for (size_t i = 0; i < arraysize(tests); ++i) {
    368     std::string result = EscapeForHTML(std::string(tests[i].input));
    369     EXPECT_EQ(std::string(tests[i].expected_output), result);
    370   }
    371 }
    372 
    373 TEST(EscapeTest, UnescapeForHTML) {
    374   const EscapeForHTMLCase tests[] = {
    375     { "", "" },
    376     { "&lt;hello&gt;", "<hello>" },
    377     { "don&#39;t mess with me", "don\'t mess with me" },
    378     { "&lt;&gt;&amp;&quot;&#39;", "<>&\"'" },
    379     { "& lt; &amp ; &; '", "& lt; &amp ; &; '" },
    380     { "&amp;", "&" },
    381     { "&quot;", "\"" },
    382     { "&#39;", "'" },
    383     { "&lt;", "<" },
    384     { "&gt;", ">" },
    385     { "&amp; &", "& &" },
    386   };
    387   for (size_t i = 0; i < arraysize(tests); ++i) {
    388     base::string16 result = UnescapeForHTML(ASCIIToUTF16(tests[i].input));
    389     EXPECT_EQ(ASCIIToUTF16(tests[i].expected_output), result);
    390   }
    391 }
    392 
    393 TEST(EscapeTest, AdjustEncodingOffset) {
    394   // Imagine we have strings as shown in the following cases where the
    395   // %XX's represent encoded characters
    396 
    397   // 1: abc%ECdef ==> abcXdef
    398   std::vector<size_t> offsets;
    399   for (size_t t = 0; t < 9; ++t)
    400     offsets.push_back(t);
    401   internal::AdjustEncodingOffset::Adjustments adjustments;
    402   adjustments.push_back(3);
    403   std::for_each(offsets.begin(), offsets.end(),
    404                 internal::AdjustEncodingOffset(adjustments));
    405   size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6};
    406   EXPECT_EQ(offsets.size(), arraysize(expected_1));
    407   for (size_t i = 0; i < arraysize(expected_1); ++i)
    408     EXPECT_EQ(expected_1[i], offsets[i]);
    409 
    410 
    411   // 2: %ECabc%EC%ECdef%EC ==> XabcXXdefX
    412   offsets.clear();
    413   for (size_t t = 0; t < 18; ++t)
    414     offsets.push_back(t);
    415   adjustments.clear();
    416   adjustments.push_back(0);
    417   adjustments.push_back(6);
    418   adjustments.push_back(9);
    419   adjustments.push_back(15);
    420   std::for_each(offsets.begin(), offsets.end(),
    421                 internal::AdjustEncodingOffset(adjustments));
    422   size_t expected_2[] = {0, kNpos, kNpos, 1, 2, 3, 4, kNpos, kNpos, 5, kNpos,
    423                          kNpos, 6, 7, 8, 9, kNpos, kNpos};
    424   EXPECT_EQ(offsets.size(), arraysize(expected_2));
    425   for (size_t i = 0; i < arraysize(expected_2); ++i)
    426     EXPECT_EQ(expected_2[i], offsets[i]);
    427 }
    428 
    429 }  // namespace
    430 }  // namespace net
    431