1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <algorithm> 6 #include <string> 7 8 #include "net/base/escape.h" 9 10 #include "base/basictypes.h" 11 #include "base/i18n/icu_string_conversions.h" 12 #include "base/string_util.h" 13 #include "base/stringprintf.h" 14 #include "base/utf_string_conversions.h" 15 #include "testing/gtest/include/gtest/gtest.h" 16 17 namespace { 18 19 static const size_t kNpos = string16::npos; 20 21 struct EscapeCase { 22 const wchar_t* input; 23 const wchar_t* output; 24 }; 25 26 struct UnescapeURLCase { 27 const wchar_t* input; 28 UnescapeRule::Type rules; 29 const wchar_t* output; 30 }; 31 32 struct UnescapeURLCaseASCII { 33 const char* input; 34 UnescapeRule::Type rules; 35 const char* output; 36 }; 37 38 struct UnescapeAndDecodeCase { 39 const char* input; 40 41 // The expected output when run through UnescapeURL. 42 const char* url_unescaped; 43 44 // The expected output when run through UnescapeQuery. 45 const char* query_unescaped; 46 47 // The expected output when run through UnescapeAndDecodeURLComponent. 48 const wchar_t* decoded; 49 }; 50 51 struct AdjustOffsetCase { 52 const char* input; 53 size_t input_offset; 54 size_t output_offset; 55 }; 56 57 struct EscapeForHTMLCase { 58 const char* input; 59 const char* expected_output; 60 }; 61 62 } // namespace 63 64 TEST(EscapeTest, EscapeTextForFormSubmission) { 65 const EscapeCase escape_cases[] = { 66 {L"foo", L"foo"}, 67 {L"foo bar", L"foo+bar"}, 68 {L"foo++", L"foo%2B%2B"} 69 }; 70 for (size_t i = 0; i < arraysize(escape_cases); ++i) { 71 EscapeCase value = escape_cases[i]; 72 EXPECT_EQ(WideToUTF16Hack(value.output), 73 EscapeQueryParamValueUTF8(WideToUTF16Hack(value.input), true)); 74 } 75 76 const EscapeCase escape_cases_no_plus[] = { 77 {L"foo", L"foo"}, 78 {L"foo bar", L"foo%20bar"}, 79 {L"foo++", L"foo%2B%2B"} 80 }; 81 for (size_t i = 0; i < arraysize(escape_cases_no_plus); ++i) { 82 EscapeCase value = escape_cases_no_plus[i]; 83 EXPECT_EQ(WideToUTF16Hack(value.output), 84 EscapeQueryParamValueUTF8(WideToUTF16Hack(value.input), false)); 85 } 86 87 // Test all the values in we're supposed to be escaping. 88 const std::string no_escape( 89 "abcdefghijklmnopqrstuvwxyz" 90 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 91 "0123456789" 92 "!'()*-._~"); 93 for (int i = 0; i < 256; ++i) { 94 std::string in; 95 in.push_back(i); 96 std::string out = EscapeQueryParamValue(in, true); 97 if (0 == i) { 98 EXPECT_EQ(out, std::string("%00")); 99 } else if (32 == i) { 100 // Spaces are plus escaped like web forms. 101 EXPECT_EQ(out, std::string("+")); 102 } else if (no_escape.find(in) == std::string::npos) { 103 // Check %hex escaping 104 std::string expected = base::StringPrintf("%%%02X", i); 105 EXPECT_EQ(expected, out); 106 } else { 107 // No change for things in the no_escape list. 108 EXPECT_EQ(out, in); 109 } 110 } 111 112 // Check to see if EscapeQueryParamValueUTF8 is the same as 113 // EscapeQueryParamValue(..., kCodepageUTF8,) 114 string16 test_str; 115 test_str.reserve(5000); 116 for (int i = 1; i < 5000; ++i) { 117 test_str.push_back(i); 118 } 119 string16 wide; 120 EXPECT_TRUE(EscapeQueryParamValue(test_str, base::kCodepageUTF8, true, 121 &wide)); 122 EXPECT_EQ(wide, EscapeQueryParamValueUTF8(test_str, true)); 123 EXPECT_TRUE(EscapeQueryParamValue(test_str, base::kCodepageUTF8, false, 124 &wide)); 125 EXPECT_EQ(wide, EscapeQueryParamValueUTF8(test_str, false)); 126 } 127 128 TEST(EscapeTest, EscapePath) { 129 ASSERT_EQ( 130 // Most of the character space we care about, un-escaped 131 EscapePath( 132 "\x02\n\x1d !\"#$%&'()*+,-./0123456789:;" 133 "<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" 134 "[\\]^_`abcdefghijklmnopqrstuvwxyz" 135 "{|}~\x7f\x80\xff"), 136 // Escaped 137 "%02%0A%1D%20!%22%23$%25&'()*+,-./0123456789%3A;" 138 "%3C=%3E%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ" 139 "%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz" 140 "%7B%7C%7D~%7F%80%FF"); 141 } 142 143 TEST(EscapeTest, EscapeUrlEncodedData) { 144 ASSERT_EQ( 145 // Most of the character space we care about, un-escaped 146 EscapeUrlEncodedData( 147 "\x02\n\x1d !\"#$%&'()*+,-./0123456789:;" 148 "<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" 149 "[\\]^_`abcdefghijklmnopqrstuvwxyz" 150 "{|}~\x7f\x80\xff"), 151 // Escaped 152 "%02%0A%1D+!%22%23%24%25%26%27()*%2B,-./0123456789:%3B" 153 "%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ" 154 "%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz" 155 "%7B%7C%7D~%7F%80%FF"); 156 } 157 158 TEST(EscapeTest, UnescapeURLComponentASCII) { 159 const UnescapeURLCaseASCII unescape_cases[] = { 160 {"", UnescapeRule::NORMAL, ""}, 161 {"%2", UnescapeRule::NORMAL, "%2"}, 162 {"%%%%%%", UnescapeRule::NORMAL, "%%%%%%"}, 163 {"Don't escape anything", UnescapeRule::NORMAL, "Don't escape anything"}, 164 {"Invalid %escape %2", UnescapeRule::NORMAL, "Invalid %escape %2"}, 165 {"Some%20random text %25%2dOK", UnescapeRule::NONE, 166 "Some%20random text %25%2dOK"}, 167 {"Some%20random text %25%2dOK", UnescapeRule::NORMAL, 168 "Some%20random text %25-OK"}, 169 {"Some%20random text %25%2dOK", UnescapeRule::SPACES, 170 "Some random text %25-OK"}, 171 {"Some%20random text %25%2dOK", UnescapeRule::URL_SPECIAL_CHARS, 172 "Some%20random text %-OK"}, 173 {"Some%20random text %25%2dOK", 174 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 175 "Some random text %-OK"}, 176 {"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, "\xA0\xB1\xC2\xD3\xE4\xF5"}, 177 {"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, "\xAa\xBb\xCc\xDd\xEe\xFf"}, 178 // Certain URL-sensitive characters should not be unescaped unless asked. 179 {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::SPACES, 180 "Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"}, 181 {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", 182 UnescapeRule::URL_SPECIAL_CHARS, 183 "Hello%20%13%10world ## ?? == && %% ++"}, 184 // Control characters. 185 {"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::URL_SPECIAL_CHARS, 186 "%01%02%03%04%05%06%07%08%09 %"}, 187 {"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::CONTROL_CHARS, 188 "\x01\x02\x03\x04\x05\x06\x07\x08\x09 %25"}, 189 {"Hello%20%13%10%02", UnescapeRule::SPACES, "Hello %13%10%02"}, 190 {"Hello%20%13%10%02", UnescapeRule::CONTROL_CHARS, "Hello%20\x13\x10\x02"}, 191 }; 192 193 for (size_t i = 0; i < arraysize(unescape_cases); i++) { 194 std::string str(unescape_cases[i].input); 195 EXPECT_EQ(std::string(unescape_cases[i].output), 196 UnescapeURLComponent(str, unescape_cases[i].rules)); 197 } 198 199 // Test the NULL character unescaping (which wouldn't work above since those 200 // are just char pointers). 201 std::string input("Null"); 202 input.push_back(0); // Also have a NULL in the input. 203 input.append("%00%39Test"); 204 205 // When we're unescaping NULLs 206 std::string expected("Null"); 207 expected.push_back(0); 208 expected.push_back(0); 209 expected.append("9Test"); 210 EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::CONTROL_CHARS)); 211 212 // When we're not unescaping NULLs. 213 expected = "Null"; 214 expected.push_back(0); 215 expected.append("%009Test"); 216 EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL)); 217 } 218 219 TEST(EscapeTest, UnescapeURLComponent) { 220 const UnescapeURLCase unescape_cases[] = { 221 {L"", UnescapeRule::NORMAL, L""}, 222 {L"%2", UnescapeRule::NORMAL, L"%2"}, 223 {L"%%%%%%", UnescapeRule::NORMAL, L"%%%%%%"}, 224 {L"Don't escape anything", UnescapeRule::NORMAL, L"Don't escape anything"}, 225 {L"Invalid %escape %2", UnescapeRule::NORMAL, L"Invalid %escape %2"}, 226 {L"Some%20random text %25%2dOK", UnescapeRule::NONE, 227 L"Some%20random text %25%2dOK"}, 228 {L"Some%20random text %25%2dOK", UnescapeRule::NORMAL, 229 L"Some%20random text %25-OK"}, 230 {L"Some%20random text %25%2dOK", UnescapeRule::SPACES, 231 L"Some random text %25-OK"}, 232 {L"Some%20random text %25%2dOK", UnescapeRule::URL_SPECIAL_CHARS, 233 L"Some%20random text %-OK"}, 234 {L"Some%20random text %25%2dOK", 235 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 236 L"Some random text %-OK"}, 237 {L"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, L"\xA0\xB1\xC2\xD3\xE4\xF5"}, 238 {L"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, L"\xAa\xBb\xCc\xDd\xEe\xFf"}, 239 // Certain URL-sensitive characters should not be unescaped unless asked. 240 {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::SPACES, 241 L"Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"}, 242 {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", 243 UnescapeRule::URL_SPECIAL_CHARS, 244 L"Hello%20%13%10world ## ?? == && %% ++"}, 245 // We can neither escape nor unescape '@' since some websites expect it to 246 // be preserved as either '@' or "%40". 247 // See http://b/996720 and http://crbug.com/23933 . 248 {L"me@my%40example", UnescapeRule::NORMAL, L"me@my%40example"}, 249 // Control characters. 250 {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::URL_SPECIAL_CHARS, 251 L"%01%02%03%04%05%06%07%08%09 %"}, 252 {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::CONTROL_CHARS, 253 L"\x01\x02\x03\x04\x05\x06\x07\x08\x09 %25"}, 254 {L"Hello%20%13%10%02", UnescapeRule::SPACES, L"Hello %13%10%02"}, 255 {L"Hello%20%13%10%02", UnescapeRule::CONTROL_CHARS, 256 L"Hello%20\x13\x10\x02"}, 257 {L"Hello\x9824\x9827", UnescapeRule::CONTROL_CHARS, 258 L"Hello\x9824\x9827"}, 259 }; 260 261 for (size_t i = 0; i < arraysize(unescape_cases); i++) { 262 string16 str(WideToUTF16(unescape_cases[i].input)); 263 EXPECT_EQ(WideToUTF16(unescape_cases[i].output), 264 UnescapeURLComponent(str, unescape_cases[i].rules)); 265 } 266 267 // Test the NULL character unescaping (which wouldn't work above since those 268 // are just char pointers). 269 string16 input(WideToUTF16(L"Null")); 270 input.push_back(0); // Also have a NULL in the input. 271 input.append(WideToUTF16(L"%00%39Test")); 272 273 // When we're unescaping NULLs 274 string16 expected(WideToUTF16(L"Null")); 275 expected.push_back(0); 276 expected.push_back(0); 277 expected.append(ASCIIToUTF16("9Test")); 278 EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::CONTROL_CHARS)); 279 280 // When we're not unescaping NULLs. 281 expected = WideToUTF16(L"Null"); 282 expected.push_back(0); 283 expected.append(WideToUTF16(L"%009Test")); 284 EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL)); 285 } 286 287 TEST(EscapeTest, UnescapeAndDecodeUTF8URLComponent) { 288 const UnescapeAndDecodeCase unescape_cases[] = { 289 { "%", 290 "%", 291 "%", 292 L"%"}, 293 { "+", 294 "+", 295 " ", 296 L"+"}, 297 { "%2+", 298 "%2+", 299 "%2 ", 300 L"%2+"}, 301 { "+%%%+%%%", 302 "+%%%+%%%", 303 " %%% %%%", 304 L"+%%%+%%%"}, 305 { "Don't escape anything", 306 "Don't escape anything", 307 "Don't escape anything", 308 L"Don't escape anything"}, 309 { "+Invalid %escape %2+", 310 "+Invalid %escape %2+", 311 " Invalid %escape %2 ", 312 L"+Invalid %escape %2+"}, 313 { "Some random text %25%2dOK", 314 "Some random text %25-OK", 315 "Some random text %25-OK", 316 L"Some random text %25-OK"}, 317 { "%01%02%03%04%05%06%07%08%09", 318 "%01%02%03%04%05%06%07%08%09", 319 "%01%02%03%04%05%06%07%08%09", 320 L"%01%02%03%04%05%06%07%08%09"}, 321 { "%E4%BD%A0+%E5%A5%BD", 322 "\xE4\xBD\xA0+\xE5\xA5\xBD", 323 "\xE4\xBD\xA0 \xE5\xA5\xBD", 324 L"\x4f60+\x597d"}, 325 { "%ED%ED", // Invalid UTF-8. 326 "\xED\xED", 327 "\xED\xED", 328 L"%ED%ED"}, // Invalid UTF-8 -> kept unescaped. 329 }; 330 331 for (size_t i = 0; i < arraysize(unescape_cases); i++) { 332 std::string unescaped = UnescapeURLComponent(unescape_cases[i].input, 333 UnescapeRule::NORMAL); 334 EXPECT_EQ(std::string(unescape_cases[i].url_unescaped), unescaped); 335 336 unescaped = UnescapeURLComponent(unescape_cases[i].input, 337 UnescapeRule::REPLACE_PLUS_WITH_SPACE); 338 EXPECT_EQ(std::string(unescape_cases[i].query_unescaped), unescaped); 339 340 // TODO: Need to test unescape_spaces and unescape_percent. 341 string16 decoded = UnescapeAndDecodeUTF8URLComponent( 342 unescape_cases[i].input, UnescapeRule::NORMAL, NULL); 343 EXPECT_EQ(WideToUTF16Hack(std::wstring(unescape_cases[i].decoded)), 344 decoded); 345 } 346 } 347 348 TEST(EscapeTest, AdjustOffset) { 349 const AdjustOffsetCase adjust_cases[] = { 350 {"", 0, std::wstring::npos}, 351 {"test", 0, 0}, 352 {"test", 2, 2}, 353 {"test", 4, std::wstring::npos}, 354 {"test", std::wstring::npos, std::wstring::npos}, 355 {"%2dtest", 6, 4}, 356 {"%2dtest", 2, std::wstring::npos}, 357 {"test%2d", 2, 2}, 358 {"%E4%BD%A0+%E5%A5%BD", 9, 1}, 359 {"%E4%BD%A0+%E5%A5%BD", 6, std::wstring::npos}, 360 {"%ED%B0%80+%E5%A5%BD", 6, 6}, 361 }; 362 363 for (size_t i = 0; i < arraysize(adjust_cases); i++) { 364 size_t offset = adjust_cases[i].input_offset; 365 UnescapeAndDecodeUTF8URLComponent(adjust_cases[i].input, 366 UnescapeRule::NORMAL, &offset); 367 EXPECT_EQ(adjust_cases[i].output_offset, offset); 368 } 369 } 370 371 TEST(EscapeTest, EscapeForHTML) { 372 const EscapeForHTMLCase tests[] = { 373 { "hello", "hello" }, 374 { "<hello>", "<hello>" }, 375 { "don\'t mess with me", "don't mess with me" }, 376 }; 377 for (size_t i = 0; i < arraysize(tests); ++i) { 378 std::string result = EscapeForHTML(std::string(tests[i].input)); 379 EXPECT_EQ(std::string(tests[i].expected_output), result); 380 } 381 } 382 383 TEST(EscapeTest, UnescapeForHTML) { 384 const EscapeForHTMLCase tests[] = { 385 { "", "" }, 386 { "<hello>", "<hello>" }, 387 { "don't mess with me", "don\'t mess with me" }, 388 { "<>&"'", "<>&\"'" }, 389 { "& lt; & ; &; '", "& lt; & ; &; '" }, 390 { "&", "&" }, 391 { """, "\"" }, 392 { "'", "'" }, 393 { "<", "<" }, 394 { ">", ">" }, 395 { "& &", "& &" }, 396 }; 397 for (size_t i = 0; i < arraysize(tests); ++i) { 398 string16 result = UnescapeForHTML(ASCIIToUTF16(tests[i].input)); 399 EXPECT_EQ(ASCIIToUTF16(tests[i].expected_output), result); 400 } 401 } 402 403 TEST(EscapeTest, AdjustEncodingOffset) { 404 // Imagine we have strings as shown in the following cases where the 405 // %XX's represent encoded characters 406 407 // 1: abc%ECdef ==> abcXdef 408 std::vector<size_t> offsets; 409 for (size_t t = 0; t < 9; ++t) 410 offsets.push_back(t); 411 AdjustEncodingOffset::Adjustments adjustments; 412 adjustments.push_back(3); 413 std::for_each(offsets.begin(), offsets.end(), 414 AdjustEncodingOffset(adjustments)); 415 size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6}; 416 EXPECT_EQ(offsets.size(), arraysize(expected_1)); 417 for (size_t i = 0; i < arraysize(expected_1); ++i) 418 EXPECT_EQ(expected_1[i], offsets[i]); 419 420 421 // 2: %ECabc%EC%ECdef%EC ==> XabcXXdefX 422 offsets.clear(); 423 for (size_t t = 0; t < 18; ++t) 424 offsets.push_back(t); 425 adjustments.clear(); 426 adjustments.push_back(0); 427 adjustments.push_back(6); 428 adjustments.push_back(9); 429 adjustments.push_back(15); 430 std::for_each(offsets.begin(), offsets.end(), 431 AdjustEncodingOffset(adjustments)); 432 size_t expected_2[] = {0, kNpos, kNpos, 1, 2, 3, 4, kNpos, kNpos, 5, kNpos, 433 kNpos, 6, 7, 8, 9, kNpos, kNpos}; 434 EXPECT_EQ(offsets.size(), arraysize(expected_2)); 435 for (size_t i = 0; i < arraysize(expected_2); ++i) 436 EXPECT_EQ(expected_2[i], offsets[i]); 437 } 438