// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <errno.h>

#include "base/macros.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
#include "url/url_canon_stdstring.h"
#include "url/url_parse.h"
#include "url/url_test_utils.h"

// Some implementations of base/basictypes.h may define ARRAYSIZE.
// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
// which is in our version of basictypes.h.
#ifndef ARRAYSIZE
#define ARRAYSIZE ARRAYSIZE_UNSAFE
#endif

namespace url {

using test_utils::WStringToUTF16;
using test_utils::ConvertUTF8ToUTF16;
using test_utils::ConvertUTF16ToUTF8;

namespace {

// A canonicalization test case with a single 8-bit input. The tests feed
// |input| to a canonicalizer and compare the produced string, the produced
// output component and the returned success flag against the expectations.
struct ComponentCase {
  const char* input;             // Text handed to the canonicalizer.
  const char* expected;          // Expected canonical output string.
  Component expected_component;  // Expected begin/len of the output component.
  bool expected_success;         // Expected return value of the canonicalizer.
};

// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
// treat each input as optional, and will only try processing if non-NULL.
// The output is always 8-bit.
struct DualComponentCase {
  const char* input8;
  const wchar_t* input16;
  const char* expected;
  Component expected_component;
  bool expected_success;
};

// Test cases for CanonicalizeIPAddress(). The inputs are identical to
// DualComponentCase, but the output has extra CanonHostInfo fields.
struct IPAddressCase {
  const char* input8;
  const wchar_t* input16;
  const char* expected;
  Component expected_component;

  // CanonHostInfo fields, for verbose output.
  CanonHostInfo::Family expected_family;
  // Only compared by the tests when the family is IPV4.
  int expected_num_ipv4_components;
  const char* expected_address_hex;  // Two hex chars per IP address byte.
60 }; 61 62 std::string BytesToHexString(unsigned char bytes[16], int length) { 63 EXPECT_TRUE(length == 0 || length == 4 || length == 16) 64 << "Bad IP address length: " << length; 65 std::string result; 66 for (int i = 0; i < length; ++i) { 67 result.push_back(kHexCharLookup[(bytes[i] >> 4) & 0xf]); 68 result.push_back(kHexCharLookup[bytes[i] & 0xf]); 69 } 70 return result; 71 } 72 73 struct ReplaceCase { 74 const char* base; 75 const char* scheme; 76 const char* username; 77 const char* password; 78 const char* host; 79 const char* port; 80 const char* path; 81 const char* query; 82 const char* ref; 83 const char* expected; 84 }; 85 86 // Magic string used in the replacements code that tells SetupReplComp to 87 // call the clear function. 88 const char kDeleteComp[] = "|"; 89 90 // Sets up a replacement for a single component. This is given pointers to 91 // the set and clear function for the component being replaced, and will 92 // either set the component (if it exists) or clear it (if the replacement 93 // string matches kDeleteComp). 94 // 95 // This template is currently used only for the 8-bit case, and the strlen 96 // causes it to fail in other cases. It is left a template in case we have 97 // tests for wide replacements. 98 template<typename CHAR> 99 void SetupReplComp( 100 void (Replacements<CHAR>::*set)(const CHAR*, const Component&), 101 void (Replacements<CHAR>::*clear)(), 102 Replacements<CHAR>* rep, 103 const CHAR* str) { 104 if (str && str[0] == kDeleteComp[0]) { 105 (rep->*clear)(); 106 } else if (str) { 107 (rep->*set)(str, Component(0, static_cast<int>(strlen(str)))); 108 } 109 } 110 111 } // namespace 112 113 TEST(URLCanonTest, DoAppendUTF8) { 114 struct UTF8Case { 115 unsigned input; 116 const char* output; 117 } utf_cases[] = { 118 // Valid code points. 
{0x24, "\x24"},
    {0xA2, "\xC2\xA2"},
    {0x20AC, "\xE2\x82\xAC"},
    {0x24B62, "\xF0\xA4\xAD\xA2"},
    {0x10FFFF, "\xF4\x8F\xBF\xBF"},
  };
  std::string out_str;
  // Each code point must be appended as exactly the expected UTF-8 bytes.
  for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) {
    out_str.clear();
    StdStringCanonOutput output(&out_str);
    AppendUTF8Value(utf_cases[i].input, &output);
    output.Complete();
    EXPECT_EQ(utf_cases[i].output, out_str);
  }
}

#if defined(GTEST_HAS_DEATH_TEST)
// TODO(mattm): Can't run this in debug mode for now, since the DCHECK will
// cause the Chromium stacktrace dialog to appear and hang the test.
// See http://crbug.com/49580.
//
// The test keeps its real name only when NDEBUG is defined and
// DCHECK_ALWAYS_ON is not; in every other configuration it is registered
// under a DISABLED_ name.
#if defined(NDEBUG) && !defined(DCHECK_ALWAYS_ON)
#define MAYBE_DoAppendUTF8Invalid DoAppendUTF8Invalid
#else
#define MAYBE_DoAppendUTF8Invalid DISABLED_DoAppendUTF8Invalid
#endif
TEST(URLCanonTest, MAYBE_DoAppendUTF8Invalid) {
  std::string out_str;
  StdStringCanonOutput output(&out_str);
  // Invalid code point (too large).
  ASSERT_DEBUG_DEATH({
    AppendUTF8Value(0x110000, &output);
    output.Complete();
    EXPECT_EQ("", out_str);
  }, "");
}
#endif  // defined(GTEST_HAS_DEATH_TEST)

TEST(URLCanonTest, UTF) {
  // Low-level test that we handle reading, canonicalization, and writing
  // UTF-8/UTF-16 strings properly.
  struct UTFCase {
    const char* input8;
    const wchar_t* input16;
    bool expected_success;
    const char* output;
  } utf_cases[] = {
    // Valid canonical input should get passed through & escaped.
    {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
    // Test a character that takes > 16 bits (U+10300 = old italic letter A)
    {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
    // Non-shortest-form UTF-8 are invalid. The bad char should be replaced
    // with the invalid character (EF BF BD in UTF-8).
{"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"},
    // Invalid UTF-8 sequences should be marked as invalid (the first
    // sequence is truncated).
    {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
    // Character going off the end.
    {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
    // ...same with low surrogates with no high surrogate.
    {"\xed\xb0\x80", L"\xdc00", false, "%EF%BF%BD"},
    // Test a UTF-8 encoded surrogate value is marked as invalid.
    // ED A0 80 = U+D800
    {"\xed\xa0\x80", NULL, false, "%EF%BF%BD"},
  };

  std::string out_str;
  for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) {
    // 8-bit input, skipped when the case provides none.
    if (utf_cases[i].input8) {
      out_str.clear();
      StdStringCanonOutput output(&out_str);

      int input_len = static_cast<int>(strlen(utf_cases[i].input8));
      bool success = true;
      // |ch| is passed by pointer; presumably the callee advances it over the
      // remaining units of a multi-unit character (confirm against
      // AppendUTF8EscapedChar). One failing call marks the whole input as
      // failed because the results are and-ed together.
      for (int ch = 0; ch < input_len; ch++) {
        success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len,
                                         &output);
      }
      output.Complete();
      EXPECT_EQ(utf_cases[i].expected_success, success);
      EXPECT_EQ(std::string(utf_cases[i].output), out_str);
    }
    // 16-bit input, skipped when the case provides none. The expected output
    // is the same 8-bit string as for the 8-bit input.
    if (utf_cases[i].input16) {
      out_str.clear();
      StdStringCanonOutput output(&out_str);

      base::string16 input_str(WStringToUTF16(utf_cases[i].input16));
      int input_len = static_cast<int>(input_str.length());
      bool success = true;
      for (int ch = 0; ch < input_len; ch++) {
        success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
                                         &output);
      }
      output.Complete();
      EXPECT_EQ(utf_cases[i].expected_success, success);
      EXPECT_EQ(std::string(utf_cases[i].output), out_str);
    }

    if (utf_cases[i].input8 && utf_cases[i].input16 &&
        utf_cases[i].expected_success) {
      // Check that the UTF-8 and UTF-16 inputs are equivalent.
// UTF-16 -> UTF-8
      std::string input8_str(utf_cases[i].input8);
      base::string16 input16_str(WStringToUTF16(utf_cases[i].input16));
      EXPECT_EQ(input8_str, ConvertUTF16ToUTF8(input16_str));

      // UTF-8 -> UTF-16
      EXPECT_EQ(input16_str, ConvertUTF8ToUTF16(input8_str));
    }
  }
}

TEST(URLCanonTest, Scheme) {
  // Here, we're mostly testing that unusual characters are handled properly.
  // The canonicalizer doesn't do any parsing or whitespace detection. It will
  // also do its best on error, and will escape funny sequences (these won't be
  // valid schemes and it will return error).
  //
  // Note that the canonicalizer will append a colon to the output to separate
  // out the rest of the URL, which is not present in the input. We check,
  // however, that the output range includes everything but the colon.
  //
  // Columns: input, expected output, expected output component, expected
  // success.
  ComponentCase scheme_cases[] = {
    {"http", "http:", Component(0, 4), true},
    {"HTTP", "http:", Component(0, 4), true},
    {" HTTP ", "%20http%20:", Component(0, 10), false},
    {"htt: ", "htt%3A%20:", Component(0, 9), false},
    {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", Component(0, 22), false},
    // Don't re-escape something already escaped. Note that it will
    // "canonicalize" the 'A' to 'a', but that's OK.
{"ht%3Atp", "ht%3atp:", Component(0, 7), false},
  };

  std::string out_str;

  for (size_t i = 0; i < arraysize(scheme_cases); i++) {
    int url_len = static_cast<int>(strlen(scheme_cases[i].input));
    Component in_comp(0, url_len);
    Component out_comp;

    // 8-bit version.
    out_str.clear();
    StdStringCanonOutput output1(&out_str);
    bool success = CanonicalizeScheme(scheme_cases[i].input, in_comp, &output1,
                                      &out_comp);
    output1.Complete();

    EXPECT_EQ(scheme_cases[i].expected_success, success);
    EXPECT_EQ(std::string(scheme_cases[i].expected), out_str);
    EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
    EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);

    // Now try the wide version
    out_str.clear();
    StdStringCanonOutput output2(&out_str);

    // The input component length is recomputed because the UTF-16 string can
    // have a different number of code units than the UTF-8 input has bytes.
    base::string16 wide_input(ConvertUTF8ToUTF16(scheme_cases[i].input));
    in_comp.len = static_cast<int>(wide_input.length());
    success = CanonicalizeScheme(wide_input.c_str(), in_comp, &output2,
                                 &out_comp);
    output2.Complete();

    EXPECT_EQ(scheme_cases[i].expected_success, success);
    EXPECT_EQ(std::string(scheme_cases[i].expected), out_str);
    EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
    EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
  }

  // Test the case where the scheme is declared nonexistent, it should be
  // converted into an empty scheme.
  Component out_comp;
  out_str.clear();
  StdStringCanonOutput output(&out_str);

  EXPECT_TRUE(CanonicalizeScheme("", Component(0, -1), &output, &out_comp));
  output.Complete();

  EXPECT_EQ(std::string(":"), out_str);
  EXPECT_EQ(0, out_comp.begin);
  EXPECT_EQ(0, out_comp.len);
}

TEST(URLCanonTest, Host) {
  IPAddressCase host_cases[] = {
    // Basic canonicalization, uppercase should be converted to lowercase.
{"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
    // Spaces and some other characters should be escaped.
    {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""},
    // Exciting different types of spaces!
    {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
    // Other types of space (no-break, zero-width, zero-width-no-break) are
    // name-prepped away to nothing.
    {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
    // Ideographic full stop (full-width period for Chinese, etc.) should be
    // treated as a dot.
    {NULL, L"www.foo\x3002" L"bar.com", "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
    // Invalid unicode characters should fail...
    // ...In wide input, ICU will barf and we'll end up with the input as
    //    escaped UTF-8 (the invalid character should be replaced with the
    //    replacement character).
    {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
    // ...This is the same as previous but with the input escaped.
    {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
    // Test name prepping, fullwidth input should be converted to ASCII and NOT
    // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
    {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
    // Test that fullwidth escaped values are properly name-prepped,
    // then converted or rejected.
// ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
    {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
    {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
    // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
    {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
    {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
    // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
    {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
    // See http://unicode.org/cldr/utility/idna.jsp for other
    // examples/experiments and http://goo.gl/7yG11o
    // for the full list of characters handled differently by
    // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008.

    // 4 Deviation characters are mapped/ignored in UTS 46 transitional
    // mechanism. UTS 46, table 4 row (g).
    // Sharp-s is mapped to 'ss' in UTS 46 and IDNA 2003.
    // Otherwise, it'd be "xn--fuball-cta.de".
    {"fu\xc3\x9f" "ball.de", L"fu\x00df" L"ball.de", "fussball.de",
     Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
    // Final-sigma (U+03C2) is mapped to regular sigma (U+03C3).
    // Otherwise, it'd be "xn--wxaijb9b".
    {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
     "xn--wxaikc6b", Component(0, 12),
     CanonHostInfo::NEUTRAL, -1, ""},
    // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
    // handling as well as in IDNA 2003.
{"a\xe2\x80\x8c" "b\xe2\x80\x8d" "c", L"a\x200c" L"b\x200d" L"c", "abc",
     Component(0, 3), CanonHostInfo::NEUTRAL, -1, ""},
    // ZWJ between Devanagari characters is still mapped away in UTS 46
    // transitional handling. IDNA 2008 would give xn--11bo0mv54g.
    {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
     L"\x915\x94d\x200d\x91c", "xn--11bo0m",
     Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
    // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
    // However, we do allow this at the moment because we don't use
    // STD3 rules and canonicalize full-width ASCII to ASCII.
    {"wow\xef\xbc\x81", L"wow\xff01", "wow%21",
     Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
    // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
    // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
    {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo",
     Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
    // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
    // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
    {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
     "%F0%AF%A1%A8%E5%A7%BB.cn",
     Component(0, 24), CanonHostInfo::BROKEN, -1, ""},
    // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
    {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya",
     Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
    // Symbol/punctuations are allowed in IDNA 2003/UTS46.
    // Not allowed in IDNA 2008. UTS 46 table 4 row (f).
    {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us",
     Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
    // U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
    // We used to allow it because we passed through unassigned code points.
{"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
     Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
    // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
    // Used to be allowed in IDNA 2003.
    {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg",
     Component(0, 9), CanonHostInfo::BROKEN, -1, ""},
    // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
    // on Unicode 3.2). We did allow it in the past because we let unassigned
    // code point pass. We continue to allow it even though it's a
    // "punctuation and symbol" blocked in IDNA 2008.
    // UTS 46 table 4, row (j)
    {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com",
     Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
    // Maps uppercase letters to lower case letters.
    // In IDNA 2003, it's allowed without case-folding
    // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
    // (added in Unicode 4.1). UTS 46 table 4 row (k)
    {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com",
     Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
    // BiDi check test
    // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
    // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
    {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
     L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw",
     Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
    // Disallowed in both IDNA 2003 and 2008 with BiDi check.
    // Labels starting with a RTL character cannot end with a LTR character.
    {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
     "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21),
     CanonHostInfo::BROKEN, -1, ""},
    // Labels starting with a RTL character can end with BC=EN (European
    // number). Disallowed in IDNA 2003 but now allowed.
{"\xd8\xac\xd8\xa7\xd8\xb1" "2", L"\x62c\x627\x631" L"2",
     "xn--2-ymcov", Component(0, 11),
     CanonHostInfo::NEUTRAL, -1, ""},
    // Labels starting with a RTL character cannot have "L" characters
    // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
    {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
     "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21),
     CanonHostInfo::BROKEN, -1, ""},
    // Labels starting with a RTL character can end with BC=AN (Arabic number)
    // Disallowed in IDNA 2003, but now allowed.
    {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
     "xn--mgbjq0r", Component(0, 11),
     CanonHostInfo::NEUTRAL, -1, ""},
    // Labels starting with a RTL character cannot have "L" characters
    // even if it ends with an BC=AN (Arabic number).
    // Disallowed in both IDNA 2003/2008.
    {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
     "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26),
     CanonHostInfo::BROKEN, -1, ""},
    // Labels starting with a RTL character cannot mix BC=EN and BC=AN
    {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
     "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27),
     CanonHostInfo::BROKEN, -1, ""},
    // As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
    {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com",
     Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
    // U+0080 is not allowed.
    {"\xc2\x80.com", L"\x80.com", "%C2%80.com",
     Component(0, 10), CanonHostInfo::BROKEN, -1, ""},
    // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
    // UTF-8 (wide case). The output should be equivalent to the true wide
    // character input above.
{"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
     L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba",
     Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
    // Invalid escaped characters should fail and the percents should be
    // escaped.
    {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
     CanonHostInfo::BROKEN, -1, ""},
    // If we get an invalid character that has been escaped.
    {"%25", L"%25", "%25", Component(0, 3),
     CanonHostInfo::BROKEN, -1, ""},
    {"hello%00", L"hello%00", "hello%00", Component(0, 8),
     CanonHostInfo::BROKEN, -1, ""},
    // Escaped numbers should be treated like IP addresses if they are.
    {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
     "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
     "C0A80001"},
    {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e",
     "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
     "C0A80001"},
    // Invalid escaping should trigger the regular host error handling.
    {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
    // Something that isn't exactly an IP should get treated as a host and
    // spaces escaped.
    {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
    // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
    // These are "0Xc0.0250.01" in fullwidth.
    {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
    // Broken IP addresses get marked as such.
{"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
    {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12), CanonHostInfo::BROKEN, -1, ""},
    // Cyrillic letter followed by '(' should return punycode for '(' escaped
    // before punycode string was created. I.e.
    // if '(' is escaped after punycode is created we would get xn--%28-8tb
    // (incorrect).
    {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11),
     CanonHostInfo::NEUTRAL, -1, ""},
    // Address with all hexadecimal characters with leading number of 1<<32
    // or greater and should return NEUTRAL rather than BROKEN if not all
    // components are numbers.
    {"12345678912345.de", L"12345678912345.de", "12345678912345.de", Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
    {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de", "12345678912345.12345678912345.de", Component(0, 32), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de", Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
    {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde", Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
  };

  // CanonicalizeHost() non-verbose.
  std::string out_str;
  for (size_t i = 0; i < arraysize(host_cases); i++) {
    // Narrow version.
if (host_cases[i].input8) {
      int host_len = static_cast<int>(strlen(host_cases[i].input8));
      Component in_comp(0, host_len);
      Component out_comp;

      out_str.clear();
      StdStringCanonOutput output(&out_str);

      bool success = CanonicalizeHost(host_cases[i].input8, in_comp, &output,
                                      &out_comp);
      output.Complete();

      // The non-verbose API only reports success/failure: success is expected
      // for every case whose expected family is not BROKEN.
      EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
                success) << "for input: " << host_cases[i].input8;
      EXPECT_EQ(std::string(host_cases[i].expected), out_str) <<
          "for input: " << host_cases[i].input8;
      EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin) <<
          "for input: " << host_cases[i].input8;
      EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len) <<
          "for input: " << host_cases[i].input8;
    }

    // Wide version.
    if (host_cases[i].input16) {
      base::string16 input16(WStringToUTF16(host_cases[i].input16));
      int host_len = static_cast<int>(input16.length());
      Component in_comp(0, host_len);
      Component out_comp;

      out_str.clear();
      StdStringCanonOutput output(&out_str);

      bool success = CanonicalizeHost(input16.c_str(), in_comp, &output,
                                      &out_comp);
      output.Complete();

      // Same expectations as the narrow version (no failure message is
      // attached here).
      EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
                success);
      EXPECT_EQ(std::string(host_cases[i].expected), out_str);
      EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
      EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
    }
  }

  // CanonicalizeHostVerbose()
  for (size_t i = 0; i < arraysize(host_cases); i++) {
    // Narrow version.
if (host_cases[i].input8) {
      int host_len = static_cast<int>(strlen(host_cases[i].input8));
      Component in_comp(0, host_len);

      out_str.clear();
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;

      CanonicalizeHostVerbose(host_cases[i].input8, in_comp, &output,
                              &host_info);
      output.Complete();

      // The verbose API reports its results through |host_info|: the family,
      // the output component and the raw address bytes are all compared.
      EXPECT_EQ(host_cases[i].expected_family, host_info.family);
      EXPECT_EQ(std::string(host_cases[i].expected), out_str);
      EXPECT_EQ(host_cases[i].expected_component.begin,
                host_info.out_host.begin);
      EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
      EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      // The component count is only compared for expected IPv4 results.
      if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
        EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }

    // Wide version.
    if (host_cases[i].input16) {
      base::string16 input16(WStringToUTF16(host_cases[i].input16));
      int host_len = static_cast<int>(input16.length());
      Component in_comp(0, host_len);

      out_str.clear();
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;

      CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
      output.Complete();

      EXPECT_EQ(host_cases[i].expected_family, host_info.family);
      EXPECT_EQ(std::string(host_cases[i].expected), out_str);
      EXPECT_EQ(host_cases[i].expected_component.begin,
                host_info.out_host.begin);
      EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
      EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
        EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }
  }
}

TEST(URLCanonTest, IPv4) {
IPAddressCase cases[] = {
    // Columns: 8-bit input, 16-bit input, expected output, expected output
    // component, expected family, expected number of IPv4 components,
    // expected address bytes in hex.

    // Empty is not an IP address.
    {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Regular IP addresses in different bases.
    {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    // Non-IP addresses due to invalid characters.
    {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Invalid characters for the base should be rejected.
    {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // If there are not enough components, the last one should fill them out.
{"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
    {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
    // NOTE(review): in the next row the 16-bit input has one more leading
    // zero after "0x" than the 8-bit input. Every other row uses identical
    // text for both widths; confirm whether the difference is intentional.
    {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
    // Too many components means not an IP address.
    {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // We allow a single trailing dot.
    {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Two dots in a row means not an IP address.
    {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Any numerical overflow should be marked as BROKEN.
{"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Repeat the previous tests, minus 1, to verify boundaries.
    {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
    {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
    {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
    {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
    {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
    {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
    {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
    {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
    {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
    // Old truncation tests. They're all "BROKEN" now.
{"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Spaces should be rejected.
    {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Very large numbers.
    {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
    // NOTE(review): the next row lists Component(0, 11) although it is
    // BROKEN. The loop below only compares the component for IPV4 results, so
    // this value is never checked.
    {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
    // A number has no length limit, but long numbers can still overflow.
    {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
    {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If a long component is non-numeric, it's a hostname, *not* a broken IP.
    {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Truncation of all zeros should still result in 0.
    {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
  };

  for (size_t i = 0; i < arraysize(cases); i++) {
    // 8-bit version.
672 Component component(0, static_cast<int>(strlen(cases[i].input8))); 673 674 std::string out_str1; 675 StdStringCanonOutput output1(&out_str1); 676 CanonHostInfo host_info; 677 CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info); 678 output1.Complete(); 679 680 EXPECT_EQ(cases[i].expected_family, host_info.family); 681 EXPECT_EQ(std::string(cases[i].expected_address_hex), 682 BytesToHexString(host_info.address, host_info.AddressLength())); 683 if (host_info.family == CanonHostInfo::IPV4) { 684 EXPECT_STREQ(cases[i].expected, out_str1.c_str()); 685 EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); 686 EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); 687 EXPECT_EQ(cases[i].expected_num_ipv4_components, 688 host_info.num_ipv4_components); 689 } 690 691 // 16-bit version. 692 base::string16 input16(WStringToUTF16(cases[i].input16)); 693 component = Component(0, static_cast<int>(input16.length())); 694 695 std::string out_str2; 696 StdStringCanonOutput output2(&out_str2); 697 CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info); 698 output2.Complete(); 699 700 EXPECT_EQ(cases[i].expected_family, host_info.family); 701 EXPECT_EQ(std::string(cases[i].expected_address_hex), 702 BytesToHexString(host_info.address, host_info.AddressLength())); 703 if (host_info.family == CanonHostInfo::IPV4) { 704 EXPECT_STREQ(cases[i].expected, out_str2.c_str()); 705 EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); 706 EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); 707 EXPECT_EQ(cases[i].expected_num_ipv4_components, 708 host_info.num_ipv4_components); 709 } 710 } 711 } 712 713 TEST(URLCanonTest, IPv6) { 714 IPAddressCase cases[] = { 715 // Empty is not an IP address. 716 {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, 717 // Non-IPs with [:] characters are marked BROKEN. 
718 {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 719 {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 720 {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 721 {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 722 {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 723 {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 724 {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 725 // Regular IP address is invalid without bounding '[' and ']'. 726 {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 727 {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 728 {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 729 // Regular IP addresses. 730 {"[::]", L"[::]", "[::]", Component(0,4), CanonHostInfo::IPV6, -1, "00000000000000000000000000000000"}, 731 {"[::1]", L"[::1]", "[::1]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000001"}, 732 {"[1::]", L"[1::]", "[1::]", Component(0,5), CanonHostInfo::IPV6, -1, "00010000000000000000000000000000"}, 733 734 // Leading zeros should be stripped. 735 {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", Component(0,17), CanonHostInfo::IPV6, -1, "00000001000200030004000500060007"}, 736 737 // Upper case letters should be lowercased. 738 {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", Component(0,20), CanonHostInfo::IPV6, -1, "000A000B000C00DE00FF0000000100AC"}, 739 740 // The same address can be written with different contractions, but should 741 // get canonicalized to the same thing. 
742 {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"}, 743 {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"}, 744 745 // Addresses with embedded IPv4. 746 {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0,10), CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"}, 747 {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"}, 748 {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000EEEEC0A80001"}, 749 {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]", Component(0, 14), CanonHostInfo::IPV6, -1, "200100000000000000000000C0A80001"}, 750 {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 751 752 // IPv4 with last component missing. 753 {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0010002"}, 754 755 // IPv4 using hex. 756 // TODO(eroman): Should this format be disallowed? 757 {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"}, 758 759 // There may be zeros surrounding the "::" contraction. 760 {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"}, 761 762 {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0,13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"}, 763 764 // Can only have one "::" contraction in an IPv6 string literal. 765 {"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 766 // No more than 2 consecutive ':'s. 
767 {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 768 {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 769 // Non-IP addresses due to invalid characters. 770 {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 771 // If there are not enough components, the last one should fill them out. 772 // ... omitted at this time ... 773 // Too many components means not an IP address. Similarly with too few if using IPv4 compat or mapped addresses. 774 {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 775 {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 776 {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 777 // Too many bits (even though 8 comonents, the last one holds 32 bits). 778 {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 779 780 // Too many bits specified -- the contraction would have to be zero-length 781 // to not exceed 128 bits. 782 {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 783 784 // The contraction is for 16 bits of zero. 785 {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", Component(0,17), CanonHostInfo::IPV6, -1, "00010002000300040005000600000008"}, 786 787 // Cannot have a trailing colon. 788 {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 789 {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 790 791 // Cannot have negative numbers. 792 {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 793 794 // Scope ID -- the URL may contain an optional ["%" <scope_id>] section. 
795 // The scope_id should be included in the canonicalized URL, and is an 796 // unsigned decimal number. 797 798 // Invalid because no ID was given after the percent. 799 800 // Don't allow scope-id 801 {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 802 {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 803 {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 804 {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 805 {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 806 807 // Don't allow leading or trailing colons. 808 {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 809 {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 810 {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 811 812 // We allow a single trailing dot. 813 // ... omitted at this time ... 814 // Two dots in a row means not an IP address. 815 {"[::192.168..1]", L"[::192.168..1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 816 // Any non-first components get truncated to one byte. 817 // ... omitted at this time ... 818 // Spaces should be rejected. 819 {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, 820 }; 821 822 for (size_t i = 0; i < arraysize(cases); i++) { 823 // 8-bit version. 
824 Component component(0, static_cast<int>(strlen(cases[i].input8))); 825 826 std::string out_str1; 827 StdStringCanonOutput output1(&out_str1); 828 CanonHostInfo host_info; 829 CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info); 830 output1.Complete(); 831 832 EXPECT_EQ(cases[i].expected_family, host_info.family); 833 EXPECT_EQ(std::string(cases[i].expected_address_hex), 834 BytesToHexString(host_info.address, host_info.AddressLength())) << "iter " << i << " host " << cases[i].input8; 835 if (host_info.family == CanonHostInfo::IPV6) { 836 EXPECT_STREQ(cases[i].expected, out_str1.c_str()); 837 EXPECT_EQ(cases[i].expected_component.begin, 838 host_info.out_host.begin); 839 EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); 840 } 841 842 // 16-bit version. 843 base::string16 input16(WStringToUTF16(cases[i].input16)); 844 component = Component(0, static_cast<int>(input16.length())); 845 846 std::string out_str2; 847 StdStringCanonOutput output2(&out_str2); 848 CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info); 849 output2.Complete(); 850 851 EXPECT_EQ(cases[i].expected_family, host_info.family); 852 EXPECT_EQ(std::string(cases[i].expected_address_hex), 853 BytesToHexString(host_info.address, host_info.AddressLength())); 854 if (host_info.family == CanonHostInfo::IPV6) { 855 EXPECT_STREQ(cases[i].expected, out_str2.c_str()); 856 EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); 857 EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); 858 } 859 } 860 } 861 862 TEST(URLCanonTest, IPEmpty) { 863 std::string out_str1; 864 StdStringCanonOutput output1(&out_str1); 865 CanonHostInfo host_info; 866 867 // This tests tests. 
868 const char spec[] = "192.168.0.1"; 869 CanonicalizeIPAddress(spec, Component(), &output1, &host_info); 870 EXPECT_FALSE(host_info.IsIPAddress()); 871 872 CanonicalizeIPAddress(spec, Component(0, 0), &output1, &host_info); 873 EXPECT_FALSE(host_info.IsIPAddress()); 874 } 875 876 TEST(URLCanonTest, UserInfo) { 877 // Note that the canonicalizer should escape and treat empty components as 878 // not being there. 879 880 // We actually parse a full input URL so we can get the initial components. 881 struct UserComponentCase { 882 const char* input; 883 const char* expected; 884 Component expected_username; 885 Component expected_password; 886 bool expected_success; 887 } user_info_cases[] = { 888 {"http://user:pass@host.com/", "user:pass@", Component(0, 4), Component(5, 4), true}, 889 {"http://@host.com/", "", Component(0, -1), Component(0, -1), true}, 890 {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true}, 891 {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true}, 892 {"http://:foo@host.com/", ":foo@", Component(0, 0), Component(1, 3), true}, 893 {"http://^ :$\t (at) host.com/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true}, 894 {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true}, 895 {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true }, 896 897 // IE7 compatability: old versions allowed backslashes in usernames, but 898 // IE7 does not. We disallow it as well. 
899 {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true}, 900 }; 901 902 for (size_t i = 0; i < ARRAYSIZE(user_info_cases); i++) { 903 int url_len = static_cast<int>(strlen(user_info_cases[i].input)); 904 Parsed parsed; 905 ParseStandardURL(user_info_cases[i].input, url_len, &parsed); 906 Component out_user, out_pass; 907 std::string out_str; 908 StdStringCanonOutput output1(&out_str); 909 910 bool success = CanonicalizeUserInfo(user_info_cases[i].input, 911 parsed.username, 912 user_info_cases[i].input, 913 parsed.password, 914 &output1, 915 &out_user, 916 &out_pass); 917 output1.Complete(); 918 919 EXPECT_EQ(user_info_cases[i].expected_success, success); 920 EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); 921 EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); 922 EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); 923 EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); 924 EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); 925 926 // Now try the wide version 927 out_str.clear(); 928 StdStringCanonOutput output2(&out_str); 929 base::string16 wide_input(ConvertUTF8ToUTF16(user_info_cases[i].input)); 930 success = CanonicalizeUserInfo(wide_input.c_str(), 931 parsed.username, 932 wide_input.c_str(), 933 parsed.password, 934 &output2, 935 &out_user, 936 &out_pass); 937 output2.Complete(); 938 939 EXPECT_EQ(user_info_cases[i].expected_success, success); 940 EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); 941 EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); 942 EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); 943 EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); 944 EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); 945 } 946 } 947 948 TEST(URLCanonTest, Port) { 949 // We only need to test that the number gets properly put into the output 950 // buffer. 
The parser unit tests will test scanning the number correctly. 951 // 952 // Note that the CanonicalizePort will always prepend a colon to the output 953 // to separate it from the colon that it assumes preceeds it. 954 struct PortCase { 955 const char* input; 956 int default_port; 957 const char* expected; 958 Component expected_component; 959 bool expected_success; 960 } port_cases[] = { 961 // Invalid input should be copied w/ failure. 962 {"as df", 80, ":as%20df", Component(1, 7), false}, 963 {"-2", 80, ":-2", Component(1, 2), false}, 964 // Default port should be omitted. 965 {"80", 80, "", Component(0, -1), true}, 966 {"8080", 80, ":8080", Component(1, 4), true}, 967 // PORT_UNSPECIFIED should mean always keep the port. 968 {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true}, 969 }; 970 971 for (size_t i = 0; i < ARRAYSIZE(port_cases); i++) { 972 int url_len = static_cast<int>(strlen(port_cases[i].input)); 973 Component in_comp(0, url_len); 974 Component out_comp; 975 std::string out_str; 976 StdStringCanonOutput output1(&out_str); 977 bool success = CanonicalizePort(port_cases[i].input, 978 in_comp, 979 port_cases[i].default_port, 980 &output1, 981 &out_comp); 982 output1.Complete(); 983 984 EXPECT_EQ(port_cases[i].expected_success, success); 985 EXPECT_EQ(std::string(port_cases[i].expected), out_str); 986 EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); 987 EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); 988 989 // Now try the wide version 990 out_str.clear(); 991 StdStringCanonOutput output2(&out_str); 992 base::string16 wide_input(ConvertUTF8ToUTF16(port_cases[i].input)); 993 success = CanonicalizePort(wide_input.c_str(), 994 in_comp, 995 port_cases[i].default_port, 996 &output2, 997 &out_comp); 998 output2.Complete(); 999 1000 EXPECT_EQ(port_cases[i].expected_success, success); 1001 EXPECT_EQ(std::string(port_cases[i].expected), out_str); 1002 EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); 1003 
EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); 1004 } 1005 } 1006 1007 TEST(URLCanonTest, Path) { 1008 DualComponentCase path_cases[] = { 1009 // ----- path collapsing tests ----- 1010 {"/././foo", L"/././foo", "/foo", Component(0, 4), true}, 1011 {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true}, 1012 {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true}, 1013 {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true}, 1014 // double dots followed by a slash or the end of the string count 1015 {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true}, 1016 {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true}, 1017 // don't count double dots when they aren't followed by a slash 1018 {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true}, 1019 // some in the middle 1020 {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 8), true}, 1021 {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", Component(0, 2), true}, 1022 // we should not be able to go above the root 1023 {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true}, 1024 {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true}, 1025 // escaped dots should be unescaped and treated the same as dots 1026 {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true}, 1027 {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true}, 1028 {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", "/..bar", Component(0, 6), true}, 1029 // Multiple slashes in a row should be preserved and treated like empty 1030 // directory names. 1031 {"////../..", L"////../..", "//", Component(0, 2), true}, 1032 1033 // ----- escaping tests ----- 1034 {"/foo", L"/foo", "/foo", Component(0, 4), true}, 1035 // Valid escape sequence 1036 {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true}, 1037 // Invalid escape sequence we should pass through unchanged. 
1038 {"/foo%", L"/foo%", "/foo%", Component(0, 5), true}, 1039 {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true}, 1040 // Invalid escape sequence: bad characters should be treated the same as 1041 // the sourrounding text, not as escaped (in this case, UTF-8). 1042 {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true}, 1043 {"/foo%2\xc2\xa9zbar", NULL, "/foo%2%C2%A9zbar", Component(0, 16), true}, 1044 {NULL, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22), true}, 1045 // Regular characters that are escaped should be unescaped 1046 {"/foo%41%7a", L"/foo%41%7a", "/fooAz", Component(0, 6), true}, 1047 // Funny characters that are unescaped should be escaped 1048 {"/foo\x09\x91%91", NULL, "/foo%09%91%91", Component(0, 13), true}, 1049 {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true}, 1050 // Invalid characters that are escaped should cause a failure. 1051 {"/foo%00%51", L"/foo%00%51", "/foo%00Q", Component(0, 8), false}, 1052 // Some characters should be passed through unchanged regardless of esc. 1053 {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13), true}, 1054 // Characters that are properly escaped should not have the case changed 1055 // of hex letters. 1056 {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13), true}, 1057 // Funny characters that are unescaped should be escaped 1058 {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true}, 1059 // Backslashes should get converted to forward slashes 1060 {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", Component(0, 8), true}, 1061 // Hashes found in paths (possibly only when the caller explicitly sets 1062 // the path on an already-parsed URL) should be escaped. 1063 {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true}, 1064 // %7f should be allowed and %3D should not be unescaped (these were wrong 1065 // in a previous version). 
1066 {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true}, 1067 // @ should be passed through unchanged (escaped or unescaped). 1068 {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true}, 1069 1070 // ----- encoding tests ----- 1071 // Basic conversions 1072 {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", Component(0, 37), true}, 1073 // Invalid unicode characters should fail. We only do validation on 1074 // UTF-16 input, so this doesn't happen on 8-bit. 1075 {"/\xef\xb7\x90zyx", NULL, "/%EF%B7%90zyx", Component(0, 13), true}, 1076 {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", Component(0, 13), false}, 1077 }; 1078 1079 for (size_t i = 0; i < arraysize(path_cases); i++) { 1080 if (path_cases[i].input8) { 1081 int len = static_cast<int>(strlen(path_cases[i].input8)); 1082 Component in_comp(0, len); 1083 Component out_comp; 1084 std::string out_str; 1085 StdStringCanonOutput output(&out_str); 1086 bool success = 1087 CanonicalizePath(path_cases[i].input8, in_comp, &output, &out_comp); 1088 output.Complete(); 1089 1090 EXPECT_EQ(path_cases[i].expected_success, success); 1091 EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); 1092 EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); 1093 EXPECT_EQ(path_cases[i].expected, out_str); 1094 } 1095 1096 if (path_cases[i].input16) { 1097 base::string16 input16(WStringToUTF16(path_cases[i].input16)); 1098 int len = static_cast<int>(input16.length()); 1099 Component in_comp(0, len); 1100 Component out_comp; 1101 std::string out_str; 1102 StdStringCanonOutput output(&out_str); 1103 1104 bool success = 1105 CanonicalizePath(input16.c_str(), in_comp, &output, &out_comp); 1106 output.Complete(); 1107 1108 EXPECT_EQ(path_cases[i].expected_success, success); 1109 EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); 1110 
EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); 1111 EXPECT_EQ(path_cases[i].expected, out_str); 1112 } 1113 } 1114 1115 // Manual test: embedded NULLs should be escaped and the URL should be marked 1116 // as invalid. 1117 const char path_with_null[] = "/ab\0c"; 1118 Component in_comp(0, 5); 1119 Component out_comp; 1120 1121 std::string out_str; 1122 StdStringCanonOutput output(&out_str); 1123 bool success = CanonicalizePath(path_with_null, in_comp, &output, &out_comp); 1124 output.Complete(); 1125 EXPECT_FALSE(success); 1126 EXPECT_EQ("/ab%00c", out_str); 1127 } 1128 1129 TEST(URLCanonTest, Query) { 1130 struct QueryCase { 1131 const char* input8; 1132 const wchar_t* input16; 1133 const char* expected; 1134 } query_cases[] = { 1135 // Regular ASCII case. 1136 {"foo=bar", L"foo=bar", "?foo=bar"}, 1137 // Allow question marks in the query without escaping 1138 {"as?df", L"as?df", "?as?df"}, 1139 // Always escape '#' since it would mark the ref. 1140 {"as#df", L"as#df", "?as%23df"}, 1141 // Escape some questionable 8-bit characters, but never unescape. 1142 {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"}, 1143 {"%40%41123", L"%40%41123", "?%40%41123"}, 1144 // Chinese input/output 1145 {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "?q=%E4%BD%A0%E5%A5%BD"}, 1146 // Invalid UTF-8/16 input should be replaced with invalid characters. 1147 {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"}, 1148 // Don't allow < or > because sometimes they are used for XSS if the 1149 // URL is echoed in content. Firefox does this, IE doesn't. 1150 {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"}, 1151 // Escape double quotemarks in the query. 
1152 {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"}, 1153 }; 1154 1155 for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) { 1156 Component out_comp; 1157 1158 if (query_cases[i].input8) { 1159 int len = static_cast<int>(strlen(query_cases[i].input8)); 1160 Component in_comp(0, len); 1161 std::string out_str; 1162 1163 StdStringCanonOutput output(&out_str); 1164 CanonicalizeQuery(query_cases[i].input8, in_comp, NULL, &output, 1165 &out_comp); 1166 output.Complete(); 1167 1168 EXPECT_EQ(query_cases[i].expected, out_str); 1169 } 1170 1171 if (query_cases[i].input16) { 1172 base::string16 input16(WStringToUTF16(query_cases[i].input16)); 1173 int len = static_cast<int>(input16.length()); 1174 Component in_comp(0, len); 1175 std::string out_str; 1176 1177 StdStringCanonOutput output(&out_str); 1178 CanonicalizeQuery(input16.c_str(), in_comp, NULL, &output, &out_comp); 1179 output.Complete(); 1180 1181 EXPECT_EQ(query_cases[i].expected, out_str); 1182 } 1183 } 1184 1185 // Extra test for input with embedded NULL; 1186 std::string out_str; 1187 StdStringCanonOutput output(&out_str); 1188 Component out_comp; 1189 CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp); 1190 output.Complete(); 1191 EXPECT_EQ("?a%20%00z%01", out_str); 1192 } 1193 1194 TEST(URLCanonTest, Ref) { 1195 // Refs are trivial, it just checks the encoding. 1196 DualComponentCase ref_cases[] = { 1197 // Regular one, we shouldn't escape spaces, et al. 
1198 {"hello, world", L"hello, world", "#hello, world", Component(1, 12), true}, 1199 // UTF-8/wide input should be preserved 1200 {"\xc2\xa9", L"\xa9", "#\xc2\xa9", Component(1, 2), true}, 1201 // Test a characer that takes > 16 bits (U+10300 = old italic letter A) 1202 {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#\xF0\x90\x8C\x80ss", Component(1, 6), true}, 1203 // Escaping should be preserved unchanged, even invalid ones 1204 {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true}, 1205 // Invalid UTF-8/16 input should be flagged and the input made valid 1206 {"\xc2", NULL, "#\xef\xbf\xbd", Component(1, 3), true}, 1207 {NULL, L"\xd800\x597d", "#\xef\xbf\xbd\xe5\xa5\xbd", Component(1, 6), true}, 1208 // Test a Unicode invalid character. 1209 {"a\xef\xb7\x90", L"a\xfdd0", "#a\xef\xbf\xbd", Component(1, 4), true}, 1210 // Refs can have # signs and we should preserve them. 1211 {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true}, 1212 {"#asdf", L"#asdf", "##asdf", Component(1, 5), true}, 1213 }; 1214 1215 for (size_t i = 0; i < arraysize(ref_cases); i++) { 1216 // 8-bit input 1217 if (ref_cases[i].input8) { 1218 int len = static_cast<int>(strlen(ref_cases[i].input8)); 1219 Component in_comp(0, len); 1220 Component out_comp; 1221 1222 std::string out_str; 1223 StdStringCanonOutput output(&out_str); 1224 CanonicalizeRef(ref_cases[i].input8, in_comp, &output, &out_comp); 1225 output.Complete(); 1226 1227 EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); 1228 EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); 1229 EXPECT_EQ(ref_cases[i].expected, out_str); 1230 } 1231 1232 // 16-bit input 1233 if (ref_cases[i].input16) { 1234 base::string16 input16(WStringToUTF16(ref_cases[i].input16)); 1235 int len = static_cast<int>(input16.length()); 1236 Component in_comp(0, len); 1237 Component out_comp; 1238 1239 std::string out_str; 1240 StdStringCanonOutput output(&out_str); 1241 CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp); 
1242 output.Complete(); 1243 1244 EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); 1245 EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); 1246 EXPECT_EQ(ref_cases[i].expected, out_str); 1247 } 1248 } 1249 1250 // Try one with an embedded NULL. It should be stripped. 1251 const char null_input[5] = "ab\x00z"; 1252 Component null_input_component(0, 4); 1253 Component out_comp; 1254 1255 std::string out_str; 1256 StdStringCanonOutput output(&out_str); 1257 CanonicalizeRef(null_input, null_input_component, &output, &out_comp); 1258 output.Complete(); 1259 1260 EXPECT_EQ(1, out_comp.begin); 1261 EXPECT_EQ(3, out_comp.len); 1262 EXPECT_EQ("#abz", out_str); 1263 } 1264 1265 TEST(URLCanonTest, CanonicalizeStandardURL) { 1266 // The individual component canonicalize tests should have caught the cases 1267 // for each of those components. Here, we just need to test that the various 1268 // parts are included or excluded properly, and have the correct separators. 1269 struct URLCase { 1270 const char* input; 1271 const char* expected; 1272 bool expected_success; 1273 } cases[] = { 1274 {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#", true}, 1275 {"http://[www.google.com]/", "http://[www.google.com]/", false}, 1276 {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", false}, 1277 {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", true}, 1278 {"www.google.com", ":www.google.com/", true}, 1279 {"http://192.0x00A80001", "http://192.168.0.1/", true}, 1280 {"http://www/foo%2Ehtml", "http://www/foo.html", true}, 1281 {"http://user:pass@/", "http://user:pass@/", false}, 1282 {"http://%25DOMAIN:foobar@foodomain.com/", "http://%25DOMAIN:foobar@foodomain.com/", true}, 1283 1284 // Backslashes should get converted to forward slashes. 1285 {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true}, 1286 1287 // Busted refs shouldn't make the whole thing fail. 
1288 {"http://www.google.com/asdf#\xc2", "http://www.google.com/asdf#\xef\xbf\xbd", true}, 1289 1290 // Basic port tests. 1291 {"http://foo:80/", "http://foo/", true}, 1292 {"http://foo:81/", "http://foo:81/", true}, 1293 {"httpa://foo:80/", "httpa://foo:80/", true}, 1294 {"http://foo:-80/", "http://foo:-80/", false}, 1295 1296 {"https://foo:443/", "https://foo/", true}, 1297 {"https://foo:80/", "https://foo:80/", true}, 1298 {"ftp://foo:21/", "ftp://foo/", true}, 1299 {"ftp://foo:80/", "ftp://foo:80/", true}, 1300 {"gopher://foo:70/", "gopher://foo/", true}, 1301 {"gopher://foo:443/", "gopher://foo:443/", true}, 1302 {"ws://foo:80/", "ws://foo/", true}, 1303 {"ws://foo:81/", "ws://foo:81/", true}, 1304 {"ws://foo:443/", "ws://foo:443/", true}, 1305 {"ws://foo:815/", "ws://foo:815/", true}, 1306 {"wss://foo:80/", "wss://foo:80/", true}, 1307 {"wss://foo:81/", "wss://foo:81/", true}, 1308 {"wss://foo:443/", "wss://foo/", true}, 1309 {"wss://foo:815/", "wss://foo:815/", true}, 1310 }; 1311 1312 for (size_t i = 0; i < ARRAYSIZE(cases); i++) { 1313 int url_len = static_cast<int>(strlen(cases[i].input)); 1314 Parsed parsed; 1315 ParseStandardURL(cases[i].input, url_len, &parsed); 1316 1317 Parsed out_parsed; 1318 std::string out_str; 1319 StdStringCanonOutput output(&out_str); 1320 bool success = CanonicalizeStandardURL( 1321 cases[i].input, url_len, parsed, NULL, &output, &out_parsed); 1322 output.Complete(); 1323 1324 EXPECT_EQ(cases[i].expected_success, success); 1325 EXPECT_EQ(cases[i].expected, out_str); 1326 } 1327 } 1328 1329 // The codepath here is the same as for regular canonicalization, so we just 1330 // need to test that things are replaced or not correctly. 1331 TEST(URLCanonTest, ReplaceStandardURL) { 1332 ReplaceCase replace_cases[] = { 1333 // Common case of truncating the path. 
1334 {"http://www.google.com/foo?bar=baz#ref", NULL, NULL, NULL, NULL, NULL, "/", kDeleteComp, kDeleteComp, "http://www.google.com/"}, 1335 // Replace everything 1336 {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"}, 1337 // Replace nothing 1338 {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"}, 1339 // Replace scheme with filesystem. The result is garbage, but you asked 1340 // for it. 1341 {"http://a:b@google.com:22/foo?baz@cat", "filesystem", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem://a:b (at) google.com:22/foo?baz@cat"}, 1342 }; 1343 1344 for (size_t i = 0; i < arraysize(replace_cases); i++) { 1345 const ReplaceCase& cur = replace_cases[i]; 1346 int base_len = static_cast<int>(strlen(cur.base)); 1347 Parsed parsed; 1348 ParseStandardURL(cur.base, base_len, &parsed); 1349 1350 Replacements<char> r; 1351 typedef Replacements<char> R; // Clean up syntax. 1352 1353 // Note that for the scheme we pass in a different clear function since 1354 // there is no function to clear the scheme. 
1355 SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); 1356 SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); 1357 SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); 1358 SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); 1359 SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); 1360 SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); 1361 SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); 1362 SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); 1363 1364 std::string out_str; 1365 StdStringCanonOutput output(&out_str); 1366 Parsed out_parsed; 1367 ReplaceStandardURL(replace_cases[i].base, parsed, r, NULL, &output, 1368 &out_parsed); 1369 output.Complete(); 1370 1371 EXPECT_EQ(replace_cases[i].expected, out_str); 1372 } 1373 1374 // The path pointer should be ignored if the address is invalid. 1375 { 1376 const char src[] = "http://www.google.com/here_is_the_path"; 1377 int src_len = static_cast<int>(strlen(src)); 1378 1379 Parsed parsed; 1380 ParseStandardURL(src, src_len, &parsed); 1381 1382 // Replace the path to 0 length string. By using 1 as the string address, 1383 // the test should get an access violation if it tries to dereference it. 1384 Replacements<char> r; 1385 r.SetPath(reinterpret_cast<char*>(0x00000001), Component(0, 0)); 1386 std::string out_str1; 1387 StdStringCanonOutput output1(&out_str1); 1388 Parsed new_parsed; 1389 ReplaceStandardURL(src, parsed, r, NULL, &output1, &new_parsed); 1390 output1.Complete(); 1391 EXPECT_STREQ("http://www.google.com/", out_str1.c_str()); 1392 1393 // Same with an "invalid" path. 
1394 r.SetPath(reinterpret_cast<char*>(0x00000001), Component()); 1395 std::string out_str2; 1396 StdStringCanonOutput output2(&out_str2); 1397 ReplaceStandardURL(src, parsed, r, NULL, &output2, &new_parsed); 1398 output2.Complete(); 1399 EXPECT_STREQ("http://www.google.com/", out_str2.c_str()); 1400 } 1401 } 1402 1403 TEST(URLCanonTest, ReplaceFileURL) { 1404 ReplaceCase replace_cases[] = { 1405 // Replace everything 1406 {"file:///C:/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, 1407 // Replace nothing 1408 {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"}, 1409 // Clear non-path components (common) 1410 {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///C:/gaba"}, 1411 // Replace path with something that doesn't begin with a slash and make 1412 // sure it gets added properly. 1413 {"file:///C:/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, 1414 {"file:///home/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, 1415 {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///home/gaba?query#ref"}, 1416 {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///home/gaba"}, 1417 {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, 1418 // Replace scheme -- shouldn't do anything. 
1419 {"file:///C:/gaba?query#ref", "http", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"}, 1420 }; 1421 1422 for (size_t i = 0; i < arraysize(replace_cases); i++) { 1423 const ReplaceCase& cur = replace_cases[i]; 1424 int base_len = static_cast<int>(strlen(cur.base)); 1425 Parsed parsed; 1426 ParseFileURL(cur.base, base_len, &parsed); 1427 1428 Replacements<char> r; 1429 typedef Replacements<char> R; // Clean up syntax. 1430 SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); 1431 SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); 1432 SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); 1433 SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); 1434 SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); 1435 SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); 1436 SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); 1437 SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); 1438 1439 std::string out_str; 1440 StdStringCanonOutput output(&out_str); 1441 Parsed out_parsed; 1442 ReplaceFileURL(cur.base, parsed, r, NULL, &output, &out_parsed); 1443 output.Complete(); 1444 1445 EXPECT_EQ(replace_cases[i].expected, out_str); 1446 } 1447 } 1448 1449 TEST(URLCanonTest, ReplaceFileSystemURL) { 1450 ReplaceCase replace_cases[] = { 1451 // Replace everything in the outer URL. 1452 {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, "/foo", "b", "c", "filesystem:file:///temporary/foo?b#c"}, 1453 // Replace nothing 1454 {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem:file:///temporary/gaba?query#ref"}, 1455 // Clear non-path components (common) 1456 {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "filesystem:file:///temporary/gaba"}, 1457 // Replace path with something that doesn't begin with a slash and make 1458 // sure it gets added properly. 
1459 {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "filesystem:file:///temporary/interesting/?query#ref"}, 1460 // Replace scheme -- shouldn't do anything. 1461 {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"}, 1462 // Replace username -- shouldn't do anything. 1463 {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, "u2", NULL, NULL, NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"}, 1464 // Replace password -- shouldn't do anything. 1465 {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, NULL, "pw2", NULL, NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"}, 1466 // Replace host -- shouldn't do anything. 1467 {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, NULL, NULL, "foo.com", NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"}, 1468 // Replace port -- shouldn't do anything. 1469 {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", NULL, NULL, NULL, NULL, "41", NULL, NULL, NULL, "filesystem:http://u:p@bar.com:40/t/gaba?query#ref"}, 1470 }; 1471 1472 for (size_t i = 0; i < arraysize(replace_cases); i++) { 1473 const ReplaceCase& cur = replace_cases[i]; 1474 int base_len = static_cast<int>(strlen(cur.base)); 1475 Parsed parsed; 1476 ParseFileSystemURL(cur.base, base_len, &parsed); 1477 1478 Replacements<char> r; 1479 typedef Replacements<char> R; // Clean up syntax. 
1480 SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); 1481 SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); 1482 SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); 1483 SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); 1484 SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); 1485 SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); 1486 SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); 1487 SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); 1488 1489 std::string out_str; 1490 StdStringCanonOutput output(&out_str); 1491 Parsed out_parsed; 1492 ReplaceFileSystemURL(cur.base, parsed, r, NULL, &output, &out_parsed); 1493 output.Complete(); 1494 1495 EXPECT_EQ(replace_cases[i].expected, out_str); 1496 } 1497 } 1498 1499 TEST(URLCanonTest, ReplacePathURL) { 1500 ReplaceCase replace_cases[] = { 1501 // Replace everything 1502 {"data:foo", "javascript", NULL, NULL, NULL, NULL, "alert('foo?');", NULL, NULL, "javascript:alert('foo?');"}, 1503 // Replace nothing 1504 {"data:foo", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "data:foo"}, 1505 // Replace one or the other 1506 {"data:foo", "javascript", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "javascript:foo"}, 1507 {"data:foo", NULL, NULL, NULL, NULL, NULL, "bar", NULL, NULL, "data:bar"}, 1508 {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"}, 1509 }; 1510 1511 for (size_t i = 0; i < arraysize(replace_cases); i++) { 1512 const ReplaceCase& cur = replace_cases[i]; 1513 int base_len = static_cast<int>(strlen(cur.base)); 1514 Parsed parsed; 1515 ParsePathURL(cur.base, base_len, false, &parsed); 1516 1517 Replacements<char> r; 1518 typedef Replacements<char> R; // Clean up syntax. 
1519 SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); 1520 SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); 1521 SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); 1522 SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); 1523 SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); 1524 SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); 1525 SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); 1526 SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); 1527 1528 std::string out_str; 1529 StdStringCanonOutput output(&out_str); 1530 Parsed out_parsed; 1531 ReplacePathURL(cur.base, parsed, r, &output, &out_parsed); 1532 output.Complete(); 1533 1534 EXPECT_EQ(replace_cases[i].expected, out_str); 1535 } 1536 } 1537 1538 TEST(URLCanonTest, ReplaceMailtoURL) { 1539 ReplaceCase replace_cases[] = { 1540 // Replace everything 1541 {"mailto:jon (at) foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"}, 1542 // Replace nothing 1543 {"mailto:jon (at) foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon (at) foo.com?body=sup"}, 1544 // Replace the path 1545 {"mailto:jon (at) foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"}, 1546 // Replace the query 1547 {"mailto:jon (at) foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon (at) foo.com?custom=1"}, 1548 // Replace the path and query 1549 {"mailto:jon (at) foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"}, 1550 // Set the query to empty (should leave trailing question mark) 1551 {"mailto:jon (at) foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon (at) foo.com?"}, 1552 // Clear the query 1553 {"mailto:jon (at) foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon (at) foo.com"}, 1554 // Clear the path 1555 
{"mailto:jon (at) foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"}, 1556 // Clear the path + query 1557 {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", "|", NULL, "mailto:"}, 1558 // Setting the ref should have no effect 1559 {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"}, 1560 }; 1561 1562 for (size_t i = 0; i < arraysize(replace_cases); i++) { 1563 const ReplaceCase& cur = replace_cases[i]; 1564 int base_len = static_cast<int>(strlen(cur.base)); 1565 Parsed parsed; 1566 ParseMailtoURL(cur.base, base_len, &parsed); 1567 1568 Replacements<char> r; 1569 typedef Replacements<char> R; 1570 SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); 1571 SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); 1572 SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); 1573 SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); 1574 SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); 1575 SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); 1576 SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); 1577 SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); 1578 1579 std::string out_str; 1580 StdStringCanonOutput output(&out_str); 1581 Parsed out_parsed; 1582 ReplaceMailtoURL(cur.base, parsed, r, &output, &out_parsed); 1583 output.Complete(); 1584 1585 EXPECT_EQ(replace_cases[i].expected, out_str); 1586 } 1587 } 1588 1589 TEST(URLCanonTest, CanonicalizeFileURL) { 1590 struct URLCase { 1591 const char* input; 1592 const char* expected; 1593 bool expected_success; 1594 Component expected_host; 1595 Component expected_path; 1596 } cases[] = { 1597 #ifdef _WIN32 1598 // Windows-style paths 1599 {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(), Component(7, 16)}, 1600 {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, Component(), Component(7, 19)}, 1601 {"file:", "file:///", true, Component(), Component(7, 1)}, 1602 
{"file:UNChost/path", "file://unchost/path", true, Component(7, 7), Component(14, 5)}, 1603 // CanonicalizeFileURL supports absolute Windows style paths for IE 1604 // compatability. Note that the caller must decide that this is a file 1605 // URL itself so it can call the file canonicalizer. This is usually 1606 // done automatically as part of relative URL resolving. 1607 {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)}, 1608 {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)}, 1609 {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)}, 1610 {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)}, 1611 {"//server/file", "file://server/file", true, Component(7, 6), Component(13, 5)}, 1612 {"\\\\server\\file", "file://server/file", true, Component(7, 6), Component(13, 5)}, 1613 {"/\\server/file", "file://server/file", true, Component(7, 6), Component(13, 5)}, 1614 // We should preserve the number of slashes after the colon for IE 1615 // compatability, except when there is none, in which case we should 1616 // add one. 1617 {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(), Component(7, 16)}, 1618 {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, Component(), Component(7, 19)}, 1619 // Three slashes should be non-UNC, even if there is no drive spec (IE 1620 // does this, which makes the resulting request invalid). 1621 {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(), Component(7, 12)}, 1622 // TODO(brettw) we should probably fail for invalid host names, which 1623 // would change the expected result on this test. We also currently allow 1624 // colon even though it's probably invalid, because its currently the 1625 // "natural" result of the way the canonicalizer is written. 
There doesn't 1626 // seem to be a strong argument for why allowing it here would be bad, so 1627 // we just tolerate it and the load will fail later. 1628 {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, Component(7, 2), Component(9, 16)}, 1629 {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5), Component(12, 8)}, 1630 // Make sure relative paths can't go above the "C:" 1631 {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, Component(), Component(7, 12)}, 1632 // Busted refs shouldn't make the whole thing fail. 1633 {"file:///C:/asdf#\xc2", "file:///C:/asdf#\xef\xbf\xbd", true, Component(), Component(7, 8)}, 1634 #else 1635 // Unix-style paths 1636 {"file:///home/me", "file:///home/me", true, Component(), Component(7, 8)}, 1637 // Windowsy ones should get still treated as Unix-style. 1638 {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(), Component(7, 16)}, 1639 {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, Component(), Component(7, 19)}, 1640 // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html) 1641 {"//", "file:///", true, Component(), Component(7, 1)}, 1642 {"///", "file:///", true, Component(), Component(7, 1)}, 1643 {"///test", "file:///test", true, Component(), Component(7, 5)}, 1644 {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)}, 1645 {"file://localhost", "file://localhost/", true, Component(7, 9), Component(16, 1)}, 1646 {"file://localhost/", "file://localhost/", true, Component(7, 9), Component(16, 1)}, 1647 {"file://localhost/test", "file://localhost/test", true, Component(7, 9), Component(16, 5)}, 1648 #endif // _WIN32 1649 }; 1650 1651 for (size_t i = 0; i < ARRAYSIZE(cases); i++) { 1652 int url_len = static_cast<int>(strlen(cases[i].input)); 1653 Parsed parsed; 1654 ParseFileURL(cases[i].input, url_len, &parsed); 1655 1656 Parsed out_parsed; 1657 std::string out_str; 1658 StdStringCanonOutput output(&out_str); 
1659 bool success = CanonicalizeFileURL(cases[i].input, url_len, parsed, NULL, 1660 &output, &out_parsed); 1661 output.Complete(); 1662 1663 EXPECT_EQ(cases[i].expected_success, success); 1664 EXPECT_EQ(cases[i].expected, out_str); 1665 1666 // Make sure the spec was properly identified, the file canonicalizer has 1667 // different code for writing the spec. 1668 EXPECT_EQ(0, out_parsed.scheme.begin); 1669 EXPECT_EQ(4, out_parsed.scheme.len); 1670 1671 EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin); 1672 EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len); 1673 1674 EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); 1675 EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); 1676 } 1677 } 1678 1679 TEST(URLCanonTest, CanonicalizeFileSystemURL) { 1680 struct URLCase { 1681 const char* input; 1682 const char* expected; 1683 bool expected_success; 1684 } cases[] = { 1685 {"Filesystem:htTp://www.Foo.com:80/tempoRary", "filesystem:http://www.foo.com/tempoRary/", true}, 1686 {"filesystem:httpS://www.foo.com/temporary/", "filesystem:https://www.foo.com/temporary/", true}, 1687 {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//", false}, 1688 {"filesystem:http://www.foo.com/persistent/bob?query#ref", "filesystem:http://www.foo.com/persistent/bob?query#ref", true}, 1689 {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true}, 1690 {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true}, 1691 {"filesystem:File:///temporary/Bob?qUery#reF", "filesystem:file:///temporary/Bob?qUery#reF", true}, 1692 }; 1693 1694 for (size_t i = 0; i < ARRAYSIZE(cases); i++) { 1695 int url_len = static_cast<int>(strlen(cases[i].input)); 1696 Parsed parsed; 1697 ParseFileSystemURL(cases[i].input, url_len, &parsed); 1698 1699 Parsed out_parsed; 1700 std::string out_str; 1701 StdStringCanonOutput output(&out_str); 1702 bool success = CanonicalizeFileSystemURL(cases[i].input, url_len, parsed, 1703 NULL, 
&output, &out_parsed); 1704 output.Complete(); 1705 1706 EXPECT_EQ(cases[i].expected_success, success); 1707 EXPECT_EQ(cases[i].expected, out_str); 1708 1709 // Make sure the spec was properly identified, the filesystem canonicalizer 1710 // has different code for writing the spec. 1711 EXPECT_EQ(0, out_parsed.scheme.begin); 1712 EXPECT_EQ(10, out_parsed.scheme.len); 1713 if (success) 1714 EXPECT_GT(out_parsed.path.len, 0); 1715 } 1716 } 1717 1718 TEST(URLCanonTest, CanonicalizePathURL) { 1719 // Path URLs should get canonicalized schemes but nothing else. 1720 struct PathCase { 1721 const char* input; 1722 const char* expected; 1723 } path_cases[] = { 1724 {"javascript:", "javascript:"}, 1725 {"JavaScript:Foo", "javascript:Foo"}, 1726 {":\":This /is interesting;?#", ":\":This /is interesting;?#"}, 1727 }; 1728 1729 for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) { 1730 int url_len = static_cast<int>(strlen(path_cases[i].input)); 1731 Parsed parsed; 1732 ParsePathURL(path_cases[i].input, url_len, true, &parsed); 1733 1734 Parsed out_parsed; 1735 std::string out_str; 1736 StdStringCanonOutput output(&out_str); 1737 bool success = CanonicalizePathURL(path_cases[i].input, url_len, parsed, 1738 &output, &out_parsed); 1739 output.Complete(); 1740 1741 EXPECT_TRUE(success); 1742 EXPECT_EQ(path_cases[i].expected, out_str); 1743 1744 EXPECT_EQ(0, out_parsed.host.begin); 1745 EXPECT_EQ(-1, out_parsed.host.len); 1746 1747 // When we end with a colon at the end, there should be no path. 
1748 if (path_cases[i].input[url_len - 1] == ':') { 1749 EXPECT_EQ(0, out_parsed.GetContent().begin); 1750 EXPECT_EQ(-1, out_parsed.GetContent().len); 1751 } 1752 } 1753 } 1754 1755 TEST(URLCanonTest, CanonicalizeMailtoURL) { 1756 struct URLCase { 1757 const char* input; 1758 const char* expected; 1759 bool expected_success; 1760 Component expected_path; 1761 Component expected_query; 1762 } cases[] = { 1763 {"mailto:addr1", "mailto:addr1", true, Component(7, 5), Component()}, 1764 {"mailto:addr1 (at) foo.com", "mailto:addr1 (at) foo.com", true, Component(7, 13), Component()}, 1765 // Trailing whitespace is stripped. 1766 {"MaIlTo:addr1 \t ", "mailto:addr1", true, Component(7, 5), Component()}, 1767 {"MaIlTo:addr1?to=jon", "mailto:addr1?to=jon", true, Component(7, 5), Component(13,6)}, 1768 {"mailto:addr1,addr2", "mailto:addr1,addr2", true, Component(7, 11), Component()}, 1769 {"mailto:addr1, addr2", "mailto:addr1, addr2", true, Component(7, 12), Component()}, 1770 {"mailto:addr1%2caddr2", "mailto:addr1%2caddr2", true, Component(7, 13), Component()}, 1771 {"mailto:\xF0\x90\x8C\x80", "mailto:%F0%90%8C%80", true, Component(7, 12), Component()}, 1772 // Null character should be escaped to %00 1773 {"mailto:addr1\0addr2?foo", "mailto:addr1%00addr2?foo", true, Component(7, 13), Component(21, 3)}, 1774 // Invalid -- UTF-8 encoded surrogate value. 1775 {"mailto:\xed\xa0\x80", "mailto:%EF%BF%BD", false, Component(7, 9), Component()}, 1776 {"mailto:addr1?", "mailto:addr1?", true, Component(7, 5), Component(13, 0)}, 1777 }; 1778 1779 // Define outside of loop to catch bugs where components aren't reset 1780 Parsed parsed; 1781 Parsed out_parsed; 1782 1783 for (size_t i = 0; i < ARRAYSIZE(cases); i++) { 1784 int url_len = static_cast<int>(strlen(cases[i].input)); 1785 if (i == 8) { 1786 // The 9th test case purposely has a '\0' in it -- don't count it 1787 // as the string terminator. 
1788 url_len = 22; 1789 } 1790 ParseMailtoURL(cases[i].input, url_len, &parsed); 1791 1792 std::string out_str; 1793 StdStringCanonOutput output(&out_str); 1794 bool success = CanonicalizeMailtoURL(cases[i].input, url_len, parsed, 1795 &output, &out_parsed); 1796 output.Complete(); 1797 1798 EXPECT_EQ(cases[i].expected_success, success); 1799 EXPECT_EQ(cases[i].expected, out_str); 1800 1801 // Make sure the spec was properly identified 1802 EXPECT_EQ(0, out_parsed.scheme.begin); 1803 EXPECT_EQ(6, out_parsed.scheme.len); 1804 1805 EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); 1806 EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); 1807 1808 EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin); 1809 EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len); 1810 } 1811 } 1812 1813 #ifndef WIN32 1814 1815 TEST(URLCanonTest, _itoa_s) { 1816 // We fill the buffer with 0xff to ensure that it's getting properly 1817 // null-terminated. We also allocate one byte more than what we tell 1818 // _itoa_s about, and ensure that the extra byte is untouched. 
1819 char buf[6]; 1820 memset(buf, 0xff, sizeof(buf)); 1821 EXPECT_EQ(0, _itoa_s(12, buf, sizeof(buf) - 1, 10)); 1822 EXPECT_STREQ("12", buf); 1823 EXPECT_EQ('\xFF', buf[3]); 1824 1825 // Test the edge cases - exactly the buffer size and one over 1826 memset(buf, 0xff, sizeof(buf)); 1827 EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 10)); 1828 EXPECT_STREQ("1234", buf); 1829 EXPECT_EQ('\xFF', buf[5]); 1830 1831 memset(buf, 0xff, sizeof(buf)); 1832 EXPECT_EQ(EINVAL, _itoa_s(12345, buf, sizeof(buf) - 1, 10)); 1833 EXPECT_EQ('\xFF', buf[5]); // should never write to this location 1834 1835 // Test the template overload (note that this will see the full buffer) 1836 memset(buf, 0xff, sizeof(buf)); 1837 EXPECT_EQ(0, _itoa_s(12, buf, 10)); 1838 EXPECT_STREQ("12", buf); 1839 EXPECT_EQ('\xFF', buf[3]); 1840 1841 memset(buf, 0xff, sizeof(buf)); 1842 EXPECT_EQ(0, _itoa_s(12345, buf, 10)); 1843 EXPECT_STREQ("12345", buf); 1844 1845 EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10)); 1846 1847 // Test that radix 16 is supported. 1848 memset(buf, 0xff, sizeof(buf)); 1849 EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 16)); 1850 EXPECT_STREQ("4d2", buf); 1851 EXPECT_EQ('\xFF', buf[5]); 1852 } 1853 1854 TEST(URLCanonTest, _itow_s) { 1855 // We fill the buffer with 0xff to ensure that it's getting properly 1856 // null-terminated. We also allocate one byte more than what we tell 1857 // _itoa_s about, and ensure that the extra byte is untouched. 
1858 base::char16 buf[6]; 1859 const char fill_mem = 0xff; 1860 const base::char16 fill_char = 0xffff; 1861 memset(buf, fill_mem, sizeof(buf)); 1862 EXPECT_EQ(0, _itow_s(12, buf, sizeof(buf) / 2 - 1, 10)); 1863 EXPECT_EQ(WStringToUTF16(L"12"), base::string16(buf)); 1864 EXPECT_EQ(fill_char, buf[3]); 1865 1866 // Test the edge cases - exactly the buffer size and one over 1867 EXPECT_EQ(0, _itow_s(1234, buf, sizeof(buf) / 2 - 1, 10)); 1868 EXPECT_EQ(WStringToUTF16(L"1234"), base::string16(buf)); 1869 EXPECT_EQ(fill_char, buf[5]); 1870 1871 memset(buf, fill_mem, sizeof(buf)); 1872 EXPECT_EQ(EINVAL, _itow_s(12345, buf, sizeof(buf) / 2 - 1, 10)); 1873 EXPECT_EQ(fill_char, buf[5]); // should never write to this location 1874 1875 // Test the template overload (note that this will see the full buffer) 1876 memset(buf, fill_mem, sizeof(buf)); 1877 EXPECT_EQ(0, _itow_s(12, buf, 10)); 1878 EXPECT_EQ(WStringToUTF16(L"12"), base::string16(buf)); 1879 EXPECT_EQ(fill_char, buf[3]); 1880 1881 memset(buf, fill_mem, sizeof(buf)); 1882 EXPECT_EQ(0, _itow_s(12345, buf, 10)); 1883 EXPECT_EQ(WStringToUTF16(L"12345"), base::string16(buf)); 1884 1885 EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10)); 1886 } 1887 1888 #endif // !WIN32 1889 1890 // Returns true if the given two structures are the same. 
1891 static bool ParsedIsEqual(const Parsed& a, const Parsed& b) { 1892 return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len && 1893 a.username.begin == b.username.begin && a.username.len == b.username.len && 1894 a.password.begin == b.password.begin && a.password.len == b.password.len && 1895 a.host.begin == b.host.begin && a.host.len == b.host.len && 1896 a.port.begin == b.port.begin && a.port.len == b.port.len && 1897 a.path.begin == b.path.begin && a.path.len == b.path.len && 1898 a.query.begin == b.query.begin && a.query.len == b.query.len && 1899 a.ref.begin == b.ref.begin && a.ref.len == b.ref.len; 1900 } 1901 1902 TEST(URLCanonTest, ResolveRelativeURL) { 1903 struct RelativeCase { 1904 const char* base; // Input base URL: MUST BE CANONICAL 1905 bool is_base_hier; // Is the base URL hierarchical 1906 bool is_base_file; // Tells us if the base is a file URL. 1907 const char* test; // Input URL to test against. 1908 bool succeed_relative; // Whether we expect IsRelativeURL to succeed 1909 bool is_rel; // Whether we expect |test| to be relative or not. 1910 bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed. 1911 const char* resolved; // What we expect in the result when resolving. 1912 } rel_cases[] = { 1913 // Basic absolute input. 1914 {"http://host/a", true, false, "http://another/", true, false, false, NULL}, 1915 {"http://host/a", true, false, "http:////another/", true, false, false, NULL}, 1916 // Empty relative URLs should only remove the ref part of the URL, 1917 // leaving the rest unchanged. 1918 {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"}, 1919 {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"}, 1920 {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"}, 1921 // Spaces at the ends of the relative path should be ignored. 
1922 {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"}, 1923 {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"}, 1924 {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"}, 1925 // Matching schemes without two slashes are treated as relative. 1926 {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"}, 1927 {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"}, 1928 {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"}, 1929 {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"}, 1930 // Nonmatching schemes are absolute. 1931 {"http://host/a", true, false, "https:host2", true, false, false, NULL}, 1932 {"http://host/a", true, false, "htto:/host2", true, false, false, NULL}, 1933 // Absolute path input 1934 {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"}, 1935 {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"}, 1936 {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"}, 1937 {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"}, 1938 {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"}, 1939 {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"}, 1940 // Relative path input 1941 {"http://host/a", true, false, "b", true, true, true, "http://host/b"}, 1942 {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"}, 1943 {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"}, 1944 {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"}, 1945 {"http://host/a/", true, false, "..", true, true, true, "http://host/"}, 1946 {"http://host/a/", true, false, "./..", true, true, true, "http://host/"}, 1947 {"http://host/a/", 
true, false, "../.", true, true, true, "http://host/"}, 1948 {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"}, 1949 {"http://host/a?query#ref", true, false, "../../../foo", true, true, true, "http://host/foo"}, 1950 // Query input 1951 {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"}, 1952 {"http://host/a?x=y#z", true, false, "?", true, true, true, "http://host/a?"}, 1953 {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"}, 1954 // Ref input 1955 {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"}, 1956 {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"}, 1957 {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"}, 1958 // Non-hierarchical base: no relative handling. Relative input should 1959 // error, and if a scheme is present, it should be treated as absolute. 1960 {"data:foobar", false, false, "baz.html", false, false, false, NULL}, 1961 {"data:foobar", false, false, "data:baz", true, false, false, NULL}, 1962 {"data:foobar", false, false, "data:/base", true, false, false, NULL}, 1963 // Non-hierarchical base: absolute input should succeed. 1964 {"data:foobar", false, false, "http://host/", true, false, false, NULL}, 1965 {"data:foobar", false, false, "http:host", true, false, false, NULL}, 1966 // Invalid schemes should be treated as relative. 
1967 {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"}, 1968 {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"}, 1969 {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"}, 1970 {"data:asdf", false, false, ":foo", false, false, false, NULL}, 1971 {"data:asdf", false, false, "bad(':foo')", false, false, false, NULL}, 1972 // We should treat semicolons like any other character in URL resolving 1973 {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"}, 1974 {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"}, 1975 {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"}, 1976 // Relative URLs can also be written as "//foo/bar" which is relative to 1977 // the scheme. In this case, it would take the old scheme, so for http 1978 // the example would resolve to "http://foo/bar". 1979 {"http://host/a", true, false, "//another", true, true, true, "http://another/"}, 1980 {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"}, 1981 {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"}, 1982 {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"}, 1983 {"http://host/a", true, false, "//", true, true, false, "http:"}, 1984 // IE will also allow one or the other to be a backslash to get the same 1985 // behavior. 1986 {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"}, 1987 {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"}, 1988 #ifdef WIN32 1989 // Resolving against Windows file base URLs. 
1990 {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL}, 1991 {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"}, 1992 {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"}, 1993 {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"}, 1994 // But two backslashes on Windows should be UNC so should be treated 1995 // as absolute. 1996 {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL}, 1997 // IE doesn't support drive specs starting with two slashes. It fails 1998 // immediately and doesn't even try to load. We fix it up to either 1999 // an absolute path or UNC depending on what it looks like. 2000 {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"}, 2001 {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"}, 2002 // Windows drive specs should be allowed and treated as absolute. 2003 {"file:///C:/foo", true, true, "c:", true, false, false, NULL}, 2004 {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL}, 2005 {"http://host/a", true, false, "c:\\foo", true, false, false, NULL}, 2006 // Relative paths with drive letters should be allowed when the base is 2007 // also a file. 2008 {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"}, 2009 // Treat absolute paths as being off of the drive. 2010 {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"}, 2011 {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"}, 2012 {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"}, 2013 // On Windows, two slashes without a drive letter when the base is a file 2014 // means that the path is UNC. 
2015 {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"}, 2016 {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"}, 2017 #else 2018 // On Unix we fall back to relative behavior since there's nothing else 2019 // reasonable to do. 2020 {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"}, 2021 #endif 2022 // Even on Windows, we don't allow relative drive specs when the base 2023 // is not file. 2024 {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"}, 2025 {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"}, 2026 // Ensure that ports aren't allowed for hosts relative to a file url. 2027 // Although the result string shows a host:port portion, the call to 2028 // resolve the relative URL returns false, indicating parse failure, 2029 // which is what is required. 2030 {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"}, 2031 // Filesystem URL tests; filesystem URLs are only valid and relative if 2032 // they have no scheme, e.g. "./index.html". There's no valid equivalent 2033 // to http:index.html. 
2034 {"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL}, 2035 {"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL}, 2036 {"filesystem:http://host/t/path", true, false, "http://host/t/path2", true, false, false, NULL}, 2037 {"http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL}, 2038 {"filesystem:http://host/t/path", true, false, "./path2", true, true, true, "filesystem:http://host/t/path2"}, 2039 {"filesystem:http://host/t/path/", true, false, "path2", true, true, true, "filesystem:http://host/t/path/path2"}, 2040 {"filesystem:http://host/t/path", true, false, "filesystem:http:path2", true, false, false, NULL}, 2041 // Absolute URLs are still not relative to a non-standard base URL. 2042 {"about:blank", false, false, "http://X/A", true, false, true, ""}, 2043 {"about:blank", false, false, "content://content.Provider/", true, false, true, ""}, 2044 }; 2045 2046 for (size_t i = 0; i < ARRAYSIZE(rel_cases); i++) { 2047 const RelativeCase& cur_case = rel_cases[i]; 2048 2049 Parsed parsed; 2050 int base_len = static_cast<int>(strlen(cur_case.base)); 2051 if (cur_case.is_base_file) 2052 ParseFileURL(cur_case.base, base_len, &parsed); 2053 else if (cur_case.is_base_hier) 2054 ParseStandardURL(cur_case.base, base_len, &parsed); 2055 else 2056 ParsePathURL(cur_case.base, base_len, false, &parsed); 2057 2058 // First see if it is relative. 
2059 int test_len = static_cast<int>(strlen(cur_case.test)); 2060 bool is_relative; 2061 Component relative_component; 2062 bool succeed_is_rel = IsRelativeURL( 2063 cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier, 2064 &is_relative, &relative_component); 2065 2066 EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) << 2067 "succeed is rel failure on " << cur_case.test; 2068 EXPECT_EQ(cur_case.is_rel, is_relative) << 2069 "is rel failure on " << cur_case.test; 2070 // Now resolve it. 2071 if (succeed_is_rel && is_relative && cur_case.is_rel) { 2072 std::string resolved; 2073 StdStringCanonOutput output(&resolved); 2074 Parsed resolved_parsed; 2075 2076 bool succeed_resolve = ResolveRelativeURL( 2077 cur_case.base, parsed, cur_case.is_base_file, cur_case.test, 2078 relative_component, NULL, &output, &resolved_parsed); 2079 output.Complete(); 2080 2081 EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve); 2082 EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test; 2083 2084 // Verify that the output parsed structure is the same as parsing a 2085 // the URL freshly. 2086 Parsed ref_parsed; 2087 int resolved_len = static_cast<int>(resolved.size()); 2088 if (cur_case.is_base_file) { 2089 ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed); 2090 } else if (cur_case.is_base_hier) { 2091 ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed); 2092 } else { 2093 ParsePathURL(resolved.c_str(), resolved_len, false, &ref_parsed); 2094 } 2095 EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed)); 2096 } 2097 } 2098 } 2099 2100 // It used to be when we did a replacement with a long buffer of UTF-16 2101 // characters, we would get invalid data in the URL. This is because the buffer 2102 // it used to hold the UTF-8 data was resized, while some pointers were still 2103 // kept to the old buffer that was removed. 
2104 TEST(URLCanonTest, ReplacementOverflow) { 2105 const char src[] = "file:///C:/foo/bar"; 2106 int src_len = static_cast<int>(strlen(src)); 2107 Parsed parsed; 2108 ParseFileURL(src, src_len, &parsed); 2109 2110 // Override two components, the path with something short, and the query with 2111 // sonething long enough to trigger the bug. 2112 Replacements<base::char16> repl; 2113 base::string16 new_query; 2114 for (int i = 0; i < 4800; i++) 2115 new_query.push_back('a'); 2116 2117 base::string16 new_path(WStringToUTF16(L"/foo")); 2118 repl.SetPath(new_path.c_str(), Component(0, 4)); 2119 repl.SetQuery(new_query.c_str(), 2120 Component(0, static_cast<int>(new_query.length()))); 2121 2122 // Call ReplaceComponents on the string. It doesn't matter if we call it for 2123 // standard URLs, file URLs, etc, since they will go to the same replacement 2124 // function that was buggy. 2125 Parsed repl_parsed; 2126 std::string repl_str; 2127 StdStringCanonOutput repl_output(&repl_str); 2128 ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed); 2129 repl_output.Complete(); 2130 2131 // Generate the expected string and check. 2132 std::string expected("file:///foo?"); 2133 for (size_t i = 0; i < new_query.length(); i++) 2134 expected.push_back('a'); 2135 EXPECT_TRUE(expected == repl_str); 2136 } 2137 2138 } // namespace url 2139