1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "net/base/net_util.h" 6 7 #include <algorithm> 8 #include <map> 9 #include <unicode/ucnv.h> 10 #include <unicode/uidna.h> 11 #include <unicode/ulocdata.h> 12 #include <unicode/uniset.h> 13 #include <unicode/uscript.h> 14 #include <unicode/uset.h> 15 16 #include "build/build_config.h" 17 18 #if defined(OS_WIN) 19 #include <windows.h> 20 #include <winsock2.h> 21 #include <ws2tcpip.h> 22 #include <wspiapi.h> // Needed for Win2k compat. 23 #elif defined(OS_POSIX) 24 #include <netdb.h> 25 #include <sys/socket.h> 26 #include <fcntl.h> 27 #endif 28 29 #include "base/base64.h" 30 #include "base/basictypes.h" 31 #include "base/file_path.h" 32 #include "base/file_util.h" 33 #include "base/i18n/file_util_icu.h" 34 #include "base/i18n/icu_string_conversions.h" 35 #include "base/i18n/time_formatting.h" 36 #include "base/json/string_escape.h" 37 #include "base/lock.h" 38 #include "base/logging.h" 39 #include "base/message_loop.h" 40 #include "base/path_service.h" 41 #include "base/singleton.h" 42 #include "base/stl_util-inl.h" 43 #include "base/string_piece.h" 44 #include "base/string_tokenizer.h" 45 #include "base/string_util.h" 46 #include "base/sys_string_conversions.h" 47 #include "base/time.h" 48 #include "base/utf_offset_string_conversions.h" 49 #include "grit/net_resources.h" 50 #include "googleurl/src/gurl.h" 51 #include "googleurl/src/url_canon.h" 52 #include "googleurl/src/url_parse.h" 53 #include "net/base/escape.h" 54 #include "net/base/net_module.h" 55 #if defined(OS_WIN) 56 #include "net/base/winsock_init.h" 57 #endif 58 #include "unicode/datefmt.h" 59 60 61 using base::Time; 62 63 namespace { 64 65 // what we prepend to get a file URL 66 static const FilePath::CharType kFileURLPrefix[] = 67 FILE_PATH_LITERAL("file:///"); 68 69 // The general list of blocked ports. Will be blocked unless a specific 70 // protocol overrides it. (Ex: ftp can use ports 20 and 21) 71 static const int kRestrictedPorts[] = { 72 1, // tcpmux 73 7, // echo 74 9, // discard 75 11, // systat 76 13, // daytime 77 15, // netstat 78 17, // qotd 79 19, // chargen 80 20, // ftp data 81 21, // ftp access 82 22, // ssh 83 23, // telnet 84 25, // smtp 85 37, // time 86 42, // name 87 43, // nicname 88 53, // domain 89 77, // priv-rjs 90 79, // finger 91 87, // ttylink 92 95, // supdup 93 101, // hostriame 94 102, // iso-tsap 95 103, // gppitnp 96 104, // acr-nema 97 109, // pop2 98 110, // pop3 99 111, // sunrpc 100 113, // auth 101 115, // sftp 102 117, // uucp-path 103 119, // nntp 104 123, // NTP 105 135, // loc-srv /epmap 106 139, // netbios 107 143, // imap2 108 179, // BGP 109 389, // ldap 110 465, // smtp+ssl 111 512, // print / exec 112 513, // login 113 514, // shell 114 515, // printer 115 526, // tempo 116 530, // courier 117 531, // chat 118 532, // netnews 119 540, // uucp 120 556, // remotefs 121 563, // nntp+ssl 122 587, // stmp? 123 601, // ?? 124 636, // ldap+ssl 125 993, // ldap+ssl 126 995, // pop3+ssl 127 2049, // nfs 128 3659, // apple-sasl / PasswordServer 129 4045, // lockd 130 6000, // X11 131 }; 132 133 // FTP overrides the following restricted ports. 134 static const int kAllowedFtpPorts[] = { 135 21, // ftp data 136 22, // ssh 137 }; 138 139 template<typename STR> 140 STR GetSpecificHeaderT(const STR& headers, const STR& name) { 141 // We want to grab the Value from the "Key: Value" pairs in the headers, 142 // which should look like this (no leading spaces, \n-separated) (we format 143 // them this way in url_request_inet.cc): 144 // HTTP/1.1 200 OK\n 145 // ETag: "6d0b8-947-24f35ec0"\n 146 // Content-Length: 2375\n 147 // Content-Type: text/html; charset=UTF-8\n 148 // Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n 149 if (headers.empty()) 150 return STR(); 151 152 STR match; 153 match.push_back('\n'); 154 match.append(name); 155 match.push_back(':'); 156 157 typename STR::const_iterator begin = 158 search(headers.begin(), headers.end(), match.begin(), match.end(), 159 CaseInsensitiveCompareASCII<typename STR::value_type>()); 160 161 if (begin == headers.end()) 162 return STR(); 163 164 begin += match.length(); 165 166 typename STR::const_iterator end = find(begin, headers.end(), '\n'); 167 168 STR ret; 169 TrimWhitespace(STR(begin, end), TRIM_ALL, &ret); 170 return ret; 171 } 172 173 // TODO(jungshik): We have almost identical hex-decoding code else where. 174 // Consider refactoring and moving it somewhere(base?). Bug 1224311 175 inline bool IsHexDigit(unsigned char c) { 176 return (('0' <= c && c <= '9') || ('A' <= c && c <= 'F') || 177 ('a' <= c && c <= 'f')); 178 } 179 180 inline unsigned char HexToInt(unsigned char c) { 181 DCHECK(IsHexDigit(c)); 182 static unsigned char kOffset[4] = {0, 0x30u, 0x37u, 0x57u}; 183 return c - kOffset[(c >> 5) & 3]; 184 } 185 186 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence 187 // of bytes. If input is invalid, return false. 188 bool QPDecode(const std::string& input, std::string* output) { 189 std::string temp; 190 temp.reserve(input.size()); 191 std::string::const_iterator it = input.begin(); 192 while (it != input.end()) { 193 if (*it == '_') { 194 temp.push_back(' '); 195 } else if (*it == '=') { 196 if (input.end() - it < 3) { 197 return false; 198 } 199 if (IsHexDigit(static_cast<unsigned char>(*(it + 1))) && 200 IsHexDigit(static_cast<unsigned char>(*(it + 2)))) { 201 unsigned char ch = HexToInt(*(it + 1)) * 16 + HexToInt(*(it + 2)); 202 temp.push_back(static_cast<char>(ch)); 203 ++it; 204 ++it; 205 } else { 206 return false; 207 } 208 } else if (0x20 < *it && *it < 0x7F) { 209 // In a Q-encoded word, only printable ASCII characters 210 // represent themselves. Besides, space, '=', '_' and '?' are 211 // not allowed, but they're already filtered out. 212 DCHECK(*it != 0x3D && *it != 0x5F && *it != 0x3F); 213 temp.push_back(*it); 214 } else { 215 return false; 216 } 217 ++it; 218 } 219 output->swap(temp); 220 return true; 221 } 222 223 enum RFC2047EncodingType {Q_ENCODING, B_ENCODING}; 224 bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type, 225 const std::string& charset, std::string* output) { 226 std::string decoded; 227 if (enc_type == B_ENCODING) { 228 if (!base::Base64Decode(part, &decoded)) { 229 return false; 230 } 231 } else { 232 if (!QPDecode(part, &decoded)) { 233 return false; 234 } 235 } 236 237 UErrorCode err = U_ZERO_ERROR; 238 UConverter* converter(ucnv_open(charset.c_str(), &err)); 239 if (U_FAILURE(err)) { 240 return false; 241 } 242 243 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. 244 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes 245 // in UTF-8. Therefore, the expansion ratio is 3 at most. 246 int length = static_cast<int>(decoded.length()); 247 char* buf = WriteInto(output, length * 3); 248 length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, length * 3, 249 decoded.data(), length, &err); 250 ucnv_close(converter); 251 if (U_FAILURE(err)) { 252 return false; 253 } 254 output->resize(length); 255 return true; 256 } 257 258 bool DecodeWord(const std::string& encoded_word, 259 const std::string& referrer_charset, 260 bool *is_rfc2047, 261 std::string* output) { 262 if (!IsStringASCII(encoded_word)) { 263 // Try UTF-8, referrer_charset and the native OS default charset in turn. 264 if (IsStringUTF8(encoded_word)) { 265 *output = encoded_word; 266 } else { 267 std::wstring wide_output; 268 if (!referrer_charset.empty() && 269 base::CodepageToWide(encoded_word, referrer_charset.c_str(), 270 base::OnStringConversionError::FAIL, 271 &wide_output)) { 272 *output = WideToUTF8(wide_output); 273 } else { 274 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); 275 } 276 } 277 *is_rfc2047 = false; 278 return true; 279 } 280 281 // RFC 2047 : one of encoding methods supported by Firefox and relatively 282 // widely used by web servers. 283 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. 284 // We don't care about the length restriction (72 bytes) because 285 // many web servers generate encoded words longer than the limit. 286 std::string tmp; 287 *is_rfc2047 = true; 288 int part_index = 0; 289 std::string charset; 290 StringTokenizer t(encoded_word, "?"); 291 RFC2047EncodingType enc_type = Q_ENCODING; 292 while (*is_rfc2047 && t.GetNext()) { 293 std::string part = t.token(); 294 switch (part_index) { 295 case 0: 296 if (part != "=") { 297 *is_rfc2047 = false; 298 break; 299 } 300 ++part_index; 301 break; 302 case 1: 303 // Do we need charset validity check here? 304 charset = part; 305 ++part_index; 306 break; 307 case 2: 308 if (part.size() > 1 || 309 part.find_first_of("bBqQ") == std::string::npos) { 310 *is_rfc2047 = false; 311 break; 312 } 313 if (part[0] == 'b' || part[0] == 'B') { 314 enc_type = B_ENCODING; 315 } 316 ++part_index; 317 break; 318 case 3: 319 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); 320 if (!*is_rfc2047) { 321 // Last minute failure. Invalid B/Q encoding. Rather than 322 // passing it through, return now. 323 return false; 324 } 325 ++part_index; 326 break; 327 case 4: 328 if (part != "=") { 329 // Another last minute failure ! 330 // Likely to be a case of two encoded-words in a row or 331 // an encoded word followed by a non-encoded word. We can be 332 // generous, but it does not help much in terms of compatibility, 333 // I believe. Return immediately. 334 *is_rfc2047 = false; 335 return false; 336 } 337 ++part_index; 338 break; 339 default: 340 *is_rfc2047 = false; 341 return false; 342 } 343 } 344 345 if (*is_rfc2047) { 346 if (*(encoded_word.end() - 1) == '=') { 347 output->swap(tmp); 348 return true; 349 } 350 // encoded_word ending prematurelly with '?' or extra '?' 351 *is_rfc2047 = false; 352 return false; 353 } 354 355 // We're not handling 'especial' characters quoted with '\', but 356 // it should be Ok because we're not an email client but a 357 // web browser. 358 359 // What IE6/7 does: %-escaped UTF-8. We could extend this to 360 // support a rudimentary form of RFC 2231 with charset label, but 361 // it'd gain us little in terms of compatibility. 362 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); 363 if (IsStringUTF8(tmp)) { 364 output->swap(tmp); 365 return true; 366 // We can try either the OS default charset or 'origin charset' here, 367 // As far as I can tell, IE does not support it. However, I've seen 368 // web servers emit %-escaped string in a legacy encoding (usually 369 // origin charset). 370 // TODO(jungshik) : Test IE further and consider adding a fallback here. 371 } 372 return false; 373 } 374 375 bool DecodeParamValue(const std::string& input, 376 const std::string& referrer_charset, 377 std::string* output) { 378 std::string tmp; 379 // Tokenize with whitespace characters. 380 StringTokenizer t(input, " \t\n\r"); 381 t.set_options(StringTokenizer::RETURN_DELIMS); 382 bool is_previous_token_rfc2047 = true; 383 while (t.GetNext()) { 384 if (t.token_is_delim()) { 385 // If the previous non-delimeter token is not RFC2047-encoded, 386 // put in a space in its place. Otheriwse, skip over it. 387 if (!is_previous_token_rfc2047) { 388 tmp.push_back(' '); 389 } 390 continue; 391 } 392 // We don't support a single multibyte character split into 393 // adjacent encoded words. Some broken mail clients emit headers 394 // with that problem, but most web servers usually encode a filename 395 // in a single encoded-word. Firefox/Thunderbird do not support 396 // it, either. 397 std::string decoded; 398 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, 399 &decoded)) 400 return false; 401 tmp.append(decoded); 402 } 403 output->swap(tmp); 404 return true; 405 } 406 407 // TODO(mpcomplete): This is a quick and dirty implementation for now. I'm 408 // sure this doesn't properly handle all (most?) cases. 409 template<typename STR> 410 STR GetHeaderParamValueT(const STR& header, const STR& param_name) { 411 // This assumes args are formatted exactly like "bla; arg1=value; arg2=value". 412 typename STR::const_iterator param_begin = 413 search(header.begin(), header.end(), param_name.begin(), param_name.end(), 414 CaseInsensitiveCompareASCII<typename STR::value_type>()); 415 416 if (param_begin == header.end()) 417 return STR(); 418 param_begin += param_name.length(); 419 420 STR whitespace; 421 whitespace.push_back(' '); 422 whitespace.push_back('\t'); 423 const typename STR::size_type equals_offset = 424 header.find_first_not_of(whitespace, param_begin - header.begin()); 425 if (equals_offset == STR::npos || header.at(equals_offset) != '=') 426 return STR(); 427 428 param_begin = header.begin() + equals_offset + 1; 429 if (param_begin == header.end()) 430 return STR(); 431 432 typename STR::const_iterator param_end; 433 if (*param_begin == '"') { 434 param_end = find(param_begin+1, header.end(), '"'); 435 if (param_end == header.end()) 436 return STR(); // poorly formatted param? 437 438 ++param_begin; // skip past the quote. 439 } else { 440 param_end = find(param_begin+1, header.end(), ';'); 441 } 442 443 return STR(param_begin, param_end); 444 } 445 446 // Does some simple normalization of scripts so we can allow certain scripts 447 // to exist together. 448 // TODO(brettw) bug 880223: we should allow some other languages to be 449 // oombined such as Chinese and Latin. We will probably need a more 450 // complicated system of language pairs to have more fine-grained control. 451 UScriptCode NormalizeScript(UScriptCode code) { 452 switch (code) { 453 case USCRIPT_KATAKANA: 454 case USCRIPT_HIRAGANA: 455 case USCRIPT_KATAKANA_OR_HIRAGANA: 456 case USCRIPT_HANGUL: // This one is arguable. 457 return USCRIPT_HAN; 458 default: 459 return code; 460 } 461 } 462 463 bool IsIDNComponentInSingleScript(const char16* str, int str_len) { 464 UScriptCode first_script = USCRIPT_INVALID_CODE; 465 bool is_first = true; 466 467 int i = 0; 468 while (i < str_len) { 469 unsigned code_point; 470 U16_NEXT(str, i, str_len, code_point); 471 472 UErrorCode err = U_ZERO_ERROR; 473 UScriptCode cur_script = uscript_getScript(code_point, &err); 474 if (err != U_ZERO_ERROR) 475 return false; // Report mixed on error. 476 cur_script = NormalizeScript(cur_script); 477 478 // TODO(brettw) We may have to check for USCRIPT_INHERENT as well. 479 if (is_first && cur_script != USCRIPT_COMMON) { 480 first_script = cur_script; 481 is_first = false; 482 } else { 483 if (cur_script != USCRIPT_COMMON && cur_script != first_script) 484 return false; 485 } 486 } 487 return true; 488 } 489 490 // Check if the script of a language can be 'safely' mixed with 491 // Latin letters in the ASCII range. 492 bool IsCompatibleWithASCIILetters(const std::string& lang) { 493 // For now, just list Chinese, Japanese and Korean (positive list). 494 // An alternative is negative-listing (languages using Greek and 495 // Cyrillic letters), but it can be more dangerous. 496 return !lang.substr(0, 2).compare("zh") || 497 !lang.substr(0, 2).compare("ja") || 498 !lang.substr(0, 2).compare("ko"); 499 } 500 501 typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap; 502 503 class LangToExemplarSet { 504 private: 505 LangToExemplarSetMap map; 506 LangToExemplarSet() { } 507 ~LangToExemplarSet() { 508 STLDeleteContainerPairSecondPointers(map.begin(), map.end()); 509 } 510 511 friend class Singleton<LangToExemplarSet>; 512 friend struct DefaultSingletonTraits<LangToExemplarSet>; 513 friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**); 514 friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*); 515 516 DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet); 517 }; 518 519 bool GetExemplarSetForLang(const std::string& lang, 520 icu::UnicodeSet** lang_set) { 521 const LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map; 522 LangToExemplarSetMap::const_iterator pos = map.find(lang); 523 if (pos != map.end()) { 524 *lang_set = pos->second; 525 return true; 526 } 527 return false; 528 } 529 530 void SetExemplarSetForLang(const std::string& lang, 531 icu::UnicodeSet* lang_set) { 532 LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map; 533 map.insert(std::make_pair(lang, lang_set)); 534 } 535 536 static Lock lang_set_lock; 537 538 // Returns true if all the characters in component_characters are used by 539 // the language |lang|. 540 bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters, 541 const std::string& lang) { 542 static const icu::UnicodeSet kASCIILetters(0x61, 0x7a); // [a-z] 543 icu::UnicodeSet* lang_set; 544 // We're called from both the UI thread and the history thread. 545 { 546 AutoLock lock(lang_set_lock); 547 if (!GetExemplarSetForLang(lang, &lang_set)) { 548 UErrorCode status = U_ZERO_ERROR; 549 ULocaleData* uld = ulocdata_open(lang.c_str(), &status); 550 // TODO(jungshik) Turn this check on when the ICU data file is 551 // rebuilt with the minimal subset of locale data for languages 552 // to which Chrome is not localized but which we offer in the list 553 // of languages selectable for Accept-Languages. With the rebuilt ICU 554 // data, ulocdata_open never should fall back to the default locale. 555 // (issue 2078) 556 // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING); 557 if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) { 558 lang_set = reinterpret_cast<icu::UnicodeSet *>( 559 ulocdata_getExemplarSet(uld, NULL, 0, 560 ULOCDATA_ES_STANDARD, &status)); 561 // If |lang| is compatible with ASCII Latin letters, add them. 562 if (IsCompatibleWithASCIILetters(lang)) 563 lang_set->addAll(kASCIILetters); 564 } else { 565 lang_set = new icu::UnicodeSet(1, 0); 566 } 567 lang_set->freeze(); 568 SetExemplarSetForLang(lang, lang_set); 569 ulocdata_close(uld); 570 } 571 } 572 return !lang_set->isEmpty() && lang_set->containsAll(component_characters); 573 } 574 575 // Returns true if the given Unicode host component is safe to display to the 576 // user. 577 bool IsIDNComponentSafe(const char16* str, 578 int str_len, 579 const std::wstring& languages) { 580 // Most common cases (non-IDN) do not reach here so that we don't 581 // need a fast return path. 582 // TODO(jungshik) : Check if there's any character inappropriate 583 // (although allowed) for domain names. 584 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and 585 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt 586 // For now, we borrow the list from Mozilla and tweaked it slightly. 587 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because 588 // they're gonna be canonicalized to U+0020 and full stop before 589 // reaching here.) 590 // The original list is available at 591 // http://kb.mozillazine.org/Network.IDN.blacklist_chars and 592 // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703 593 594 UErrorCode status = U_ZERO_ERROR; 595 #ifdef U_WCHAR_IS_UTF16 596 icu::UnicodeSet dangerous_characters(icu::UnicodeString( 597 L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338" 598 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]" 599 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]" 600 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae" 601 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014" 602 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" 603 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" 604 L"[\ufffa-\ufffd]]"), status); 605 #else 606 icu::UnicodeSet dangerous_characters(icu::UnicodeString( 607 "[[\\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338" 608 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]" 609 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]" 610 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae" 611 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014" 612 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" 613 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" 614 "[\\ufffa-\\ufffd]]", -1, US_INV), status); 615 #endif 616 DCHECK(U_SUCCESS(status)); 617 icu::UnicodeSet component_characters; 618 component_characters.addAll(icu::UnicodeString(str, str_len)); 619 if (dangerous_characters.containsSome(component_characters)) 620 return false; 621 622 // If the language list is empty, the result is completely determined 623 // by whether a component is a single script or not. This will block 624 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are 625 // allowed with |languages| (while it blocks Chinese + Latin letters with 626 // an accent as should be the case), but we want to err on the safe side 627 // when |languages| is empty. 628 if (languages.empty()) 629 return IsIDNComponentInSingleScript(str, str_len); 630 631 // |common_characters| is made up of ASCII numbers, hyphen, plus and 632 // underscore that are used across scripts and allowed in domain names. 633 // (sync'd with characters allowed in url_canon_host with square 634 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. 635 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), 636 status); 637 DCHECK(U_SUCCESS(status)); 638 // Subtract common characters because they're always allowed so that 639 // we just have to check if a language-specific set contains 640 // the remainder. 641 component_characters.removeAll(common_characters); 642 643 std::string languages_list(WideToASCII(languages)); 644 StringTokenizer t(languages_list, ","); 645 while (t.GetNext()) { 646 if (IsComponentCoveredByLang(component_characters, t.token())) 647 return true; 648 } 649 return false; 650 } 651 652 // Converts one component of a host (between dots) to IDN if safe. The result 653 // will be APPENDED to the given output string and will be the same as the input 654 // if it is not IDN or the IDN is unsafe to display. Returns whether any 655 // conversion was performed. 656 bool IDNToUnicodeOneComponent(const char16* comp, 657 size_t comp_len, 658 const std::wstring& languages, 659 string16* out) { 660 DCHECK(out); 661 if (comp_len == 0) 662 return false; 663 664 // Only transform if the input can be an IDN component. 665 static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; 666 if ((comp_len > arraysize(kIdnPrefix)) && 667 !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) { 668 // Repeatedly expand the output string until it's big enough. It looks like 669 // ICU will return the required size of the buffer, but that's not 670 // documented, so we'll just grow by 2x. This should be rare and is not on a 671 // critical path. 672 size_t original_length = out->length(); 673 for (int extra_space = 64; ; extra_space *= 2) { 674 UErrorCode status = U_ZERO_ERROR; 675 out->resize(out->length() + extra_space); 676 int output_chars = uidna_IDNToUnicode(comp, 677 static_cast<int32_t>(comp_len), &(*out)[original_length], extra_space, 678 UIDNA_DEFAULT, NULL, &status); 679 if (status == U_ZERO_ERROR) { 680 // Converted successfully. 681 out->resize(original_length + output_chars); 682 if (IsIDNComponentSafe(out->data() + original_length, output_chars, 683 languages)) 684 return true; 685 } 686 687 if (status != U_BUFFER_OVERFLOW_ERROR) 688 break; 689 } 690 // Failed, revert back to original string. 691 out->resize(original_length); 692 } 693 694 // We get here with no IDN or on error, in which case we just append the 695 // literal input. 696 out->append(comp, comp_len); 697 return false; 698 } 699 700 // Helper for FormatUrl(). 701 std::wstring FormatViewSourceUrl(const GURL& url, 702 const std::wstring& languages, 703 bool omit_username_password, 704 UnescapeRule::Type unescape_rules, 705 url_parse::Parsed* new_parsed, 706 size_t* prefix_end, 707 size_t* offset_for_adjustment) { 708 DCHECK(new_parsed); 709 const wchar_t* const kWideViewSource = L"view-source:"; 710 const size_t kViewSourceLengthPlus1 = 12; 711 712 GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1)); 713 size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ? 714 std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1); 715 size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ? 716 NULL : &temp_offset; 717 std::wstring result = net::FormatUrl(real_url, languages, 718 omit_username_password, unescape_rules, new_parsed, prefix_end, 719 temp_offset_ptr); 720 result.insert(0, kWideViewSource); 721 722 // Adjust position values. 723 if (new_parsed->scheme.is_nonempty()) { 724 // Assume "view-source:real-scheme" as a scheme. 725 new_parsed->scheme.len += kViewSourceLengthPlus1; 726 } else { 727 new_parsed->scheme.begin = 0; 728 new_parsed->scheme.len = kViewSourceLengthPlus1 - 1; 729 } 730 if (new_parsed->username.is_nonempty()) 731 new_parsed->username.begin += kViewSourceLengthPlus1; 732 if (new_parsed->password.is_nonempty()) 733 new_parsed->password.begin += kViewSourceLengthPlus1; 734 if (new_parsed->host.is_nonempty()) 735 new_parsed->host.begin += kViewSourceLengthPlus1; 736 if (new_parsed->port.is_nonempty()) 737 new_parsed->port.begin += kViewSourceLengthPlus1; 738 if (new_parsed->path.is_nonempty()) 739 new_parsed->path.begin += kViewSourceLengthPlus1; 740 if (new_parsed->query.is_nonempty()) 741 new_parsed->query.begin += kViewSourceLengthPlus1; 742 if (new_parsed->ref.is_nonempty()) 743 new_parsed->ref.begin += kViewSourceLengthPlus1; 744 if (prefix_end) 745 *prefix_end += kViewSourceLengthPlus1; 746 if (temp_offset_ptr) { 747 *offset_for_adjustment = (temp_offset == std::wstring::npos) ? 748 std::wstring::npos : (temp_offset + kViewSourceLengthPlus1); 749 } 750 return result; 751 } 752 753 } // namespace 754 755 namespace net { 756 757 std::set<int> explicitly_allowed_ports; 758 759 // Appends the substring |in_component| inside of the URL |spec| to |output|, 760 // and the resulting range will be filled into |out_component|. |unescape_rules| 761 // defines how to clean the URL for human readability. |offset_for_adjustment| 762 // is an offset into |output| which will be adjusted based on how it maps to the 763 // component being converted; if it is less than output->length(), it will be 764 // untouched, and if it is greater than output->length() + in_component.len it 765 // will be shortened by the difference in lengths between the input and output 766 // components. Otherwise it points into the component being converted, and is 767 // adjusted to point to the same logical place in |output|. 768 // |offset_for_adjustment| may not be NULL. 769 static void AppendFormattedComponent(const std::string& spec, 770 const url_parse::Component& in_component, 771 UnescapeRule::Type unescape_rules, 772 std::wstring* output, 773 url_parse::Component* out_component, 774 size_t* offset_for_adjustment); 775 776 GURL FilePathToFileURL(const FilePath& path) { 777 // Produce a URL like "file:///C:/foo" for a regular file, or 778 // "file://///server/path" for UNC. The URL canonicalizer will fix up the 779 // latter case to be the canonical UNC form: "file://server/path" 780 FilePath::StringType url_string(kFileURLPrefix); 781 url_string.append(path.value()); 782 783 // Now do replacement of some characters. Since we assume the input is a 784 // literal filename, anything the URL parser might consider special should 785 // be escaped here. 786 787 // must be the first substitution since others will introduce percents as the 788 // escape character 789 ReplaceSubstringsAfterOffset(&url_string, 0, 790 FILE_PATH_LITERAL("%"), FILE_PATH_LITERAL("%25")); 791 792 // semicolon is supposed to be some kind of separator according to RFC 2396 793 ReplaceSubstringsAfterOffset(&url_string, 0, 794 FILE_PATH_LITERAL(";"), FILE_PATH_LITERAL("%3B")); 795 796 ReplaceSubstringsAfterOffset(&url_string, 0, 797 FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23")); 798 799 #if defined(OS_POSIX) 800 ReplaceSubstringsAfterOffset(&url_string, 0, 801 FILE_PATH_LITERAL("\\"), FILE_PATH_LITERAL("%5C")); 802 #endif 803 804 return GURL(url_string); 805 } 806 807 std::wstring GetSpecificHeader(const std::wstring& headers, 808 const std::wstring& name) { 809 return GetSpecificHeaderT(headers, name); 810 } 811 812 std::string GetSpecificHeader(const std::string& headers, 813 const std::string& name) { 814 return GetSpecificHeaderT(headers, name); 815 } 816 817 std::string GetFileNameFromCD(const std::string& header, 818 const std::string& referrer_charset) { 819 std::string param_value = GetHeaderParamValue(header, "filename"); 820 if (param_value.empty()) { 821 // Some servers use 'name' parameter. 822 param_value = GetHeaderParamValue(header, "name"); 823 } 824 if (param_value.empty()) 825 return std::string(); 826 std::string decoded; 827 if (DecodeParamValue(param_value, referrer_charset, &decoded)) 828 return decoded; 829 return std::string(); 830 } 831 832 std::wstring GetHeaderParamValue(const std::wstring& field, 833 const std::wstring& param_name) { 834 return GetHeaderParamValueT(field, param_name); 835 } 836 837 std::string GetHeaderParamValue(const std::string& field, 838 const std::string& param_name) { 839 return GetHeaderParamValueT(field, param_name); 840 } 841 842 // TODO(brettw) bug 734373: check the scripts for each host component and 843 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for 844 // scripts that the user has installed. For now, just put the entire 845 // path through IDN. Maybe this feature can be implemented in ICU itself? 846 // 847 // We may want to skip this step in the case of file URLs to allow unicode 848 // UNC hostnames regardless of encodings. 849 std::wstring IDNToUnicode(const char* host, 850 size_t host_len, 851 const std::wstring& languages, 852 size_t* offset_for_adjustment) { 853 // Convert the ASCII input to a wide string for ICU. 854 string16 input16; 855 input16.reserve(host_len); 856 std::copy(host, host + host_len, std::back_inserter(input16)); 857 858 string16 out16; 859 size_t output_offset = offset_for_adjustment ? 860 *offset_for_adjustment : std::wstring::npos; 861 862 // Do each component of the host separately, since we enforce script matching 863 // on a per-component basis. 864 for (size_t component_start = 0, component_end; 865 component_start < input16.length(); 866 component_start = component_end + 1) { 867 // Find the end of the component. 868 component_end = input16.find('.', component_start); 869 if (component_end == string16::npos) 870 component_end = input16.length(); // For getting the last component. 871 size_t component_length = component_end - component_start; 872 873 size_t output_component_start = out16.length(); 874 bool converted_idn = false; 875 if (component_end > component_start) { 876 // Add the substring that we just found. 877 converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start, 878 component_length, languages, &out16); 879 } 880 size_t output_component_length = out16.length() - output_component_start; 881 882 if ((output_offset != std::wstring::npos) && 883 (*offset_for_adjustment > component_start)) { 884 if ((*offset_for_adjustment < component_end) && converted_idn) 885 output_offset = std::wstring::npos; 886 else 887 output_offset += output_component_length - component_length; 888 } 889 890 // Need to add the dot we just found (if we found one). 891 if (component_end < input16.length()) 892 out16.push_back('.'); 893 } 894 895 if (offset_for_adjustment) 896 *offset_for_adjustment = output_offset; 897 898 return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment); 899 } 900 901 std::string CanonicalizeHost(const std::string& host, 902 url_canon::CanonHostInfo* host_info) { 903 // Try to canonicalize the host. 904 const url_parse::Component raw_host_component( 905 0, static_cast<int>(host.length())); 906 std::string canon_host; 907 url_canon::StdStringCanonOutput canon_host_output(&canon_host); 908 url_canon::CanonicalizeHostVerbose(host.c_str(), raw_host_component, 909 &canon_host_output, host_info); 910 911 if (host_info->out_host.is_nonempty() && 912 host_info->family != url_canon::CanonHostInfo::BROKEN) { 913 // Success! Assert that there's no extra garbage. 914 canon_host_output.Complete(); 915 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length())); 916 } else { 917 // Empty host, or canonicalization failed. We'll return empty. 918 canon_host.clear(); 919 } 920 921 return canon_host; 922 } 923 924 std::string CanonicalizeHost(const std::wstring& host, 925 url_canon::CanonHostInfo* host_info) { 926 std::string converted_host; 927 WideToUTF8(host.c_str(), host.length(), &converted_host); 928 return CanonicalizeHost(converted_host, host_info); 929 } 930 931 std::string GetDirectoryListingHeader(const string16& title) { 932 static const base::StringPiece header( 933 NetModule::GetResource(IDR_DIR_HEADER_HTML)); 934 // This can be null in unit tests. 935 DLOG_IF(WARNING, header.empty()) << 936 "Missing resource: directory listing header"; 937 938 std::string result; 939 if (!header.empty()) 940 result.assign(header.data(), header.size()); 941 942 result.append("<script>start("); 943 base::JsonDoubleQuote(title, true, &result); 944 result.append(");</script>\n"); 945 946 return result; 947 } 948 949 inline bool IsHostCharAlpha(char c) { 950 // We can just check lowercase because uppercase characters have already been 951 // normalized. 952 return (c >= 'a') && (c <= 'z'); 953 } 954 955 inline bool IsHostCharDigit(char c) { 956 return (c >= '0') && (c <= '9'); 957 } 958 959 bool IsCanonicalizedHostCompliant(const std::string& host) { 960 if (host.empty()) 961 return false; 962 963 bool in_component = false; 964 bool most_recent_component_started_alpha = false; 965 bool last_char_was_hyphen_or_underscore = false; 966 967 for (std::string::const_iterator i(host.begin()); i != host.end(); ++i) { 968 const char c = *i; 969 if (!in_component) { 970 most_recent_component_started_alpha = IsHostCharAlpha(c); 971 if (!most_recent_component_started_alpha && !IsHostCharDigit(c)) 972 return false; 973 in_component = true; 974 } else { 975 if (c == '.') { 976 if (last_char_was_hyphen_or_underscore) 977 return false; 978 in_component = false; 979 } else if (IsHostCharAlpha(c) || IsHostCharDigit(c)) { 980 last_char_was_hyphen_or_underscore = false; 981 } else if ((c == '-') || (c == '_')) { 982 last_char_was_hyphen_or_underscore = true; 983 } else { 984 return false; 985 } 986 } 987 } 988 989 return most_recent_component_started_alpha; 990 } 991 992 std::string GetDirectoryListingEntry(const string16& name, 993 const std::string& raw_bytes, 994 bool is_dir, 995 int64 size, 996 Time modified) { 997 std::string result; 998 result.append("<script>addRow("); 999 base::JsonDoubleQuote(name, true, &result); 1000 result.append(","); 1001 if (raw_bytes.empty()) { 1002 base::JsonDoubleQuote(EscapePath(UTF16ToUTF8(name)), 1003 true, &result); 1004 } else { 1005 base::JsonDoubleQuote(EscapePath(raw_bytes), true, &result); 1006 } 1007 if (is_dir) { 1008 result.append(",1,"); 1009 } else { 1010 result.append(",0,"); 1011 } 1012 1013 base::JsonDoubleQuote( 1014 WideToUTF16Hack(FormatBytes(size, GetByteDisplayUnits(size), true)), true, 1015 &result); 1016 1017 result.append(","); 1018 1019 string16 modified_str; 1020 // |modified| can be NULL in FTP listings. 1021 if (!modified.is_null()) { 1022 modified_str = WideToUTF16Hack(base::TimeFormatShortDateAndTime(modified)); 1023 } 1024 base::JsonDoubleQuote(modified_str, true, &result); 1025 1026 result.append(");</script>\n"); 1027 1028 return result; 1029 } 1030 1031 std::wstring StripWWW(const std::wstring& text) { 1032 const std::wstring www(L"www."); 1033 return (text.compare(0, www.length(), www) == 0) ? 1034 text.substr(www.length()) : text; 1035 } 1036 1037 FilePath GetSuggestedFilename(const GURL& url, 1038 const std::string& content_disposition, 1039 const std::string& referrer_charset, 1040 const FilePath& default_name) { 1041 // We don't translate this fallback string, "download". If localization is 1042 // needed, the caller should provide localized fallback default_name. 1043 static const FilePath::CharType kFinalFallbackName[] = 1044 FILE_PATH_LITERAL("download"); 1045 1046 // about: and data: URLs don't have file names, but esp. data: URLs may 1047 // contain parts that look like ones (i.e., contain a slash). 1048 // Therefore we don't attempt to divine a file name out of them. 1049 if (url.SchemeIs("about") || url.SchemeIs("data")) { 1050 return default_name.empty() ? FilePath(kFinalFallbackName) : default_name; 1051 } 1052 1053 const std::string filename_from_cd = GetFileNameFromCD(content_disposition, 1054 referrer_charset); 1055 #if defined(OS_WIN) 1056 FilePath::StringType filename = UTF8ToWide(filename_from_cd); 1057 #elif defined(OS_POSIX) 1058 FilePath::StringType filename = filename_from_cd; 1059 #endif 1060 1061 if (!filename.empty()) { 1062 // Remove any path information the server may have sent, take the name 1063 // only. 1064 filename = FilePath(filename).BaseName().value(); 1065 1066 // Next, remove "." from the beginning and end of the file name to avoid 1067 // tricks with hidden files, "..", and "." 1068 TrimString(filename, FILE_PATH_LITERAL("."), &filename); 1069 } 1070 if (filename.empty()) { 1071 if (url.is_valid()) { 1072 const std::string unescaped_url_filename = UnescapeURLComponent( 1073 url.ExtractFileName(), 1074 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); 1075 #if defined(OS_WIN) 1076 filename = UTF8ToWide(unescaped_url_filename); 1077 #elif defined(OS_POSIX) 1078 filename = unescaped_url_filename; 1079 #endif 1080 } 1081 } 1082 1083 // Trim '.' once more. 1084 TrimString(filename, FILE_PATH_LITERAL("."), &filename); 1085 1086 // If there's no filename or it gets trimed to be empty, use 1087 // the URL hostname or default_name 1088 if (filename.empty()) { 1089 if (!default_name.empty()) { 1090 filename = default_name.value(); 1091 } else if (url.is_valid()) { 1092 // Some schemes (e.g. file) do not have a hostname. Even though it's 1093 // not likely to reach here, let's hardcode the last fallback name. 1094 // TODO(jungshik) : Decode a 'punycoded' IDN hostname. (bug 1264451) 1095 filename = url.host().empty() ? kFinalFallbackName : 1096 #if defined(OS_WIN) 1097 UTF8ToWide(url.host()); 1098 #elif defined(OS_POSIX) 1099 url.host(); 1100 #endif 1101 } else { 1102 NOTREACHED(); 1103 } 1104 } 1105 1106 file_util::ReplaceIllegalCharactersInPath(&filename, '-'); 1107 return FilePath(filename); 1108 } 1109 1110 bool IsPortAllowedByDefault(int port) { 1111 int array_size = arraysize(kRestrictedPorts); 1112 for (int i = 0; i < array_size; i++) { 1113 if (kRestrictedPorts[i] == port) { 1114 return false; 1115 } 1116 } 1117 return true; 1118 } 1119 1120 bool IsPortAllowedByFtp(int port) { 1121 int array_size = arraysize(kAllowedFtpPorts); 1122 for (int i = 0; i < array_size; i++) { 1123 if (kAllowedFtpPorts[i] == port) { 1124 return true; 1125 } 1126 } 1127 // Port not explicitly allowed by FTP, so return the default restrictions. 1128 return IsPortAllowedByDefault(port); 1129 } 1130 1131 bool IsPortAllowedByOverride(int port) { 1132 if (explicitly_allowed_ports.empty()) 1133 return false; 1134 1135 std::set<int>::const_iterator it = 1136 std::find(explicitly_allowed_ports.begin(), 1137 explicitly_allowed_ports.end(), 1138 port); 1139 1140 return it != explicitly_allowed_ports.end(); 1141 } 1142 1143 int SetNonBlocking(int fd) { 1144 #if defined(OS_WIN) 1145 unsigned long no_block = 1; 1146 return ioctlsocket(fd, FIONBIO, &no_block); 1147 #elif defined(OS_POSIX) 1148 int flags = fcntl(fd, F_GETFL, 0); 1149 if (-1 == flags) 1150 flags = 0; 1151 return fcntl(fd, F_SETFL, flags | O_NONBLOCK); 1152 #endif 1153 } 1154 1155 bool ParseHostAndPort(std::string::const_iterator host_and_port_begin, 1156 std::string::const_iterator host_and_port_end, 1157 std::string* host, 1158 int* port) { 1159 if (host_and_port_begin >= host_and_port_end) 1160 return false; 1161 1162 // When using url_parse, we use char*. 1163 const char* auth_begin = &(*host_and_port_begin); 1164 int auth_len = host_and_port_end - host_and_port_begin; 1165 1166 url_parse::Component auth_component(0, auth_len); 1167 url_parse::Component username_component; 1168 url_parse::Component password_component; 1169 url_parse::Component hostname_component; 1170 url_parse::Component port_component; 1171 1172 url_parse::ParseAuthority(auth_begin, auth_component, &username_component, 1173 &password_component, &hostname_component, &port_component); 1174 1175 // There shouldn't be a username/password. 1176 if (username_component.is_valid() || password_component.is_valid()) 1177 return false; 1178 1179 if (!hostname_component.is_nonempty()) 1180 return false; // Failed parsing. 1181 1182 int parsed_port_number = -1; 1183 if (port_component.is_nonempty()) { 1184 parsed_port_number = url_parse::ParsePort(auth_begin, port_component); 1185 1186 // If parsing failed, port_number will be either PORT_INVALID or 1187 // PORT_UNSPECIFIED, both of which are negative. 1188 if (parsed_port_number < 0) 1189 return false; // Failed parsing the port number. 1190 } 1191 1192 if (port_component.len == 0) 1193 return false; // Reject inputs like "foo:" 1194 1195 // Pass results back to caller. 1196 host->assign(auth_begin + hostname_component.begin, hostname_component.len); 1197 *port = parsed_port_number; 1198 1199 return true; // Success. 1200 } 1201 1202 bool ParseHostAndPort(const std::string& host_and_port, 1203 std::string* host, 1204 int* port) { 1205 return ParseHostAndPort( 1206 host_and_port.begin(), host_and_port.end(), host, port); 1207 } 1208 1209 std::string GetHostAndPort(const GURL& url) { 1210 // For IPv6 literals, GURL::host() already includes the brackets so it is 1211 // safe to just append a colon. 1212 return StringPrintf("%s:%d", url.host().c_str(), url.EffectiveIntPort()); 1213 } 1214 1215 std::string GetHostAndOptionalPort(const GURL& url) { 1216 // For IPv6 literals, GURL::host() already includes the brackets 1217 // so it is safe to just append a colon. 1218 if (url.has_port()) 1219 return StringPrintf("%s:%s", url.host().c_str(), url.port().c_str()); 1220 return url.host(); 1221 } 1222 1223 std::string NetAddressToString(const struct addrinfo* net_address) { 1224 #if defined(OS_WIN) 1225 EnsureWinsockInit(); 1226 #endif 1227 1228 // This buffer is large enough to fit the biggest IPv6 string. 1229 char buffer[INET6_ADDRSTRLEN]; 1230 1231 int result = getnameinfo(net_address->ai_addr, 1232 net_address->ai_addrlen, buffer, sizeof(buffer), NULL, 0, NI_NUMERICHOST); 1233 1234 if (result != 0) { 1235 DLOG(INFO) << "getnameinfo() failed with " << result; 1236 buffer[0] = '\0'; 1237 } 1238 return std::string(buffer); 1239 } 1240 1241 std::string GetHostName() { 1242 #if defined(OS_WIN) 1243 EnsureWinsockInit(); 1244 #endif 1245 1246 // Host names are limited to 255 bytes. 1247 char buffer[256]; 1248 int result = gethostname(buffer, sizeof(buffer)); 1249 if (result != 0) { 1250 DLOG(INFO) << "gethostname() failed with " << result; 1251 buffer[0] = '\0'; 1252 } 1253 return std::string(buffer); 1254 } 1255 1256 void GetIdentityFromURL(const GURL& url, 1257 std::wstring* username, 1258 std::wstring* password) { 1259 UnescapeRule::Type flags = UnescapeRule::SPACES; 1260 *username = UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(url.username(), 1261 flags, NULL)); 1262 *password = UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(url.password(), 1263 flags, NULL)); 1264 } 1265 1266 void AppendFormattedHost(const GURL& url, 1267 const std::wstring& languages, 1268 std::wstring* output, 1269 url_parse::Parsed* new_parsed, 1270 size_t* offset_for_adjustment) { 1271 DCHECK(output); 1272 const url_parse::Component& host = 1273 url.parsed_for_possibly_invalid_spec().host; 1274 1275 if (host.is_nonempty()) { 1276 // Handle possible IDN in the host name. 1277 int new_host_begin = static_cast<int>(output->length()); 1278 if (new_parsed) 1279 new_parsed->host.begin = new_host_begin; 1280 size_t offset_past_current_output = 1281 (!offset_for_adjustment || 1282 (*offset_for_adjustment == std::wstring::npos) || 1283 (*offset_for_adjustment < output->length())) ? 1284 std::wstring::npos : (*offset_for_adjustment - output->length()); 1285 size_t* offset_into_host = 1286 (offset_past_current_output >= static_cast<size_t>(host.len)) ? 1287 NULL : &offset_past_current_output; 1288 1289 const std::string& spec = url.possibly_invalid_spec(); 1290 DCHECK(host.begin >= 0 && 1291 ((spec.length() == 0 && host.begin == 0) || 1292 host.begin < static_cast<int>(spec.length()))); 1293 output->append(net::IDNToUnicode(&spec[host.begin], 1294 static_cast<size_t>(host.len), languages, offset_into_host)); 1295 1296 int new_host_len = static_cast<int>(output->length()) - new_host_begin; 1297 if (new_parsed) 1298 new_parsed->host.len = new_host_len; 1299 if (offset_into_host) { 1300 *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ? 1301 std::wstring::npos : (new_host_begin + *offset_into_host); 1302 } else if (offset_past_current_output != std::wstring::npos) { 1303 *offset_for_adjustment += new_host_len - host.len; 1304 } 1305 } else if (new_parsed) { 1306 new_parsed->host.reset(); 1307 } 1308 } 1309 1310 /* static */ 1311 void AppendFormattedComponent(const std::string& spec, 1312 const url_parse::Component& in_component, 1313 UnescapeRule::Type unescape_rules, 1314 std::wstring* output, 1315 url_parse::Component* out_component, 1316 size_t* offset_for_adjustment) { 1317 DCHECK(output); 1318 DCHECK(offset_for_adjustment); 1319 if (in_component.is_nonempty()) { 1320 out_component->begin = static_cast<int>(output->length()); 1321 size_t offset_past_current_output = 1322 ((*offset_for_adjustment == std::wstring::npos) || 1323 (*offset_for_adjustment < output->length())) ? 1324 std::wstring::npos : (*offset_for_adjustment - output->length()); 1325 size_t* offset_into_component = 1326 (offset_past_current_output >= static_cast<size_t>(in_component.len)) ? 1327 NULL : &offset_past_current_output; 1328 if (unescape_rules == UnescapeRule::NONE) { 1329 output->append(UTF8ToWideAndAdjustOffset( 1330 spec.substr(in_component.begin, in_component.len), 1331 offset_into_component)); 1332 } else { 1333 output->append(UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent( 1334 spec.substr(in_component.begin, in_component.len), unescape_rules, 1335 offset_into_component))); 1336 } 1337 out_component->len = 1338 static_cast<int>(output->length()) - out_component->begin; 1339 if (offset_into_component) { 1340 *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ? 1341 std::wstring::npos : (out_component->begin + *offset_into_component); 1342 } else if (offset_past_current_output != std::wstring::npos) { 1343 *offset_for_adjustment += out_component->len - in_component.len; 1344 } 1345 } else { 1346 out_component->reset(); 1347 } 1348 } 1349 1350 std::wstring FormatUrl(const GURL& url, 1351 const std::wstring& languages, 1352 bool omit_username_password, 1353 UnescapeRule::Type unescape_rules, 1354 url_parse::Parsed* new_parsed, 1355 size_t* prefix_end, 1356 size_t* offset_for_adjustment) { 1357 url_parse::Parsed parsed_temp; 1358 if (!new_parsed) 1359 new_parsed = &parsed_temp; 1360 size_t offset_temp = std::wstring::npos; 1361 if (!offset_for_adjustment) 1362 offset_for_adjustment = &offset_temp; 1363 1364 std::wstring url_string; 1365 1366 // Check for empty URLs or 0 available text width. 1367 if (url.is_empty()) { 1368 if (prefix_end) 1369 *prefix_end = 0; 1370 *offset_for_adjustment = std::wstring::npos; 1371 return url_string; 1372 } 1373 1374 // Special handling for view-source:. Don't use chrome::kViewSourceScheme 1375 // because this library shouldn't depend on chrome. 1376 const char* const kViewSource = "view-source"; 1377 const char* const kViewSourceTwice = "view-source:view-source:"; 1378 // Rejects view-source:view-source:... to avoid deep recursive call. 1379 if (url.SchemeIs(kViewSource) && 1380 !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) { 1381 return FormatViewSourceUrl(url, languages, omit_username_password, 1382 unescape_rules, new_parsed, prefix_end, offset_for_adjustment); 1383 } 1384 1385 // We handle both valid and invalid URLs (this will give us the spec 1386 // regardless of validity). 1387 const std::string& spec = url.possibly_invalid_spec(); 1388 const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec(); 1389 if (*offset_for_adjustment >= spec.length()) 1390 *offset_for_adjustment = std::wstring::npos; 1391 1392 // Copy everything before the username (the scheme and the separators.) 1393 // These are ASCII. 1394 std::copy(spec.begin(), 1395 spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME, 1396 true), 1397 std::back_inserter(url_string)); 1398 new_parsed->scheme = parsed.scheme; 1399 1400 if (omit_username_password) { 1401 // Remove the username and password fields. We don't want to display those 1402 // to the user since they can be used for attacks, 1403 // e.g. "http://google.com:search@evil.ru/" 1404 new_parsed->username.reset(); 1405 new_parsed->password.reset(); 1406 if ((*offset_for_adjustment != std::wstring::npos) && 1407 (parsed.username.is_nonempty() || parsed.password.is_nonempty())) { 1408 if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) { 1409 // The seeming off-by-one and off-by-two in these first two lines are to 1410 // account for the ':' after the username and '@' after the password. 1411 if (*offset_for_adjustment > 1412 static_cast<size_t>(parsed.password.end())) { 1413 *offset_for_adjustment -= 1414 (parsed.username.len + parsed.password.len + 2); 1415 } else if (*offset_for_adjustment > 1416 static_cast<size_t>(parsed.username.begin)) { 1417 *offset_for_adjustment = std::wstring::npos; 1418 } 1419 } else { 1420 const url_parse::Component* nonempty_component = 1421 parsed.username.is_nonempty() ? &parsed.username : &parsed.password; 1422 // The seeming off-by-one in these first two lines is to account for the 1423 // '@' after the username/password. 1424 if (*offset_for_adjustment > 1425 static_cast<size_t>(nonempty_component->end())) { 1426 *offset_for_adjustment -= (nonempty_component->len + 1); 1427 } else if (*offset_for_adjustment > 1428 static_cast<size_t>(nonempty_component->begin)) { 1429 *offset_for_adjustment = std::wstring::npos; 1430 } 1431 } 1432 } 1433 } else { 1434 AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string, 1435 &new_parsed->username, offset_for_adjustment); 1436 if (parsed.password.is_valid()) { 1437 url_string.push_back(':'); 1438 } 1439 AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string, 1440 &new_parsed->password, offset_for_adjustment); 1441 if (parsed.username.is_valid() || parsed.password.is_valid()) { 1442 url_string.push_back('@'); 1443 } 1444 } 1445 if (prefix_end) 1446 *prefix_end = static_cast<size_t>(url_string.length()); 1447 1448 AppendFormattedHost(url, languages, &url_string, new_parsed, 1449 offset_for_adjustment); 1450 1451 // Port. 1452 if (parsed.port.is_nonempty()) { 1453 url_string.push_back(':'); 1454 new_parsed->port.begin = url_string.length(); 1455 std::copy(spec.begin() + parsed.port.begin, 1456 spec.begin() + parsed.port.end(), std::back_inserter(url_string)); 1457 new_parsed->port.len = url_string.length() - new_parsed->port.begin; 1458 } else { 1459 new_parsed->port.reset(); 1460 } 1461 1462 // Path and query both get the same general unescape & convert treatment. 1463 AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string, 1464 &new_parsed->path, offset_for_adjustment); 1465 if (parsed.query.is_valid()) 1466 url_string.push_back('?'); 1467 AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string, 1468 &new_parsed->query, offset_for_adjustment); 1469 1470 // Reference is stored in valid, unescaped UTF-8, so we can just convert. 1471 if (parsed.ref.is_valid()) { 1472 url_string.push_back('#'); 1473 new_parsed->ref.begin = url_string.length(); 1474 size_t offset_past_current_output = 1475 ((*offset_for_adjustment == std::wstring::npos) || 1476 (*offset_for_adjustment < url_string.length())) ? 1477 std::wstring::npos : (*offset_for_adjustment - url_string.length()); 1478 size_t* offset_into_ref = 1479 (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ? 1480 NULL : &offset_past_current_output; 1481 if (parsed.ref.len > 0) { 1482 url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin, 1483 parsed.ref.len), 1484 offset_into_ref)); 1485 } 1486 new_parsed->ref.len = url_string.length() - new_parsed->ref.begin; 1487 if (offset_into_ref) { 1488 *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ? 1489 std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref); 1490 } else if (offset_past_current_output != std::wstring::npos) { 1491 // We clamped the offset near the beginning of this function to ensure it 1492 // was within the input URL. If we reach here, the input was something 1493 // invalid and non-parseable such that the offset was past any component 1494 // we could figure out. In this case it won't be represented in the 1495 // output string, so reset it. 1496 *offset_for_adjustment = std::wstring::npos; 1497 } 1498 } 1499 1500 return url_string; 1501 } 1502 1503 GURL SimplifyUrlForRequest(const GURL& url) { 1504 DCHECK(url.is_valid()); 1505 GURL::Replacements replacements; 1506 replacements.ClearUsername(); 1507 replacements.ClearPassword(); 1508 replacements.ClearRef(); 1509 return url.ReplaceComponents(replacements); 1510 } 1511 1512 // Specifies a comma separated list of port numbers that should be accepted 1513 // despite bans. If the string is invalid no allowed ports are stored. 1514 void SetExplicitlyAllowedPorts(const std::wstring& allowed_ports) { 1515 if (allowed_ports.empty()) 1516 return; 1517 1518 std::set<int> ports; 1519 size_t last = 0; 1520 size_t size = allowed_ports.size(); 1521 // The comma delimiter. 1522 const std::wstring::value_type kComma = L','; 1523 1524 // Overflow is still possible for evil user inputs. 1525 for (size_t i = 0; i <= size; ++i) { 1526 // The string should be composed of only digits and commas. 1527 if (i != size && !IsAsciiDigit(allowed_ports[i]) && 1528 (allowed_ports[i] != kComma)) 1529 return; 1530 if (i == size || allowed_ports[i] == kComma) { 1531 size_t length = i - last; 1532 if (length > 0) 1533 ports.insert(StringToInt(WideToASCII( 1534 allowed_ports.substr(last, length)))); 1535 last = i + 1; 1536 } 1537 } 1538 explicitly_allowed_ports = ports; 1539 } 1540 1541 } // namespace net 1542