1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/safe_browsing_util.h" 6 7 #include "base/logging.h" 8 #include "base/strings/string_util.h" 9 #include "base/strings/stringprintf.h" 10 #include "chrome/browser/google/google_util.h" 11 #include "crypto/sha2.h" 12 #include "net/base/escape.h" 13 #include "url/gurl.h" 14 #include "url/url_util.h" 15 16 #if defined(OS_WIN) 17 #include "chrome/installer/util/browser_distribution.h" 18 #endif 19 20 static const char kReportParams[] = "?tpl=%s&url=%s"; 21 22 // SBChunk --------------------------------------------------------------------- 23 24 SBChunk::SBChunk() 25 : chunk_number(0), 26 list_id(0), 27 is_add(false) { 28 } 29 30 SBChunk::~SBChunk() {} 31 32 // SBChunkList ----------------------------------------------------------------- 33 34 SBChunkList::SBChunkList() {} 35 36 SBChunkList::~SBChunkList() { 37 clear(); 38 } 39 40 void SBChunkList::clear() { 41 for (std::vector<SBChunk>::iterator citer = chunks_.begin(); 42 citer != chunks_.end(); ++citer) { 43 for (std::deque<SBChunkHost>::iterator hiter = citer->hosts.begin(); 44 hiter != citer->hosts.end(); ++hiter) { 45 if (hiter->entry) { 46 hiter->entry->Destroy(); 47 hiter->entry = NULL; 48 } 49 } 50 } 51 chunks_.clear(); 52 } 53 54 // SBListChunkRanges ----------------------------------------------------------- 55 56 SBListChunkRanges::SBListChunkRanges(const std::string& n) : name(n) {} 57 58 // SBChunkDelete --------------------------------------------------------------- 59 60 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {} 61 62 SBChunkDelete::~SBChunkDelete() {} 63 64 // SBEntry --------------------------------------------------------------------- 65 66 // static 67 SBEntry* SBEntry::Create(Type type, int prefix_count) { 68 int size = Size(type, prefix_count); 69 SBEntry *rv = static_cast<SBEntry*>(malloc(size)); 70 memset(rv, 0, size); 71 rv->set_type(type); 72 rv->set_prefix_count(prefix_count); 73 return rv; 74 } 75 76 void SBEntry::Destroy() { 77 free(this); 78 } 79 80 // static 81 int SBEntry::PrefixSize(Type type) { 82 switch (type) { 83 case ADD_PREFIX: 84 return sizeof(SBPrefix); 85 case ADD_FULL_HASH: 86 return sizeof(SBFullHash); 87 case SUB_PREFIX: 88 return sizeof(SBSubPrefix); 89 case SUB_FULL_HASH: 90 return sizeof(SBSubFullHash); 91 default: 92 NOTREACHED(); 93 return 0; 94 } 95 } 96 97 int SBEntry::Size() const { 98 return Size(type(), prefix_count()); 99 } 100 101 // static 102 int SBEntry::Size(Type type, int prefix_count) { 103 return sizeof(Data) + prefix_count * PrefixSize(type); 104 } 105 106 int SBEntry::ChunkIdAtPrefix(int index) const { 107 if (type() == SUB_PREFIX) 108 return sub_prefixes_[index].add_chunk; 109 return (type() == SUB_FULL_HASH) ? 110 sub_full_hashes_[index].add_chunk : chunk_id(); 111 } 112 113 void SBEntry::SetChunkIdAtPrefix(int index, int chunk_id) { 114 DCHECK(IsSub()); 115 116 if (type() == SUB_PREFIX) 117 sub_prefixes_[index].add_chunk = chunk_id; 118 else 119 sub_full_hashes_[index].add_chunk = chunk_id; 120 } 121 122 const SBPrefix& SBEntry::PrefixAt(int index) const { 123 DCHECK(IsPrefix()); 124 125 return IsAdd() ? add_prefixes_[index] : sub_prefixes_[index].prefix; 126 } 127 128 const SBFullHash& SBEntry::FullHashAt(int index) const { 129 DCHECK(!IsPrefix()); 130 131 return IsAdd() ? add_full_hashes_[index] : sub_full_hashes_[index].prefix; 132 } 133 134 void SBEntry::SetPrefixAt(int index, const SBPrefix& prefix) { 135 DCHECK(IsPrefix()); 136 137 if (IsAdd()) 138 add_prefixes_[index] = prefix; 139 else 140 sub_prefixes_[index].prefix = prefix; 141 } 142 143 void SBEntry::SetFullHashAt(int index, const SBFullHash& full_hash) { 144 DCHECK(!IsPrefix()); 145 146 if (IsAdd()) 147 add_full_hashes_[index] = full_hash; 148 else 149 sub_full_hashes_[index].prefix = full_hash; 150 } 151 152 153 // Utility functions ----------------------------------------------------------- 154 155 namespace safe_browsing_util { 156 157 // Listnames that browser can process. 158 const char kMalwareList[] = "goog-malware-shavar"; 159 const char kPhishingList[] = "goog-phish-shavar"; 160 const char kBinUrlList[] = "goog-badbinurl-shavar"; 161 // We don't use the bad binary digest list anymore. Use a fake listname to be 162 // sure we don't request it accidentally. 163 const char kBinHashList[] = "goog-badbin-digestvar-disabled"; 164 const char kCsdWhiteList[] = "goog-csdwhite-sha256"; 165 const char kDownloadWhiteList[] = "goog-downloadwhite-digest256"; 166 const char kExtensionBlacklist[] = "goog-badcrxids-digestvar"; 167 const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar"; 168 169 ListType GetListId(const std::string& name) { 170 ListType id; 171 if (name == safe_browsing_util::kMalwareList) { 172 id = MALWARE; 173 } else if (name == safe_browsing_util::kPhishingList) { 174 id = PHISH; 175 } else if (name == safe_browsing_util::kBinUrlList) { 176 id = BINURL; 177 } else if (name == safe_browsing_util::kBinHashList) { 178 id = BINHASH; 179 } else if (name == safe_browsing_util::kCsdWhiteList) { 180 id = CSDWHITELIST; 181 } else if (name == safe_browsing_util::kDownloadWhiteList) { 182 id = DOWNLOADWHITELIST; 183 } else if (name == safe_browsing_util::kExtensionBlacklist) { 184 id = EXTENSIONBLACKLIST; 185 } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) { 186 id = SIDEEFFECTFREEWHITELIST; 187 } else { 188 id = INVALID; 189 } 190 return id; 191 } 192 193 bool GetListName(ListType list_id, std::string* list) { 194 switch (list_id) { 195 case MALWARE: 196 *list = safe_browsing_util::kMalwareList; 197 break; 198 case PHISH: 199 *list = safe_browsing_util::kPhishingList; 200 break; 201 case BINURL: 202 *list = safe_browsing_util::kBinUrlList; 203 break; 204 case BINHASH: 205 *list = safe_browsing_util::kBinHashList; 206 break; 207 case CSDWHITELIST: 208 *list = safe_browsing_util::kCsdWhiteList; 209 break; 210 case DOWNLOADWHITELIST: 211 *list = safe_browsing_util::kDownloadWhiteList; 212 break; 213 case EXTENSIONBLACKLIST: 214 *list = safe_browsing_util::kExtensionBlacklist; 215 break; 216 case SIDEEFFECTFREEWHITELIST: 217 *list = safe_browsing_util::kSideEffectFreeWhitelist; 218 break; 219 default: 220 return false; 221 } 222 return true; 223 } 224 225 std::string Unescape(const std::string& url) { 226 std::string unescaped_str(url); 227 std::string old_unescaped_str; 228 const int kMaxLoopIterations = 1024; 229 int loop_var = 0; 230 do { 231 old_unescaped_str = unescaped_str; 232 unescaped_str = net::UnescapeURLComponent(old_unescaped_str, 233 net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES | 234 net::UnescapeRule::URL_SPECIAL_CHARS); 235 } while (unescaped_str != old_unescaped_str && ++loop_var <= 236 kMaxLoopIterations); 237 238 return unescaped_str; 239 } 240 241 std::string Escape(const std::string& url) { 242 std::string escaped_str; 243 const char* kHexString = "0123456789ABCDEF"; 244 for (size_t i = 0; i < url.length(); i++) { 245 unsigned char c = static_cast<unsigned char>(url[i]); 246 if (c <= ' ' || c > '~' || c == '#' || c == '%') { 247 escaped_str.push_back('%'); 248 escaped_str.push_back(kHexString[c >> 4]); 249 escaped_str.push_back(kHexString[c & 0xf]); 250 } else { 251 escaped_str.push_back(c); 252 } 253 } 254 255 return escaped_str; 256 } 257 258 std::string RemoveConsecutiveChars(const std::string& str, const char c) { 259 std::string output(str); 260 std::string string_to_find; 261 std::string::size_type loc = 0; 262 string_to_find.append(2, c); 263 while ((loc = output.find(string_to_find, loc)) != std::string::npos) { 264 output.erase(loc, 1); 265 } 266 267 return output; 268 } 269 270 // Canonicalizes url as per Google Safe Browsing Specification. 271 // See section 6.1 in 272 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. 273 void CanonicalizeUrl(const GURL& url, 274 std::string* canonicalized_hostname, 275 std::string* canonicalized_path, 276 std::string* canonicalized_query) { 277 DCHECK(url.is_valid()); 278 279 // We only canonicalize "normal" URLs. 280 if (!url.IsStandard()) 281 return; 282 283 // Following canonicalization steps are excluded since url parsing takes care 284 // of those :- 285 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url. 286 // (Exclude escaped version of these chars). 287 // 2. Normalize hostname to 4 dot-seperated decimal values. 288 // 3. Lowercase hostname. 289 // 4. Resolve path sequences "/../" and "/./". 290 291 // That leaves us with the following :- 292 // 1. Remove fragment in URL. 293 GURL url_without_fragment; 294 GURL::Replacements f_replacements; 295 f_replacements.ClearRef(); 296 f_replacements.ClearUsername(); 297 f_replacements.ClearPassword(); 298 url_without_fragment = url.ReplaceComponents(f_replacements); 299 300 // 2. Do URL unescaping until no more hex encoded characters exist. 301 std::string url_unescaped_str(Unescape(url_without_fragment.spec())); 302 url_parse::Parsed parsed; 303 url_parse::ParseStandardURL(url_unescaped_str.data(), 304 url_unescaped_str.length(), &parsed); 305 306 // 3. In hostname, remove all leading and trailing dots. 307 const std::string host = 308 (parsed.host.len > 0) 309 ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len) 310 : std::string(); 311 const char kCharsToTrim[] = "."; 312 std::string host_without_end_dots; 313 TrimString(host, kCharsToTrim, &host_without_end_dots); 314 315 // 4. In hostname, replace consecutive dots with a single dot. 316 std::string host_without_consecutive_dots(RemoveConsecutiveChars( 317 host_without_end_dots, '.')); 318 319 // 5. In path, replace runs of consecutive slashes with a single slash. 320 std::string path = 321 (parsed.path.len > 0) 322 ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len) 323 : std::string(); 324 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/')); 325 326 url_canon::Replacements<char> hp_replacements; 327 hp_replacements.SetHost(host_without_consecutive_dots.data(), 328 url_parse::Component(0, host_without_consecutive_dots.length())); 329 hp_replacements.SetPath(path_without_consecutive_slash.data(), 330 url_parse::Component(0, path_without_consecutive_slash.length())); 331 332 std::string url_unescaped_with_can_hostpath; 333 url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath); 334 url_parse::Parsed temp_parsed; 335 url_util::ReplaceComponents(url_unescaped_str.data(), 336 url_unescaped_str.length(), parsed, 337 hp_replacements, NULL, &output, &temp_parsed); 338 output.Complete(); 339 340 // 6. Step needed to revert escaping done in url_util::ReplaceComponents. 341 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath); 342 343 // 7. After performing all above steps, percent-escape all chars in url which 344 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters. 345 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath)); 346 url_parse::Parsed final_parsed; 347 url_parse::ParseStandardURL(escaped_canon_url_str.data(), 348 escaped_canon_url_str.length(), &final_parsed); 349 350 if (canonicalized_hostname && final_parsed.host.len > 0) { 351 *canonicalized_hostname = 352 escaped_canon_url_str.substr(final_parsed.host.begin, 353 final_parsed.host.len); 354 } 355 if (canonicalized_path && final_parsed.path.len > 0) { 356 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin, 357 final_parsed.path.len); 358 } 359 if (canonicalized_query && final_parsed.query.len > 0) { 360 *canonicalized_query = escaped_canon_url_str.substr( 361 final_parsed.query.begin, final_parsed.query.len); 362 } 363 } 364 365 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { 366 hosts->clear(); 367 368 std::string canon_host; 369 CanonicalizeUrl(url, &canon_host, NULL, NULL); 370 371 const std::string host = canon_host; // const sidesteps GCC bugs below! 372 if (host.empty()) 373 return; 374 375 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 376 // hostnames formed by starting with the last 5 components and successively 377 // removing the leading component. The last component isn't examined alone, 378 // since it's the TLD or a subcomponent thereof. 379 // 380 // Note that we don't need to be clever about stopping at the "real" eTLD -- 381 // the data on the server side has been filtered to ensure it will not 382 // blacklist a whole TLD, and it's not significantly slower on our side to 383 // just check too much. 384 // 385 // Also note that because we have a simple blacklist, not some sort of complex 386 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check 387 // these in. 388 const size_t kMaxHostsToCheck = 4; 389 bool skipped_last_component = false; 390 for (std::string::const_reverse_iterator i(host.rbegin()); 391 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { 392 if (*i == '.') { 393 if (skipped_last_component) 394 hosts->push_back(std::string(i.base(), host.end())); 395 else 396 skipped_last_component = true; 397 } 398 } 399 hosts->push_back(host); 400 } 401 402 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { 403 paths->clear(); 404 405 std::string canon_path; 406 std::string canon_query; 407 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); 408 409 const std::string path = canon_path; // const sidesteps GCC bugs below! 410 const std::string query = canon_query; 411 if (path.empty()) 412 return; 413 414 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without 415 // the query parameters, and also up to 4 paths formed by starting at the root 416 // and adding more path components. 417 // 418 // As with the hosts above, it doesn't matter what order we check these in. 419 const size_t kMaxPathsToCheck = 4; 420 for (std::string::const_iterator i(path.begin()); 421 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { 422 if (*i == '/') 423 paths->push_back(std::string(path.begin(), i + 1)); 424 } 425 426 if (!paths->empty() && paths->back() != path) 427 paths->push_back(path); 428 429 if (!query.empty()) 430 paths->push_back(path + "?" + query); 431 } 432 433 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { 434 std::vector<std::string> hosts, paths; 435 GenerateHostsToCheck(url, &hosts); 436 GeneratePathsToCheck(url, &paths); 437 for (size_t h = 0; h < hosts.size(); ++h) { 438 for (size_t p = 0; p < paths.size(); ++p) { 439 urls->push_back(hosts[h] + paths[p]); 440 } 441 } 442 } 443 444 int GetHashIndex(const SBFullHash& hash, 445 const std::vector<SBFullHashResult>& full_hashes) { 446 for (size_t i = 0; i < full_hashes.size(); ++i) { 447 if (hash == full_hashes[i].hash) 448 return static_cast<int>(i); 449 } 450 return -1; 451 } 452 453 int GetUrlHashIndex(const GURL& url, 454 const std::vector<SBFullHashResult>& full_hashes) { 455 if (full_hashes.empty()) 456 return -1; 457 458 std::vector<std::string> patterns; 459 GeneratePatternsToCheck(url, &patterns); 460 461 for (size_t i = 0; i < patterns.size(); ++i) { 462 SBFullHash key; 463 crypto::SHA256HashString(patterns[i], key.full_hash, sizeof(SBFullHash)); 464 int index = GetHashIndex(key, full_hashes); 465 if (index != -1) 466 return index; 467 } 468 return -1; 469 } 470 471 bool IsPhishingList(const std::string& list_name) { 472 return list_name.compare(kPhishingList) == 0; 473 } 474 475 bool IsMalwareList(const std::string& list_name) { 476 return list_name.compare(kMalwareList) == 0; 477 } 478 479 bool IsBadbinurlList(const std::string& list_name) { 480 return list_name.compare(kBinUrlList) == 0; 481 } 482 483 bool IsBadbinhashList(const std::string& list_name) { 484 return list_name.compare(kBinHashList) == 0; 485 } 486 487 bool IsExtensionList(const std::string& list_name) { 488 return list_name.compare(kExtensionBlacklist) == 0; 489 } 490 491 GURL GeneratePhishingReportUrl(const std::string& report_page, 492 const std::string& url_to_report, 493 bool is_client_side_detection) { 494 const std::string current_esc = net::EscapeQueryParamValue(url_to_report, 495 true); 496 497 #if defined(OS_WIN) 498 BrowserDistribution* dist = BrowserDistribution::GetDistribution(); 499 std::string client_name(dist->GetSafeBrowsingName()); 500 #else 501 std::string client_name("googlechrome"); 502 #endif 503 if (is_client_side_detection) 504 client_name.append("_csd"); 505 506 GURL report_url(report_page + base::StringPrintf(kReportParams, 507 client_name.c_str(), 508 current_esc.c_str())); 509 return google_util::AppendGoogleLocaleParam(report_url); 510 } 511 512 SBFullHash StringToSBFullHash(const std::string& hash_in) { 513 DCHECK_EQ(crypto::kSHA256Length, hash_in.size()); 514 SBFullHash hash_out; 515 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length); 516 return hash_out; 517 } 518 519 std::string SBFullHashToString(const SBFullHash& hash) { 520 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash)); 521 return std::string(hash.full_hash, sizeof(hash.full_hash)); 522 } 523 524 } // namespace safe_browsing_util 525