1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/safe_browsing_util.h" 6 7 #include "base/logging.h" 8 #include "base/strings/string_util.h" 9 #include "base/strings/stringprintf.h" 10 #include "chrome/browser/browser_process.h" 11 #include "chrome/browser/safe_browsing/chunk.pb.h" 12 #include "components/google/core/browser/google_util.h" 13 #include "crypto/sha2.h" 14 #include "net/base/escape.h" 15 #include "url/gurl.h" 16 #include "url/url_util.h" 17 18 #if defined(OS_WIN) 19 #include "chrome/installer/util/browser_distribution.h" 20 #endif 21 22 static const char kReportParams[] = "?tpl=%s&url=%s"; 23 24 SBFullHash SBFullHashForString(const base::StringPiece& str) { 25 SBFullHash h; 26 crypto::SHA256HashString(str, &h.full_hash, sizeof(h.full_hash)); 27 return h; 28 } 29 30 // SBCachedFullHashResult ------------------------------------------------------ 31 32 SBCachedFullHashResult::SBCachedFullHashResult() {} 33 34 SBCachedFullHashResult::SBCachedFullHashResult( 35 const base::Time& in_expire_after) 36 : expire_after(in_expire_after) {} 37 38 SBCachedFullHashResult::~SBCachedFullHashResult() {} 39 40 // SBChunkData ----------------------------------------------------------------- 41 42 // TODO(shess): Right now this contains a scoped_ptr<ChunkData> so that the 43 // proto buffer isn't copied all over the place, then these are contained in a 44 // ScopedVector for purposes of passing things around between tasks. This seems 45 // convoluted. Maybe it would make sense to have an overall container class 46 // returning references to a nested per-chunk class? 47 48 SBChunkData::SBChunkData() { 49 } 50 51 SBChunkData::SBChunkData(safe_browsing::ChunkData* raw_data) 52 : chunk_data_(raw_data) { 53 DCHECK(chunk_data_.get()); 54 } 55 56 SBChunkData::~SBChunkData() { 57 } 58 59 bool SBChunkData::ParseFrom(const unsigned char* data, size_t length) { 60 scoped_ptr<safe_browsing::ChunkData> chunk(new safe_browsing::ChunkData()); 61 if (!chunk->ParseFromArray(data, length)) 62 return false; 63 64 if (chunk->chunk_type() != safe_browsing::ChunkData::ADD && 65 chunk->chunk_type() != safe_browsing::ChunkData::SUB) { 66 return false; 67 } 68 69 size_t hash_size = 0; 70 if (chunk->prefix_type() == safe_browsing::ChunkData::PREFIX_4B) { 71 hash_size = sizeof(SBPrefix); 72 } else if (chunk->prefix_type() == safe_browsing::ChunkData::FULL_32B) { 73 hash_size = sizeof(SBFullHash); 74 } else { 75 return false; 76 } 77 78 const size_t hash_count = chunk->hashes().size() / hash_size; 79 if (hash_count * hash_size != chunk->hashes().size()) 80 return false; 81 82 if (chunk->chunk_type() == safe_browsing::ChunkData::SUB && 83 static_cast<size_t>(chunk->add_numbers_size()) != hash_count) { 84 return false; 85 } 86 87 chunk_data_.swap(chunk); 88 return true; 89 } 90 91 int SBChunkData::ChunkNumber() const { 92 return chunk_data_->chunk_number(); 93 } 94 95 bool SBChunkData::IsAdd() const { 96 return chunk_data_->chunk_type() == safe_browsing::ChunkData::ADD; 97 } 98 99 bool SBChunkData::IsSub() const { 100 return chunk_data_->chunk_type() == safe_browsing::ChunkData::SUB; 101 } 102 103 int SBChunkData::AddChunkNumberAt(size_t i) const { 104 DCHECK(IsSub()); 105 DCHECK((IsPrefix() && i < PrefixCount()) || 106 (IsFullHash() && i < FullHashCount())); 107 return chunk_data_->add_numbers(i); 108 } 109 110 bool SBChunkData::IsPrefix() const { 111 return chunk_data_->prefix_type() == safe_browsing::ChunkData::PREFIX_4B; 112 } 113 114 size_t SBChunkData::PrefixCount() const { 115 DCHECK(IsPrefix()); 116 return chunk_data_->hashes().size() / sizeof(SBPrefix); 117 } 118 119 SBPrefix SBChunkData::PrefixAt(size_t i) const { 120 DCHECK(IsPrefix()); 121 DCHECK_LT(i, PrefixCount()); 122 123 SBPrefix prefix; 124 memcpy(&prefix, chunk_data_->hashes().data() + i * sizeof(SBPrefix), 125 sizeof(SBPrefix)); 126 return prefix; 127 } 128 129 bool SBChunkData::IsFullHash() const { 130 return chunk_data_->prefix_type() == safe_browsing::ChunkData::FULL_32B; 131 } 132 133 size_t SBChunkData::FullHashCount() const { 134 DCHECK(IsFullHash()); 135 return chunk_data_->hashes().size() / sizeof(SBFullHash); 136 } 137 138 SBFullHash SBChunkData::FullHashAt(size_t i) const { 139 DCHECK(IsFullHash()); 140 DCHECK_LT(i, FullHashCount()); 141 142 SBFullHash full_hash; 143 memcpy(&full_hash, chunk_data_->hashes().data() + i * sizeof(SBFullHash), 144 sizeof(SBFullHash)); 145 return full_hash; 146 } 147 148 // SBListChunkRanges ----------------------------------------------------------- 149 150 SBListChunkRanges::SBListChunkRanges(const std::string& n) 151 : name(n) { 152 } 153 154 // SBChunkDelete --------------------------------------------------------------- 155 156 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {} 157 158 SBChunkDelete::~SBChunkDelete() {} 159 160 // Utility functions ----------------------------------------------------------- 161 162 namespace { 163 bool IsKnownList(const std::string& name) { 164 for (size_t i = 0; i < arraysize(safe_browsing_util::kAllLists); ++i) { 165 if (!strcmp(safe_browsing_util::kAllLists[i], name.c_str())) { 166 return true; 167 } 168 } 169 return false; 170 } 171 } // namespace 172 173 namespace safe_browsing_util { 174 175 // Listnames that browser can process. 176 // TODO(shess): This shouldn't be OS-driven <http://crbug.com/394379> 177 #if defined(OS_ANDROID) 178 // NOTE(shess): This difference is also reflected in the store name in 179 // safe_browsing_database.cc. 180 const char kMalwareList[] = "goog-mobilemalware-shavar"; 181 const char kPhishingList[] = "goog-mobilephish-shavar"; 182 #else 183 const char kMalwareList[] = "goog-malware-shavar"; 184 const char kPhishingList[] = "goog-phish-shavar"; 185 #endif 186 const char kBinUrlList[] = "goog-badbinurl-shavar"; 187 const char kCsdWhiteList[] = "goog-csdwhite-sha256"; 188 const char kDownloadWhiteList[] = "goog-downloadwhite-digest256"; 189 const char kExtensionBlacklist[] = "goog-badcrxids-digestvar"; 190 const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar"; 191 const char kIPBlacklist[] = "goog-badip-digest256"; 192 193 const char* kAllLists[8] = { 194 kMalwareList, 195 kPhishingList, 196 kBinUrlList, 197 kCsdWhiteList, 198 kDownloadWhiteList, 199 kExtensionBlacklist, 200 kSideEffectFreeWhitelist, 201 kIPBlacklist, 202 }; 203 204 ListType GetListId(const base::StringPiece& name) { 205 ListType id; 206 if (name == safe_browsing_util::kMalwareList) { 207 id = MALWARE; 208 } else if (name == safe_browsing_util::kPhishingList) { 209 id = PHISH; 210 } else if (name == safe_browsing_util::kBinUrlList) { 211 id = BINURL; 212 } else if (name == safe_browsing_util::kCsdWhiteList) { 213 id = CSDWHITELIST; 214 } else if (name == safe_browsing_util::kDownloadWhiteList) { 215 id = DOWNLOADWHITELIST; 216 } else if (name == safe_browsing_util::kExtensionBlacklist) { 217 id = EXTENSIONBLACKLIST; 218 } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) { 219 id = SIDEEFFECTFREEWHITELIST; 220 } else if (name == safe_browsing_util::kIPBlacklist) { 221 id = IPBLACKLIST; 222 } else { 223 id = INVALID; 224 } 225 return id; 226 } 227 228 bool GetListName(ListType list_id, std::string* list) { 229 switch (list_id) { 230 case MALWARE: 231 *list = safe_browsing_util::kMalwareList; 232 break; 233 case PHISH: 234 *list = safe_browsing_util::kPhishingList; 235 break; 236 case BINURL: 237 *list = safe_browsing_util::kBinUrlList; 238 break; 239 case CSDWHITELIST: 240 *list = safe_browsing_util::kCsdWhiteList; 241 break; 242 case DOWNLOADWHITELIST: 243 *list = safe_browsing_util::kDownloadWhiteList; 244 break; 245 case EXTENSIONBLACKLIST: 246 *list = safe_browsing_util::kExtensionBlacklist; 247 break; 248 case SIDEEFFECTFREEWHITELIST: 249 *list = safe_browsing_util::kSideEffectFreeWhitelist; 250 break; 251 case IPBLACKLIST: 252 *list = safe_browsing_util::kIPBlacklist; 253 break; 254 default: 255 return false; 256 } 257 DCHECK(IsKnownList(*list)); 258 return true; 259 } 260 261 std::string Unescape(const std::string& url) { 262 std::string unescaped_str(url); 263 std::string old_unescaped_str; 264 const int kMaxLoopIterations = 1024; 265 int loop_var = 0; 266 do { 267 old_unescaped_str = unescaped_str; 268 unescaped_str = net::UnescapeURLComponent(old_unescaped_str, 269 net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES | 270 net::UnescapeRule::URL_SPECIAL_CHARS); 271 } while (unescaped_str != old_unescaped_str && ++loop_var <= 272 kMaxLoopIterations); 273 274 return unescaped_str; 275 } 276 277 std::string Escape(const std::string& url) { 278 std::string escaped_str; 279 const char* kHexString = "0123456789ABCDEF"; 280 for (size_t i = 0; i < url.length(); i++) { 281 unsigned char c = static_cast<unsigned char>(url[i]); 282 if (c <= ' ' || c > '~' || c == '#' || c == '%') { 283 escaped_str.push_back('%'); 284 escaped_str.push_back(kHexString[c >> 4]); 285 escaped_str.push_back(kHexString[c & 0xf]); 286 } else { 287 escaped_str.push_back(c); 288 } 289 } 290 291 return escaped_str; 292 } 293 294 std::string RemoveConsecutiveChars(const std::string& str, const char c) { 295 std::string output(str); 296 std::string string_to_find; 297 std::string::size_type loc = 0; 298 string_to_find.append(2, c); 299 while ((loc = output.find(string_to_find, loc)) != std::string::npos) { 300 output.erase(loc, 1); 301 } 302 303 return output; 304 } 305 306 // Canonicalizes url as per Google Safe Browsing Specification. 307 // See section 6.1 in 308 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. 309 void CanonicalizeUrl(const GURL& url, 310 std::string* canonicalized_hostname, 311 std::string* canonicalized_path, 312 std::string* canonicalized_query) { 313 DCHECK(url.is_valid()); 314 315 // We only canonicalize "normal" URLs. 316 if (!url.IsStandard()) 317 return; 318 319 // Following canonicalization steps are excluded since url parsing takes care 320 // of those :- 321 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url. 322 // (Exclude escaped version of these chars). 323 // 2. Normalize hostname to 4 dot-seperated decimal values. 324 // 3. Lowercase hostname. 325 // 4. Resolve path sequences "/../" and "/./". 326 327 // That leaves us with the following :- 328 // 1. Remove fragment in URL. 329 GURL url_without_fragment; 330 GURL::Replacements f_replacements; 331 f_replacements.ClearRef(); 332 f_replacements.ClearUsername(); 333 f_replacements.ClearPassword(); 334 url_without_fragment = url.ReplaceComponents(f_replacements); 335 336 // 2. Do URL unescaping until no more hex encoded characters exist. 337 std::string url_unescaped_str(Unescape(url_without_fragment.spec())); 338 url::Parsed parsed; 339 url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(), 340 &parsed); 341 342 // 3. In hostname, remove all leading and trailing dots. 343 const std::string host = 344 (parsed.host.len > 0) 345 ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len) 346 : std::string(); 347 std::string host_without_end_dots; 348 base::TrimString(host, ".", &host_without_end_dots); 349 350 // 4. In hostname, replace consecutive dots with a single dot. 351 std::string host_without_consecutive_dots(RemoveConsecutiveChars( 352 host_without_end_dots, '.')); 353 354 // 5. In path, replace runs of consecutive slashes with a single slash. 355 std::string path = 356 (parsed.path.len > 0) 357 ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len) 358 : std::string(); 359 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/')); 360 361 url::Replacements<char> hp_replacements; 362 hp_replacements.SetHost( 363 host_without_consecutive_dots.data(), 364 url::Component(0, host_without_consecutive_dots.length())); 365 hp_replacements.SetPath( 366 path_without_consecutive_slash.data(), 367 url::Component(0, path_without_consecutive_slash.length())); 368 369 std::string url_unescaped_with_can_hostpath; 370 url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath); 371 url::Parsed temp_parsed; 372 url::ReplaceComponents(url_unescaped_str.data(), 373 url_unescaped_str.length(), 374 parsed, 375 hp_replacements, 376 NULL, 377 &output, 378 &temp_parsed); 379 output.Complete(); 380 381 // 6. Step needed to revert escaping done in url::ReplaceComponents. 382 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath); 383 384 // 7. After performing all above steps, percent-escape all chars in url which 385 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters. 386 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath)); 387 url::Parsed final_parsed; 388 url::ParseStandardURL(escaped_canon_url_str.data(), 389 escaped_canon_url_str.length(), 390 &final_parsed); 391 392 if (canonicalized_hostname && final_parsed.host.len > 0) { 393 *canonicalized_hostname = 394 escaped_canon_url_str.substr(final_parsed.host.begin, 395 final_parsed.host.len); 396 } 397 if (canonicalized_path && final_parsed.path.len > 0) { 398 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin, 399 final_parsed.path.len); 400 } 401 if (canonicalized_query && final_parsed.query.len > 0) { 402 *canonicalized_query = escaped_canon_url_str.substr( 403 final_parsed.query.begin, final_parsed.query.len); 404 } 405 } 406 407 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { 408 hosts->clear(); 409 410 std::string canon_host; 411 CanonicalizeUrl(url, &canon_host, NULL, NULL); 412 413 const std::string host = canon_host; // const sidesteps GCC bugs below! 414 if (host.empty()) 415 return; 416 417 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 418 // hostnames formed by starting with the last 5 components and successively 419 // removing the leading component. The last component isn't examined alone, 420 // since it's the TLD or a subcomponent thereof. 421 // 422 // Note that we don't need to be clever about stopping at the "real" eTLD -- 423 // the data on the server side has been filtered to ensure it will not 424 // blacklist a whole TLD, and it's not significantly slower on our side to 425 // just check too much. 426 // 427 // Also note that because we have a simple blacklist, not some sort of complex 428 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check 429 // these in. 430 const size_t kMaxHostsToCheck = 4; 431 bool skipped_last_component = false; 432 for (std::string::const_reverse_iterator i(host.rbegin()); 433 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { 434 if (*i == '.') { 435 if (skipped_last_component) 436 hosts->push_back(std::string(i.base(), host.end())); 437 else 438 skipped_last_component = true; 439 } 440 } 441 hosts->push_back(host); 442 } 443 444 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { 445 paths->clear(); 446 447 std::string canon_path; 448 std::string canon_query; 449 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); 450 451 const std::string path = canon_path; // const sidesteps GCC bugs below! 452 const std::string query = canon_query; 453 if (path.empty()) 454 return; 455 456 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without 457 // the query parameters, and also up to 4 paths formed by starting at the root 458 // and adding more path components. 459 // 460 // As with the hosts above, it doesn't matter what order we check these in. 461 const size_t kMaxPathsToCheck = 4; 462 for (std::string::const_iterator i(path.begin()); 463 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { 464 if (*i == '/') 465 paths->push_back(std::string(path.begin(), i + 1)); 466 } 467 468 if (!paths->empty() && paths->back() != path) 469 paths->push_back(path); 470 471 if (!query.empty()) 472 paths->push_back(path + "?" + query); 473 } 474 475 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { 476 std::vector<std::string> hosts, paths; 477 GenerateHostsToCheck(url, &hosts); 478 GeneratePathsToCheck(url, &paths); 479 for (size_t h = 0; h < hosts.size(); ++h) { 480 for (size_t p = 0; p < paths.size(); ++p) { 481 urls->push_back(hosts[h] + paths[p]); 482 } 483 } 484 } 485 486 GURL GeneratePhishingReportUrl(const std::string& report_page, 487 const std::string& url_to_report, 488 bool is_client_side_detection) { 489 const std::string current_esc = net::EscapeQueryParamValue(url_to_report, 490 true); 491 492 #if defined(OS_WIN) 493 BrowserDistribution* dist = BrowserDistribution::GetDistribution(); 494 std::string client_name(dist->GetSafeBrowsingName()); 495 #else 496 std::string client_name("googlechrome"); 497 #endif 498 if (is_client_side_detection) 499 client_name.append("_csd"); 500 501 GURL report_url(report_page + base::StringPrintf(kReportParams, 502 client_name.c_str(), 503 current_esc.c_str())); 504 return google_util::AppendGoogleLocaleParam( 505 report_url, g_browser_process->GetApplicationLocale()); 506 } 507 508 SBFullHash StringToSBFullHash(const std::string& hash_in) { 509 DCHECK_EQ(crypto::kSHA256Length, hash_in.size()); 510 SBFullHash hash_out; 511 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length); 512 return hash_out; 513 } 514 515 std::string SBFullHashToString(const SBFullHash& hash) { 516 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash)); 517 return std::string(hash.full_hash, sizeof(hash.full_hash)); 518 } 519 520 } // namespace safe_browsing_util 521