1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/safe_browsing/safe_browsing_util.h" 6 7 #include "base/logging.h" 8 #include "base/strings/string_util.h" 9 #include "base/strings/stringprintf.h" 10 #include "chrome/browser/browser_process.h" 11 #include "chrome/browser/safe_browsing/chunk.pb.h" 12 #include "components/google/core/browser/google_util.h" 13 #include "crypto/sha2.h" 14 #include "net/base/escape.h" 15 #include "url/gurl.h" 16 #include "url/url_util.h" 17 18 #if defined(OS_WIN) 19 #include "chrome/installer/util/browser_distribution.h" 20 #endif 21 22 static const char kReportParams[] = "?tpl=%s&url=%s"; 23 24 SBFullHash SBFullHashForString(const base::StringPiece& str) { 25 SBFullHash h; 26 crypto::SHA256HashString(str, &h.full_hash, sizeof(h.full_hash)); 27 return h; 28 } 29 30 // SBChunkData ----------------------------------------------------------------- 31 32 // TODO(shess): Right now this contains a scoped_ptr<ChunkData> so that the 33 // proto buffer isn't copied all over the place, then these are contained in a 34 // ScopedVector for purposes of passing things around between tasks. This seems 35 // convoluted. Maybe it would make sense to have an overall container class 36 // returning references to a nested per-chunk class? 37 38 SBChunkData::SBChunkData() { 39 } 40 41 SBChunkData::SBChunkData(safe_browsing::ChunkData* raw_data) 42 : chunk_data_(raw_data) { 43 DCHECK(chunk_data_.get()); 44 } 45 46 SBChunkData::~SBChunkData() { 47 } 48 49 bool SBChunkData::ParseFrom(const unsigned char* data, size_t length) { 50 scoped_ptr<safe_browsing::ChunkData> chunk(new safe_browsing::ChunkData()); 51 if (!chunk->ParseFromArray(data, length)) 52 return false; 53 54 if (chunk->chunk_type() != safe_browsing::ChunkData::ADD && 55 chunk->chunk_type() != safe_browsing::ChunkData::SUB) { 56 return false; 57 } 58 59 size_t hash_size = 0; 60 if (chunk->prefix_type() == safe_browsing::ChunkData::PREFIX_4B) { 61 hash_size = sizeof(SBPrefix); 62 } else if (chunk->prefix_type() == safe_browsing::ChunkData::FULL_32B) { 63 hash_size = sizeof(SBFullHash); 64 } else { 65 return false; 66 } 67 68 const size_t hash_count = chunk->hashes().size() / hash_size; 69 if (hash_count * hash_size != chunk->hashes().size()) 70 return false; 71 72 if (chunk->chunk_type() == safe_browsing::ChunkData::SUB && 73 static_cast<size_t>(chunk->add_numbers_size()) != hash_count) { 74 return false; 75 } 76 77 chunk_data_.swap(chunk); 78 return true; 79 } 80 81 int SBChunkData::ChunkNumber() const { 82 return chunk_data_->chunk_number(); 83 } 84 85 bool SBChunkData::IsAdd() const { 86 return chunk_data_->chunk_type() == safe_browsing::ChunkData::ADD; 87 } 88 89 bool SBChunkData::IsSub() const { 90 return chunk_data_->chunk_type() == safe_browsing::ChunkData::SUB; 91 } 92 93 int SBChunkData::AddChunkNumberAt(size_t i) const { 94 DCHECK(IsSub()); 95 DCHECK((IsPrefix() && i < PrefixCount()) || 96 (IsFullHash() && i < FullHashCount())); 97 return chunk_data_->add_numbers(i); 98 } 99 100 bool SBChunkData::IsPrefix() const { 101 return chunk_data_->prefix_type() == safe_browsing::ChunkData::PREFIX_4B; 102 } 103 104 size_t SBChunkData::PrefixCount() const { 105 DCHECK(IsPrefix()); 106 return chunk_data_->hashes().size() / sizeof(SBPrefix); 107 } 108 109 SBPrefix SBChunkData::PrefixAt(size_t i) const { 110 DCHECK(IsPrefix()); 111 DCHECK_LT(i, PrefixCount()); 112 113 SBPrefix prefix; 114 memcpy(&prefix, chunk_data_->hashes().data() + i * sizeof(SBPrefix), 115 sizeof(SBPrefix)); 116 return prefix; 117 } 118 119 bool SBChunkData::IsFullHash() const { 120 return chunk_data_->prefix_type() == safe_browsing::ChunkData::FULL_32B; 121 } 122 123 size_t SBChunkData::FullHashCount() const { 124 DCHECK(IsFullHash()); 125 return chunk_data_->hashes().size() / sizeof(SBFullHash); 126 } 127 128 SBFullHash SBChunkData::FullHashAt(size_t i) const { 129 DCHECK(IsFullHash()); 130 DCHECK_LT(i, FullHashCount()); 131 132 SBFullHash full_hash; 133 memcpy(&full_hash, chunk_data_->hashes().data() + i * sizeof(SBFullHash), 134 sizeof(SBFullHash)); 135 return full_hash; 136 } 137 138 // SBListChunkRanges ----------------------------------------------------------- 139 140 SBListChunkRanges::SBListChunkRanges(const std::string& n) 141 : name(n) { 142 } 143 144 // SBChunkDelete --------------------------------------------------------------- 145 146 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {} 147 148 SBChunkDelete::~SBChunkDelete() {} 149 150 // Utility functions ----------------------------------------------------------- 151 152 namespace { 153 bool IsKnownList(const std::string& name) { 154 for (size_t i = 0; i < arraysize(safe_browsing_util::kAllLists); ++i) { 155 if (!strcmp(safe_browsing_util::kAllLists[i], name.c_str())) { 156 return true; 157 } 158 } 159 return false; 160 } 161 } // namespace 162 163 namespace safe_browsing_util { 164 165 // Listnames that browser can process. 166 const char kMalwareList[] = "goog-malware-shavar"; 167 const char kPhishingList[] = "goog-phish-shavar"; 168 const char kBinUrlList[] = "goog-badbinurl-shavar"; 169 const char kCsdWhiteList[] = "goog-csdwhite-sha256"; 170 const char kDownloadWhiteList[] = "goog-downloadwhite-digest256"; 171 const char kExtensionBlacklist[] = "goog-badcrxids-digestvar"; 172 const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar"; 173 const char kIPBlacklist[] = "goog-badip-digest256"; 174 175 const char* kAllLists[8] = { 176 kMalwareList, 177 kPhishingList, 178 kBinUrlList, 179 kCsdWhiteList, 180 kDownloadWhiteList, 181 kExtensionBlacklist, 182 kSideEffectFreeWhitelist, 183 kIPBlacklist, 184 }; 185 186 ListType GetListId(const base::StringPiece& name) { 187 ListType id; 188 if (name == safe_browsing_util::kMalwareList) { 189 id = MALWARE; 190 } else if (name == safe_browsing_util::kPhishingList) { 191 id = PHISH; 192 } else if (name == safe_browsing_util::kBinUrlList) { 193 id = BINURL; 194 } else if (name == safe_browsing_util::kCsdWhiteList) { 195 id = CSDWHITELIST; 196 } else if (name == safe_browsing_util::kDownloadWhiteList) { 197 id = DOWNLOADWHITELIST; 198 } else if (name == safe_browsing_util::kExtensionBlacklist) { 199 id = EXTENSIONBLACKLIST; 200 } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) { 201 id = SIDEEFFECTFREEWHITELIST; 202 } else if (name == safe_browsing_util::kIPBlacklist) { 203 id = IPBLACKLIST; 204 } else { 205 id = INVALID; 206 } 207 return id; 208 } 209 210 bool GetListName(ListType list_id, std::string* list) { 211 switch (list_id) { 212 case MALWARE: 213 *list = safe_browsing_util::kMalwareList; 214 break; 215 case PHISH: 216 *list = safe_browsing_util::kPhishingList; 217 break; 218 case BINURL: 219 *list = safe_browsing_util::kBinUrlList; 220 break; 221 case CSDWHITELIST: 222 *list = safe_browsing_util::kCsdWhiteList; 223 break; 224 case DOWNLOADWHITELIST: 225 *list = safe_browsing_util::kDownloadWhiteList; 226 break; 227 case EXTENSIONBLACKLIST: 228 *list = safe_browsing_util::kExtensionBlacklist; 229 break; 230 case SIDEEFFECTFREEWHITELIST: 231 *list = safe_browsing_util::kSideEffectFreeWhitelist; 232 break; 233 case IPBLACKLIST: 234 *list = safe_browsing_util::kIPBlacklist; 235 break; 236 default: 237 return false; 238 } 239 DCHECK(IsKnownList(*list)); 240 return true; 241 } 242 243 std::string Unescape(const std::string& url) { 244 std::string unescaped_str(url); 245 std::string old_unescaped_str; 246 const int kMaxLoopIterations = 1024; 247 int loop_var = 0; 248 do { 249 old_unescaped_str = unescaped_str; 250 unescaped_str = net::UnescapeURLComponent(old_unescaped_str, 251 net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES | 252 net::UnescapeRule::URL_SPECIAL_CHARS); 253 } while (unescaped_str != old_unescaped_str && ++loop_var <= 254 kMaxLoopIterations); 255 256 return unescaped_str; 257 } 258 259 std::string Escape(const std::string& url) { 260 std::string escaped_str; 261 const char* kHexString = "0123456789ABCDEF"; 262 for (size_t i = 0; i < url.length(); i++) { 263 unsigned char c = static_cast<unsigned char>(url[i]); 264 if (c <= ' ' || c > '~' || c == '#' || c == '%') { 265 escaped_str.push_back('%'); 266 escaped_str.push_back(kHexString[c >> 4]); 267 escaped_str.push_back(kHexString[c & 0xf]); 268 } else { 269 escaped_str.push_back(c); 270 } 271 } 272 273 return escaped_str; 274 } 275 276 std::string RemoveConsecutiveChars(const std::string& str, const char c) { 277 std::string output(str); 278 std::string string_to_find; 279 std::string::size_type loc = 0; 280 string_to_find.append(2, c); 281 while ((loc = output.find(string_to_find, loc)) != std::string::npos) { 282 output.erase(loc, 1); 283 } 284 285 return output; 286 } 287 288 // Canonicalizes url as per Google Safe Browsing Specification. 289 // See section 6.1 in 290 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. 291 void CanonicalizeUrl(const GURL& url, 292 std::string* canonicalized_hostname, 293 std::string* canonicalized_path, 294 std::string* canonicalized_query) { 295 DCHECK(url.is_valid()); 296 297 // We only canonicalize "normal" URLs. 298 if (!url.IsStandard()) 299 return; 300 301 // Following canonicalization steps are excluded since url parsing takes care 302 // of those :- 303 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url. 304 // (Exclude escaped version of these chars). 305 // 2. Normalize hostname to 4 dot-seperated decimal values. 306 // 3. Lowercase hostname. 307 // 4. Resolve path sequences "/../" and "/./". 308 309 // That leaves us with the following :- 310 // 1. Remove fragment in URL. 311 GURL url_without_fragment; 312 GURL::Replacements f_replacements; 313 f_replacements.ClearRef(); 314 f_replacements.ClearUsername(); 315 f_replacements.ClearPassword(); 316 url_without_fragment = url.ReplaceComponents(f_replacements); 317 318 // 2. Do URL unescaping until no more hex encoded characters exist. 319 std::string url_unescaped_str(Unescape(url_without_fragment.spec())); 320 url::Parsed parsed; 321 url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(), 322 &parsed); 323 324 // 3. In hostname, remove all leading and trailing dots. 325 const std::string host = 326 (parsed.host.len > 0) 327 ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len) 328 : std::string(); 329 std::string host_without_end_dots; 330 base::TrimString(host, ".", &host_without_end_dots); 331 332 // 4. In hostname, replace consecutive dots with a single dot. 333 std::string host_without_consecutive_dots(RemoveConsecutiveChars( 334 host_without_end_dots, '.')); 335 336 // 5. In path, replace runs of consecutive slashes with a single slash. 337 std::string path = 338 (parsed.path.len > 0) 339 ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len) 340 : std::string(); 341 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/')); 342 343 url::Replacements<char> hp_replacements; 344 hp_replacements.SetHost( 345 host_without_consecutive_dots.data(), 346 url::Component(0, host_without_consecutive_dots.length())); 347 hp_replacements.SetPath( 348 path_without_consecutive_slash.data(), 349 url::Component(0, path_without_consecutive_slash.length())); 350 351 std::string url_unescaped_with_can_hostpath; 352 url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath); 353 url::Parsed temp_parsed; 354 url::ReplaceComponents(url_unescaped_str.data(), 355 url_unescaped_str.length(), 356 parsed, 357 hp_replacements, 358 NULL, 359 &output, 360 &temp_parsed); 361 output.Complete(); 362 363 // 6. Step needed to revert escaping done in url::ReplaceComponents. 364 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath); 365 366 // 7. After performing all above steps, percent-escape all chars in url which 367 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters. 368 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath)); 369 url::Parsed final_parsed; 370 url::ParseStandardURL(escaped_canon_url_str.data(), 371 escaped_canon_url_str.length(), 372 &final_parsed); 373 374 if (canonicalized_hostname && final_parsed.host.len > 0) { 375 *canonicalized_hostname = 376 escaped_canon_url_str.substr(final_parsed.host.begin, 377 final_parsed.host.len); 378 } 379 if (canonicalized_path && final_parsed.path.len > 0) { 380 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin, 381 final_parsed.path.len); 382 } 383 if (canonicalized_query && final_parsed.query.len > 0) { 384 *canonicalized_query = escaped_canon_url_str.substr( 385 final_parsed.query.begin, final_parsed.query.len); 386 } 387 } 388 389 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { 390 hosts->clear(); 391 392 std::string canon_host; 393 CanonicalizeUrl(url, &canon_host, NULL, NULL); 394 395 const std::string host = canon_host; // const sidesteps GCC bugs below! 396 if (host.empty()) 397 return; 398 399 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 400 // hostnames formed by starting with the last 5 components and successively 401 // removing the leading component. The last component isn't examined alone, 402 // since it's the TLD or a subcomponent thereof. 403 // 404 // Note that we don't need to be clever about stopping at the "real" eTLD -- 405 // the data on the server side has been filtered to ensure it will not 406 // blacklist a whole TLD, and it's not significantly slower on our side to 407 // just check too much. 408 // 409 // Also note that because we have a simple blacklist, not some sort of complex 410 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check 411 // these in. 412 const size_t kMaxHostsToCheck = 4; 413 bool skipped_last_component = false; 414 for (std::string::const_reverse_iterator i(host.rbegin()); 415 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { 416 if (*i == '.') { 417 if (skipped_last_component) 418 hosts->push_back(std::string(i.base(), host.end())); 419 else 420 skipped_last_component = true; 421 } 422 } 423 hosts->push_back(host); 424 } 425 426 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { 427 paths->clear(); 428 429 std::string canon_path; 430 std::string canon_query; 431 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); 432 433 const std::string path = canon_path; // const sidesteps GCC bugs below! 434 const std::string query = canon_query; 435 if (path.empty()) 436 return; 437 438 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without 439 // the query parameters, and also up to 4 paths formed by starting at the root 440 // and adding more path components. 441 // 442 // As with the hosts above, it doesn't matter what order we check these in. 443 const size_t kMaxPathsToCheck = 4; 444 for (std::string::const_iterator i(path.begin()); 445 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { 446 if (*i == '/') 447 paths->push_back(std::string(path.begin(), i + 1)); 448 } 449 450 if (!paths->empty() && paths->back() != path) 451 paths->push_back(path); 452 453 if (!query.empty()) 454 paths->push_back(path + "?" + query); 455 } 456 457 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { 458 std::vector<std::string> hosts, paths; 459 GenerateHostsToCheck(url, &hosts); 460 GeneratePathsToCheck(url, &paths); 461 for (size_t h = 0; h < hosts.size(); ++h) { 462 for (size_t p = 0; p < paths.size(); ++p) { 463 urls->push_back(hosts[h] + paths[p]); 464 } 465 } 466 } 467 468 GURL GeneratePhishingReportUrl(const std::string& report_page, 469 const std::string& url_to_report, 470 bool is_client_side_detection) { 471 const std::string current_esc = net::EscapeQueryParamValue(url_to_report, 472 true); 473 474 #if defined(OS_WIN) 475 BrowserDistribution* dist = BrowserDistribution::GetDistribution(); 476 std::string client_name(dist->GetSafeBrowsingName()); 477 #else 478 std::string client_name("googlechrome"); 479 #endif 480 if (is_client_side_detection) 481 client_name.append("_csd"); 482 483 GURL report_url(report_page + base::StringPrintf(kReportParams, 484 client_name.c_str(), 485 current_esc.c_str())); 486 return google_util::AppendGoogleLocaleParam( 487 report_url, g_browser_process->GetApplicationLocale()); 488 } 489 490 SBFullHash StringToSBFullHash(const std::string& hash_in) { 491 DCHECK_EQ(crypto::kSHA256Length, hash_in.size()); 492 SBFullHash hash_out; 493 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length); 494 return hash_out; 495 } 496 497 std::string SBFullHashToString(const SBFullHash& hash) { 498 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash)); 499 return std::string(hash.full_hash, sizeof(hash.full_hash)); 500 } 501 502 } // namespace safe_browsing_util 503