Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_util.h"
      9 #include "base/strings/stringprintf.h"
     10 #include "chrome/browser/google/google_util.h"
     11 #include "crypto/sha2.h"
     12 #include "net/base/escape.h"
     13 #include "url/gurl.h"
     14 #include "url/url_util.h"
     15 
     16 #if defined(OS_WIN)
     17 #include "chrome/installer/util/browser_distribution.h"
     18 #endif
     19 
     20 static const char kReportParams[] = "?tpl=%s&url=%s";
     21 
     22 // SBChunk ---------------------------------------------------------------------
     23 
     24 SBChunk::SBChunk()
     25     : chunk_number(0),
     26       list_id(0),
     27       is_add(false) {
     28 }
     29 
     30 SBChunk::~SBChunk() {}
     31 
     32 // SBChunkList -----------------------------------------------------------------
     33 
     34 SBChunkList::SBChunkList() {}
     35 
     36 SBChunkList::~SBChunkList() {
     37   clear();
     38 }
     39 
     40 void SBChunkList::clear() {
     41   for (std::vector<SBChunk>::iterator citer = chunks_.begin();
     42        citer != chunks_.end(); ++citer) {
     43     for (std::deque<SBChunkHost>::iterator hiter = citer->hosts.begin();
     44          hiter != citer->hosts.end(); ++hiter) {
     45       if (hiter->entry) {
     46         hiter->entry->Destroy();
     47         hiter->entry = NULL;
     48       }
     49     }
     50   }
     51   chunks_.clear();
     52 }
     53 
     54 // SBListChunkRanges -----------------------------------------------------------
     55 
     56 SBListChunkRanges::SBListChunkRanges(const std::string& n) : name(n) {}
     57 
     58 // SBChunkDelete ---------------------------------------------------------------
     59 
     60 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
     61 
     62 SBChunkDelete::~SBChunkDelete() {}
     63 
     64 // SBEntry ---------------------------------------------------------------------
     65 
     66 // static
     67 SBEntry* SBEntry::Create(Type type, int prefix_count) {
     68   int size = Size(type, prefix_count);
     69   SBEntry *rv = static_cast<SBEntry*>(malloc(size));
     70   memset(rv, 0, size);
     71   rv->set_type(type);
     72   rv->set_prefix_count(prefix_count);
     73   return rv;
     74 }
     75 
     76 void SBEntry::Destroy() {
     77   free(this);
     78 }
     79 
     80 // static
     81 int SBEntry::PrefixSize(Type type) {
     82   switch (type) {
     83     case ADD_PREFIX:
     84       return sizeof(SBPrefix);
     85     case ADD_FULL_HASH:
     86       return sizeof(SBFullHash);
     87     case SUB_PREFIX:
     88       return sizeof(SBSubPrefix);
     89     case SUB_FULL_HASH:
     90       return sizeof(SBSubFullHash);
     91     default:
     92       NOTREACHED();
     93       return 0;
     94   }
     95 }
     96 
     97 int SBEntry::Size() const {
     98   return Size(type(), prefix_count());
     99 }
    100 
    101 // static
    102 int SBEntry::Size(Type type, int prefix_count) {
    103   return sizeof(Data) + prefix_count * PrefixSize(type);
    104 }
    105 
    106 int SBEntry::ChunkIdAtPrefix(int index) const {
    107   if (type() == SUB_PREFIX)
    108     return sub_prefixes_[index].add_chunk;
    109   return (type() == SUB_FULL_HASH) ?
    110       sub_full_hashes_[index].add_chunk : chunk_id();
    111 }
    112 
    113 void SBEntry::SetChunkIdAtPrefix(int index, int chunk_id) {
    114   DCHECK(IsSub());
    115 
    116   if (type() == SUB_PREFIX)
    117     sub_prefixes_[index].add_chunk = chunk_id;
    118   else
    119     sub_full_hashes_[index].add_chunk = chunk_id;
    120 }
    121 
    122 const SBPrefix& SBEntry::PrefixAt(int index) const {
    123   DCHECK(IsPrefix());
    124 
    125   return IsAdd() ? add_prefixes_[index] : sub_prefixes_[index].prefix;
    126 }
    127 
    128 const SBFullHash& SBEntry::FullHashAt(int index) const {
    129   DCHECK(!IsPrefix());
    130 
    131   return IsAdd() ? add_full_hashes_[index] : sub_full_hashes_[index].prefix;
    132 }
    133 
    134 void SBEntry::SetPrefixAt(int index, const SBPrefix& prefix) {
    135   DCHECK(IsPrefix());
    136 
    137   if (IsAdd())
    138     add_prefixes_[index] = prefix;
    139   else
    140     sub_prefixes_[index].prefix = prefix;
    141 }
    142 
    143 void SBEntry::SetFullHashAt(int index, const SBFullHash& full_hash) {
    144   DCHECK(!IsPrefix());
    145 
    146   if (IsAdd())
    147     add_full_hashes_[index] = full_hash;
    148   else
    149     sub_full_hashes_[index].prefix = full_hash;
    150 }
    151 
    152 
    153 // Utility functions -----------------------------------------------------------
    154 
    155 namespace safe_browsing_util {
    156 
    157 // Listnames that browser can process.
    158 const char kMalwareList[] = "goog-malware-shavar";
    159 const char kPhishingList[] = "goog-phish-shavar";
    160 const char kBinUrlList[] = "goog-badbinurl-shavar";
    161 // We don't use the bad binary digest list anymore.  Use a fake listname to be
    162 // sure we don't request it accidentally.
    163 const char kBinHashList[] = "goog-badbin-digestvar-disabled";
    164 const char kCsdWhiteList[] = "goog-csdwhite-sha256";
    165 const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
    166 const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
    167 const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar";
    168 
    169 ListType GetListId(const std::string& name) {
    170   ListType id;
    171   if (name == safe_browsing_util::kMalwareList) {
    172     id = MALWARE;
    173   } else if (name == safe_browsing_util::kPhishingList) {
    174     id = PHISH;
    175   } else if (name == safe_browsing_util::kBinUrlList) {
    176     id = BINURL;
    177   } else if (name == safe_browsing_util::kBinHashList) {
    178     id = BINHASH;
    179   } else if (name == safe_browsing_util::kCsdWhiteList) {
    180     id = CSDWHITELIST;
    181   } else if (name == safe_browsing_util::kDownloadWhiteList) {
    182     id = DOWNLOADWHITELIST;
    183   } else if (name == safe_browsing_util::kExtensionBlacklist) {
    184     id = EXTENSIONBLACKLIST;
    185   } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) {
    186     id = SIDEEFFECTFREEWHITELIST;
    187   } else {
    188     id = INVALID;
    189   }
    190   return id;
    191 }
    192 
    193 bool GetListName(ListType list_id, std::string* list) {
    194   switch (list_id) {
    195     case MALWARE:
    196       *list = safe_browsing_util::kMalwareList;
    197       break;
    198     case PHISH:
    199       *list = safe_browsing_util::kPhishingList;
    200       break;
    201     case BINURL:
    202       *list = safe_browsing_util::kBinUrlList;
    203       break;
    204     case BINHASH:
    205       *list = safe_browsing_util::kBinHashList;
    206       break;
    207     case CSDWHITELIST:
    208       *list = safe_browsing_util::kCsdWhiteList;
    209       break;
    210     case DOWNLOADWHITELIST:
    211       *list = safe_browsing_util::kDownloadWhiteList;
    212       break;
    213     case EXTENSIONBLACKLIST:
    214       *list = safe_browsing_util::kExtensionBlacklist;
    215       break;
    216     case SIDEEFFECTFREEWHITELIST:
    217       *list = safe_browsing_util::kSideEffectFreeWhitelist;
    218       break;
    219     default:
    220       return false;
    221   }
    222   return true;
    223 }
    224 
    225 std::string Unescape(const std::string& url) {
    226   std::string unescaped_str(url);
    227   std::string old_unescaped_str;
    228   const int kMaxLoopIterations = 1024;
    229   int loop_var = 0;
    230   do {
    231     old_unescaped_str = unescaped_str;
    232     unescaped_str = net::UnescapeURLComponent(old_unescaped_str,
    233         net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES |
    234         net::UnescapeRule::URL_SPECIAL_CHARS);
    235   } while (unescaped_str != old_unescaped_str && ++loop_var <=
    236            kMaxLoopIterations);
    237 
    238   return unescaped_str;
    239 }
    240 
    241 std::string Escape(const std::string& url) {
    242   std::string escaped_str;
    243   const char* kHexString = "0123456789ABCDEF";
    244   for (size_t i = 0; i < url.length(); i++) {
    245     unsigned char c = static_cast<unsigned char>(url[i]);
    246     if (c <= ' ' || c > '~' || c == '#' || c == '%') {
    247       escaped_str.push_back('%');
    248       escaped_str.push_back(kHexString[c >> 4]);
    249       escaped_str.push_back(kHexString[c & 0xf]);
    250     } else {
    251       escaped_str.push_back(c);
    252     }
    253   }
    254 
    255   return escaped_str;
    256 }
    257 
    258 std::string RemoveConsecutiveChars(const std::string& str, const char c) {
    259   std::string output(str);
    260   std::string string_to_find;
    261   std::string::size_type loc = 0;
    262   string_to_find.append(2, c);
    263   while ((loc = output.find(string_to_find, loc)) != std::string::npos) {
    264     output.erase(loc, 1);
    265   }
    266 
    267   return output;
    268 }
    269 
    270 // Canonicalizes url as per Google Safe Browsing Specification.
    271 // See section 6.1 in
    272 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
    273 void CanonicalizeUrl(const GURL& url,
    274                      std::string* canonicalized_hostname,
    275                      std::string* canonicalized_path,
    276                      std::string* canonicalized_query) {
    277   DCHECK(url.is_valid());
    278 
    279   // We only canonicalize "normal" URLs.
    280   if (!url.IsStandard())
    281     return;
    282 
    283   // Following canonicalization steps are excluded since url parsing takes care
    284   // of those :-
    285   // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
    286   //    (Exclude escaped version of these chars).
    287   // 2. Normalize hostname to 4 dot-seperated decimal values.
    288   // 3. Lowercase hostname.
    289   // 4. Resolve path sequences "/../" and "/./".
    290 
    291   // That leaves us with the following :-
    292   // 1. Remove fragment in URL.
    293   GURL url_without_fragment;
    294   GURL::Replacements f_replacements;
    295   f_replacements.ClearRef();
    296   f_replacements.ClearUsername();
    297   f_replacements.ClearPassword();
    298   url_without_fragment = url.ReplaceComponents(f_replacements);
    299 
    300   // 2. Do URL unescaping until no more hex encoded characters exist.
    301   std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
    302   url_parse::Parsed parsed;
    303   url_parse::ParseStandardURL(url_unescaped_str.data(),
    304       url_unescaped_str.length(), &parsed);
    305 
    306   // 3. In hostname, remove all leading and trailing dots.
    307   const std::string host =
    308       (parsed.host.len > 0)
    309           ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len)
    310           : std::string();
    311   const char kCharsToTrim[] = ".";
    312   std::string host_without_end_dots;
    313   TrimString(host, kCharsToTrim, &host_without_end_dots);
    314 
    315   // 4. In hostname, replace consecutive dots with a single dot.
    316   std::string host_without_consecutive_dots(RemoveConsecutiveChars(
    317       host_without_end_dots, '.'));
    318 
    319   // 5. In path, replace runs of consecutive slashes with a single slash.
    320   std::string path =
    321       (parsed.path.len > 0)
    322           ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len)
    323           : std::string();
    324   std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));
    325 
    326   url_canon::Replacements<char> hp_replacements;
    327   hp_replacements.SetHost(host_without_consecutive_dots.data(),
    328   url_parse::Component(0, host_without_consecutive_dots.length()));
    329   hp_replacements.SetPath(path_without_consecutive_slash.data(),
    330   url_parse::Component(0, path_without_consecutive_slash.length()));
    331 
    332   std::string url_unescaped_with_can_hostpath;
    333   url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
    334   url_parse::Parsed temp_parsed;
    335   url_util::ReplaceComponents(url_unescaped_str.data(),
    336                               url_unescaped_str.length(), parsed,
    337                               hp_replacements, NULL, &output, &temp_parsed);
    338   output.Complete();
    339 
    340   // 6. Step needed to revert escaping done in url_util::ReplaceComponents.
    341   url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);
    342 
    343   // 7. After performing all above steps, percent-escape all chars in url which
    344   // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
    345   std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
    346   url_parse::Parsed final_parsed;
    347   url_parse::ParseStandardURL(escaped_canon_url_str.data(),
    348                               escaped_canon_url_str.length(), &final_parsed);
    349 
    350   if (canonicalized_hostname && final_parsed.host.len > 0) {
    351     *canonicalized_hostname =
    352         escaped_canon_url_str.substr(final_parsed.host.begin,
    353                                      final_parsed.host.len);
    354   }
    355   if (canonicalized_path && final_parsed.path.len > 0) {
    356     *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
    357                                                        final_parsed.path.len);
    358   }
    359   if (canonicalized_query && final_parsed.query.len > 0) {
    360     *canonicalized_query = escaped_canon_url_str.substr(
    361         final_parsed.query.begin, final_parsed.query.len);
    362   }
    363 }
    364 
    365 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
    366   hosts->clear();
    367 
    368   std::string canon_host;
    369   CanonicalizeUrl(url, &canon_host, NULL, NULL);
    370 
    371   const std::string host = canon_host;  // const sidesteps GCC bugs below!
    372   if (host.empty())
    373     return;
    374 
    375   // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
    376   // hostnames formed by starting with the last 5 components and successively
    377   // removing the leading component.  The last component isn't examined alone,
    378   // since it's the TLD or a subcomponent thereof.
    379   //
    380   // Note that we don't need to be clever about stopping at the "real" eTLD --
    381   // the data on the server side has been filtered to ensure it will not
    382   // blacklist a whole TLD, and it's not significantly slower on our side to
    383   // just check too much.
    384   //
    385   // Also note that because we have a simple blacklist, not some sort of complex
    386   // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
    387   // these in.
    388   const size_t kMaxHostsToCheck = 4;
    389   bool skipped_last_component = false;
    390   for (std::string::const_reverse_iterator i(host.rbegin());
    391        i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
    392     if (*i == '.') {
    393       if (skipped_last_component)
    394         hosts->push_back(std::string(i.base(), host.end()));
    395       else
    396         skipped_last_component = true;
    397     }
    398   }
    399   hosts->push_back(host);
    400 }
    401 
    402 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
    403   paths->clear();
    404 
    405   std::string canon_path;
    406   std::string canon_query;
    407   CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
    408 
    409   const std::string path = canon_path;   // const sidesteps GCC bugs below!
    410   const std::string query = canon_query;
    411   if (path.empty())
    412     return;
    413 
    414   // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
    415   // the query parameters, and also up to 4 paths formed by starting at the root
    416   // and adding more path components.
    417   //
    418   // As with the hosts above, it doesn't matter what order we check these in.
    419   const size_t kMaxPathsToCheck = 4;
    420   for (std::string::const_iterator i(path.begin());
    421        i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
    422     if (*i == '/')
    423       paths->push_back(std::string(path.begin(), i + 1));
    424   }
    425 
    426   if (!paths->empty() && paths->back() != path)
    427     paths->push_back(path);
    428 
    429   if (!query.empty())
    430     paths->push_back(path + "?" + query);
    431 }
    432 
    433 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
    434   std::vector<std::string> hosts, paths;
    435   GenerateHostsToCheck(url, &hosts);
    436   GeneratePathsToCheck(url, &paths);
    437   for (size_t h = 0; h < hosts.size(); ++h) {
    438     for (size_t p = 0; p < paths.size(); ++p) {
    439       urls->push_back(hosts[h] + paths[p]);
    440     }
    441   }
    442 }
    443 
    444 int GetHashIndex(const SBFullHash& hash,
    445                  const std::vector<SBFullHashResult>& full_hashes) {
    446   for (size_t i = 0; i < full_hashes.size(); ++i) {
    447     if (hash == full_hashes[i].hash)
    448       return static_cast<int>(i);
    449   }
    450   return -1;
    451 }
    452 
    453 int GetUrlHashIndex(const GURL& url,
    454                     const std::vector<SBFullHashResult>& full_hashes) {
    455   if (full_hashes.empty())
    456     return -1;
    457 
    458   std::vector<std::string> patterns;
    459   GeneratePatternsToCheck(url, &patterns);
    460 
    461   for (size_t i = 0; i < patterns.size(); ++i) {
    462     SBFullHash key;
    463     crypto::SHA256HashString(patterns[i], key.full_hash, sizeof(SBFullHash));
    464     int index = GetHashIndex(key, full_hashes);
    465     if (index != -1)
    466       return index;
    467   }
    468   return -1;
    469 }
    470 
    471 bool IsPhishingList(const std::string& list_name) {
    472   return list_name.compare(kPhishingList) == 0;
    473 }
    474 
    475 bool IsMalwareList(const std::string& list_name) {
    476   return list_name.compare(kMalwareList) == 0;
    477 }
    478 
    479 bool IsBadbinurlList(const std::string& list_name) {
    480   return list_name.compare(kBinUrlList) == 0;
    481 }
    482 
    483 bool IsBadbinhashList(const std::string& list_name) {
    484   return list_name.compare(kBinHashList) == 0;
    485 }
    486 
    487 bool IsExtensionList(const std::string& list_name) {
    488   return list_name.compare(kExtensionBlacklist) == 0;
    489 }
    490 
    491 GURL GeneratePhishingReportUrl(const std::string& report_page,
    492                                const std::string& url_to_report,
    493                                bool is_client_side_detection) {
    494   const std::string current_esc = net::EscapeQueryParamValue(url_to_report,
    495                                                              true);
    496 
    497 #if defined(OS_WIN)
    498   BrowserDistribution* dist = BrowserDistribution::GetDistribution();
    499   std::string client_name(dist->GetSafeBrowsingName());
    500 #else
    501   std::string client_name("googlechrome");
    502 #endif
    503   if (is_client_side_detection)
    504     client_name.append("_csd");
    505 
    506   GURL report_url(report_page + base::StringPrintf(kReportParams,
    507                                                    client_name.c_str(),
    508                                                    current_esc.c_str()));
    509   return google_util::AppendGoogleLocaleParam(report_url);
    510 }
    511 
    512 SBFullHash StringToSBFullHash(const std::string& hash_in) {
    513   DCHECK_EQ(crypto::kSHA256Length, hash_in.size());
    514   SBFullHash hash_out;
    515   memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);
    516   return hash_out;
    517 }
    518 
    519 std::string SBFullHashToString(const SBFullHash& hash) {
    520   DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));
    521   return std::string(hash.full_hash, sizeof(hash.full_hash));
    522 }
    523 
    524 }  // namespace safe_browsing_util
    525