Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_util.h"
      9 #include "base/strings/stringprintf.h"
     10 #include "chrome/browser/google/google_util.h"
     11 #include "crypto/sha2.h"
     12 #include "net/base/escape.h"
     13 #include "url/gurl.h"
     14 #include "url/url_util.h"
     15 
     16 #if defined(OS_WIN)
     17 #include "chrome/installer/util/browser_distribution.h"
     18 #endif
     19 
// printf-style query string appended to the report page URL by
// GeneratePhishingReportUrl(): the first %s receives the client name and the
// second %s receives the already-escaped URL being reported.
static const char kReportParams[] = "?tpl=%s&url=%s";
     21 
     22 // SBChunk ---------------------------------------------------------------------
     23 
     24 SBChunk::SBChunk()
     25     : chunk_number(0),
     26       list_id(0),
     27       is_add(false) {
     28 }
     29 
     30 SBChunk::~SBChunk() {}
     31 
     32 // SBChunkList -----------------------------------------------------------------
     33 
     34 SBChunkList::SBChunkList() {}
     35 
     36 SBChunkList::~SBChunkList() {
     37   clear();
     38 }
     39 
     40 void SBChunkList::clear() {
     41   for (std::vector<SBChunk>::iterator citer = chunks_.begin();
     42        citer != chunks_.end(); ++citer) {
     43     for (std::deque<SBChunkHost>::iterator hiter = citer->hosts.begin();
     44          hiter != citer->hosts.end(); ++hiter) {
     45       if (hiter->entry) {
     46         hiter->entry->Destroy();
     47         hiter->entry = NULL;
     48       }
     49     }
     50   }
     51   chunks_.clear();
     52 }
     53 
     54 // SBListChunkRanges -----------------------------------------------------------
     55 
     56 SBListChunkRanges::SBListChunkRanges(const std::string& n) : name(n) {}
     57 
     58 // SBChunkDelete ---------------------------------------------------------------
     59 
     60 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
     61 
     62 SBChunkDelete::~SBChunkDelete() {}
     63 
     64 // SBEntry ---------------------------------------------------------------------
     65 
     66 // static
     67 SBEntry* SBEntry::Create(Type type, int prefix_count) {
     68   int size = Size(type, prefix_count);
     69   SBEntry *rv = static_cast<SBEntry*>(malloc(size));
     70   memset(rv, 0, size);
     71   rv->set_type(type);
     72   rv->set_prefix_count(prefix_count);
     73   return rv;
     74 }
     75 
// Releases this entry's storage with free(), matching the malloc() performed
// in Create(). The object must not be touched after this call returns.
void SBEntry::Destroy() {
  free(this);
}
     79 
     80 // static
     81 int SBEntry::PrefixSize(Type type) {
     82   switch (type) {
     83     case ADD_PREFIX:
     84       return sizeof(SBPrefix);
     85     case ADD_FULL_HASH:
     86       return sizeof(SBFullHash);
     87     case SUB_PREFIX:
     88       return sizeof(SBSubPrefix);
     89     case SUB_FULL_HASH:
     90       return sizeof(SBSubFullHash);
     91     default:
     92       NOTREACHED();
     93       return 0;
     94   }
     95 }
     96 
     97 int SBEntry::Size() const {
     98   return Size(type(), prefix_count());
     99 }
    100 
    101 // static
    102 int SBEntry::Size(Type type, int prefix_count) {
    103   return sizeof(Data) + prefix_count * PrefixSize(type);
    104 }
    105 
    106 int SBEntry::ChunkIdAtPrefix(int index) const {
    107   if (type() == SUB_PREFIX)
    108     return sub_prefixes_[index].add_chunk;
    109   return (type() == SUB_FULL_HASH) ?
    110       sub_full_hashes_[index].add_chunk : chunk_id();
    111 }
    112 
    113 void SBEntry::SetChunkIdAtPrefix(int index, int chunk_id) {
    114   DCHECK(IsSub());
    115 
    116   if (type() == SUB_PREFIX)
    117     sub_prefixes_[index].add_chunk = chunk_id;
    118   else
    119     sub_full_hashes_[index].add_chunk = chunk_id;
    120 }
    121 
    122 const SBPrefix& SBEntry::PrefixAt(int index) const {
    123   DCHECK(IsPrefix());
    124 
    125   return IsAdd() ? add_prefixes_[index] : sub_prefixes_[index].prefix;
    126 }
    127 
    128 const SBFullHash& SBEntry::FullHashAt(int index) const {
    129   DCHECK(!IsPrefix());
    130 
    131   return IsAdd() ? add_full_hashes_[index] : sub_full_hashes_[index].prefix;
    132 }
    133 
    134 void SBEntry::SetPrefixAt(int index, const SBPrefix& prefix) {
    135   DCHECK(IsPrefix());
    136 
    137   if (IsAdd())
    138     add_prefixes_[index] = prefix;
    139   else
    140     sub_prefixes_[index].prefix = prefix;
    141 }
    142 
    143 void SBEntry::SetFullHashAt(int index, const SBFullHash& full_hash) {
    144   DCHECK(!IsPrefix());
    145 
    146   if (IsAdd())
    147     add_full_hashes_[index] = full_hash;
    148   else
    149     sub_full_hashes_[index].prefix = full_hash;
    150 }
    151 
    152 
    153 // Utility functions -----------------------------------------------------------
    154 
    155 namespace {
    156 bool IsKnownList(const std::string& name) {
    157   for (size_t i = 0; i < arraysize(safe_browsing_util::kAllLists); ++i) {
    158     if (!strcmp(safe_browsing_util::kAllLists[i], name.c_str())) {
    159       return true;
    160     }
    161   }
    162   return false;
    163 }
    164 }  // namespace
    165 
    166 namespace safe_browsing_util {
    167 
    168 // Listnames that browser can process.
    169 const char kMalwareList[] = "goog-malware-shavar";
    170 const char kPhishingList[] = "goog-phish-shavar";
    171 const char kBinUrlList[] = "goog-badbinurl-shavar";
    172 // We don't use the bad binary digest list anymore.  Use a fake listname to be
    173 // sure we don't request it accidentally.
    174 const char kBinHashList[] = "goog-badbin-digestvar-disabled";
    175 const char kCsdWhiteList[] = "goog-csdwhite-sha256";
    176 const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
    177 const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
    178 const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar";
    179 const char kIPBlacklist[] = "goog-badip-digest256";
    180 
    181 const char* kAllLists[10] = {
    182   kMalwareList,
    183   kPhishingList,
    184   kBinUrlList,
    185   kBinHashList,
    186   kCsdWhiteList,
    187   kDownloadWhiteList,
    188   kDownloadWhiteList,
    189   kExtensionBlacklist,
    190   kSideEffectFreeWhitelist,
    191   kIPBlacklist,
    192 };
    193 
    194 ListType GetListId(const std::string& name) {
    195   ListType id;
    196   if (name == safe_browsing_util::kMalwareList) {
    197     id = MALWARE;
    198   } else if (name == safe_browsing_util::kPhishingList) {
    199     id = PHISH;
    200   } else if (name == safe_browsing_util::kBinUrlList) {
    201     id = BINURL;
    202   } else if (name == safe_browsing_util::kBinHashList) {
    203     id = BINHASH;
    204   } else if (name == safe_browsing_util::kCsdWhiteList) {
    205     id = CSDWHITELIST;
    206   } else if (name == safe_browsing_util::kDownloadWhiteList) {
    207     id = DOWNLOADWHITELIST;
    208   } else if (name == safe_browsing_util::kExtensionBlacklist) {
    209     id = EXTENSIONBLACKLIST;
    210   } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) {
    211     id = SIDEEFFECTFREEWHITELIST;
    212   } else if (name == safe_browsing_util::kIPBlacklist) {
    213     id = IPBLACKLIST;
    214   } else {
    215     id = INVALID;
    216   }
    217   return id;
    218 }
    219 
    220 bool GetListName(ListType list_id, std::string* list) {
    221   switch (list_id) {
    222     case MALWARE:
    223       *list = safe_browsing_util::kMalwareList;
    224       break;
    225     case PHISH:
    226       *list = safe_browsing_util::kPhishingList;
    227       break;
    228     case BINURL:
    229       *list = safe_browsing_util::kBinUrlList;
    230       break;
    231     case BINHASH:
    232       *list = safe_browsing_util::kBinHashList;
    233       break;
    234     case CSDWHITELIST:
    235       *list = safe_browsing_util::kCsdWhiteList;
    236       break;
    237     case DOWNLOADWHITELIST:
    238       *list = safe_browsing_util::kDownloadWhiteList;
    239       break;
    240     case EXTENSIONBLACKLIST:
    241       *list = safe_browsing_util::kExtensionBlacklist;
    242       break;
    243     case SIDEEFFECTFREEWHITELIST:
    244       *list = safe_browsing_util::kSideEffectFreeWhitelist;
    245       break;
    246     case IPBLACKLIST:
    247       *list = safe_browsing_util::kIPBlacklist;
    248       break;
    249     default:
    250       return false;
    251   }
    252   DCHECK(IsKnownList(*list));
    253   return true;
    254 }
    255 
    256 std::string Unescape(const std::string& url) {
    257   std::string unescaped_str(url);
    258   std::string old_unescaped_str;
    259   const int kMaxLoopIterations = 1024;
    260   int loop_var = 0;
    261   do {
    262     old_unescaped_str = unescaped_str;
    263     unescaped_str = net::UnescapeURLComponent(old_unescaped_str,
    264         net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES |
    265         net::UnescapeRule::URL_SPECIAL_CHARS);
    266   } while (unescaped_str != old_unescaped_str && ++loop_var <=
    267            kMaxLoopIterations);
    268 
    269   return unescaped_str;
    270 }
    271 
    272 std::string Escape(const std::string& url) {
    273   std::string escaped_str;
    274   const char* kHexString = "0123456789ABCDEF";
    275   for (size_t i = 0; i < url.length(); i++) {
    276     unsigned char c = static_cast<unsigned char>(url[i]);
    277     if (c <= ' ' || c > '~' || c == '#' || c == '%') {
    278       escaped_str.push_back('%');
    279       escaped_str.push_back(kHexString[c >> 4]);
    280       escaped_str.push_back(kHexString[c & 0xf]);
    281     } else {
    282       escaped_str.push_back(c);
    283     }
    284   }
    285 
    286   return escaped_str;
    287 }
    288 
    289 std::string RemoveConsecutiveChars(const std::string& str, const char c) {
    290   std::string output(str);
    291   std::string string_to_find;
    292   std::string::size_type loc = 0;
    293   string_to_find.append(2, c);
    294   while ((loc = output.find(string_to_find, loc)) != std::string::npos) {
    295     output.erase(loc, 1);
    296   }
    297 
    298   return output;
    299 }
    300 
// Canonicalizes url as per Google Safe Browsing Specification.
// See section 6.1 in
// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
//
// |url| must be valid (checked in debug builds). Each of the three output
// pointers may be NULL; a non-NULL output is only assigned when the
// corresponding component of the canonicalized URL is non-empty, otherwise it
// is left as the caller passed it in. Non-standard URLs are not canonicalized
// and none of the outputs are written.
void CanonicalizeUrl(const GURL& url,
                     std::string* canonicalized_hostname,
                     std::string* canonicalized_path,
                     std::string* canonicalized_query) {
  DCHECK(url.is_valid());

  // We only canonicalize "normal" URLs.
  if (!url.IsStandard())
    return;

  // Following canonicalization steps are excluded since url parsing takes care
  // of those :-
  // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
  //    (Exclude escaped version of these chars).
  // 2. Normalize hostname to 4 dot-separated decimal values.
  // 3. Lowercase hostname.
  // 4. Resolve path sequences "/../" and "/./".

  // That leaves us with the following :-
  // 1. Remove fragment in URL.
  // (The username and password are cleared in the same replacement pass.)
  GURL url_without_fragment;
  GURL::Replacements f_replacements;
  f_replacements.ClearRef();
  f_replacements.ClearUsername();
  f_replacements.ClearPassword();
  url_without_fragment = url.ReplaceComponents(f_replacements);

  // 2. Do URL unescaping until no more hex encoded characters exist.
  // The unescaped string is re-parsed so that component offsets refer to it
  // rather than to the original spec.
  std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
  url_parse::Parsed parsed;
  url_parse::ParseStandardURL(url_unescaped_str.data(),
      url_unescaped_str.length(), &parsed);

  // 3. In hostname, remove all leading and trailing dots.
  const std::string host =
      (parsed.host.len > 0)
          ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len)
          : std::string();
  const char kCharsToTrim[] = ".";
  std::string host_without_end_dots;
  base::TrimString(host, kCharsToTrim, &host_without_end_dots);

  // 4. In hostname, replace consecutive dots with a single dot.
  std::string host_without_consecutive_dots(RemoveConsecutiveChars(
      host_without_end_dots, '.'));

  // 5. In path, replace runs of consecutive slashes with a single slash.
  std::string path =
      (parsed.path.len > 0)
          ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len)
          : std::string();
  std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));

  // Splice the cleaned-up host and path back into the unescaped URL.
  url_canon::Replacements<char> hp_replacements;
  hp_replacements.SetHost(host_without_consecutive_dots.data(),
  url_parse::Component(0, host_without_consecutive_dots.length()));
  hp_replacements.SetPath(path_without_consecutive_slash.data(),
  url_parse::Component(0, path_without_consecutive_slash.length()));

  std::string url_unescaped_with_can_hostpath;
  url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
  url_parse::Parsed temp_parsed;
  url_util::ReplaceComponents(url_unescaped_str.data(),
                              url_unescaped_str.length(), parsed,
                              hp_replacements, NULL, &output, &temp_parsed);
  output.Complete();

  // 6. Step needed to revert escaping done in url_util::ReplaceComponents.
  url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);

  // 7. After performing all above steps, percent-escape all chars in url which
  // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
  std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
  url_parse::Parsed final_parsed;
  url_parse::ParseStandardURL(escaped_canon_url_str.data(),
                              escaped_canon_url_str.length(), &final_parsed);

  // Hand back only the components the caller asked for, and only when they
  // are non-empty in the final canonical string.
  if (canonicalized_hostname && final_parsed.host.len > 0) {
    *canonicalized_hostname =
        escaped_canon_url_str.substr(final_parsed.host.begin,
                                     final_parsed.host.len);
  }
  if (canonicalized_path && final_parsed.path.len > 0) {
    *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
                                                       final_parsed.path.len);
  }
  if (canonicalized_query && final_parsed.query.len > 0) {
    *canonicalized_query = escaped_canon_url_str.substr(
        final_parsed.query.begin, final_parsed.query.len);
  }
}
    395 
    396 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
    397   hosts->clear();
    398 
    399   std::string canon_host;
    400   CanonicalizeUrl(url, &canon_host, NULL, NULL);
    401 
    402   const std::string host = canon_host;  // const sidesteps GCC bugs below!
    403   if (host.empty())
    404     return;
    405 
    406   // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
    407   // hostnames formed by starting with the last 5 components and successively
    408   // removing the leading component.  The last component isn't examined alone,
    409   // since it's the TLD or a subcomponent thereof.
    410   //
    411   // Note that we don't need to be clever about stopping at the "real" eTLD --
    412   // the data on the server side has been filtered to ensure it will not
    413   // blacklist a whole TLD, and it's not significantly slower on our side to
    414   // just check too much.
    415   //
    416   // Also note that because we have a simple blacklist, not some sort of complex
    417   // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
    418   // these in.
    419   const size_t kMaxHostsToCheck = 4;
    420   bool skipped_last_component = false;
    421   for (std::string::const_reverse_iterator i(host.rbegin());
    422        i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
    423     if (*i == '.') {
    424       if (skipped_last_component)
    425         hosts->push_back(std::string(i.base(), host.end()));
    426       else
    427         skipped_last_component = true;
    428     }
    429   }
    430   hosts->push_back(host);
    431 }
    432 
    433 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
    434   paths->clear();
    435 
    436   std::string canon_path;
    437   std::string canon_query;
    438   CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
    439 
    440   const std::string path = canon_path;   // const sidesteps GCC bugs below!
    441   const std::string query = canon_query;
    442   if (path.empty())
    443     return;
    444 
    445   // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
    446   // the query parameters, and also up to 4 paths formed by starting at the root
    447   // and adding more path components.
    448   //
    449   // As with the hosts above, it doesn't matter what order we check these in.
    450   const size_t kMaxPathsToCheck = 4;
    451   for (std::string::const_iterator i(path.begin());
    452        i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
    453     if (*i == '/')
    454       paths->push_back(std::string(path.begin(), i + 1));
    455   }
    456 
    457   if (!paths->empty() && paths->back() != path)
    458     paths->push_back(path);
    459 
    460   if (!query.empty())
    461     paths->push_back(path + "?" + query);
    462 }
    463 
    464 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
    465   std::vector<std::string> hosts, paths;
    466   GenerateHostsToCheck(url, &hosts);
    467   GeneratePathsToCheck(url, &paths);
    468   for (size_t h = 0; h < hosts.size(); ++h) {
    469     for (size_t p = 0; p < paths.size(); ++p) {
    470       urls->push_back(hosts[h] + paths[p]);
    471     }
    472   }
    473 }
    474 
    475 int GetHashIndex(const SBFullHash& hash,
    476                  const std::vector<SBFullHashResult>& full_hashes) {
    477   for (size_t i = 0; i < full_hashes.size(); ++i) {
    478     if (hash == full_hashes[i].hash)
    479       return static_cast<int>(i);
    480   }
    481   return -1;
    482 }
    483 
    484 int GetUrlHashIndex(const GURL& url,
    485                     const std::vector<SBFullHashResult>& full_hashes) {
    486   if (full_hashes.empty())
    487     return -1;
    488 
    489   std::vector<std::string> patterns;
    490   GeneratePatternsToCheck(url, &patterns);
    491 
    492   for (size_t i = 0; i < patterns.size(); ++i) {
    493     SBFullHash key;
    494     crypto::SHA256HashString(patterns[i], key.full_hash, sizeof(SBFullHash));
    495     int index = GetHashIndex(key, full_hashes);
    496     if (index != -1)
    497       return index;
    498   }
    499   return -1;
    500 }
    501 
    502 bool IsPhishingList(const std::string& list_name) {
    503   return list_name.compare(kPhishingList) == 0;
    504 }
    505 
    506 bool IsMalwareList(const std::string& list_name) {
    507   return list_name.compare(kMalwareList) == 0;
    508 }
    509 
    510 bool IsBadbinurlList(const std::string& list_name) {
    511   return list_name.compare(kBinUrlList) == 0;
    512 }
    513 
    514 bool IsBadbinhashList(const std::string& list_name) {
    515   return list_name.compare(kBinHashList) == 0;
    516 }
    517 
    518 bool IsExtensionList(const std::string& list_name) {
    519   return list_name.compare(kExtensionBlacklist) == 0;
    520 }
    521 
    522 GURL GeneratePhishingReportUrl(const std::string& report_page,
    523                                const std::string& url_to_report,
    524                                bool is_client_side_detection) {
    525   const std::string current_esc = net::EscapeQueryParamValue(url_to_report,
    526                                                              true);
    527 
    528 #if defined(OS_WIN)
    529   BrowserDistribution* dist = BrowserDistribution::GetDistribution();
    530   std::string client_name(dist->GetSafeBrowsingName());
    531 #else
    532   std::string client_name("googlechrome");
    533 #endif
    534   if (is_client_side_detection)
    535     client_name.append("_csd");
    536 
    537   GURL report_url(report_page + base::StringPrintf(kReportParams,
    538                                                    client_name.c_str(),
    539                                                    current_esc.c_str()));
    540   return google_util::AppendGoogleLocaleParam(report_url);
    541 }
    542 
    543 SBFullHash StringToSBFullHash(const std::string& hash_in) {
    544   DCHECK_EQ(crypto::kSHA256Length, hash_in.size());
    545   SBFullHash hash_out;
    546   memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);
    547   return hash_out;
    548 }
    549 
    550 std::string SBFullHashToString(const SBFullHash& hash) {
    551   DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));
    552   return std::string(hash.full_hash, sizeof(hash.full_hash));
    553 }
    554 
    555 }  // namespace safe_browsing_util
    556