Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h"
      6 
      7 #include <map>
      8 #include <utility>
      9 
     10 #include "base/bind.h"
     11 #include "base/bind_helpers.h"
     12 #include "base/format_macros.h"
     13 #include "base/stl_util.h"
     14 #include "base/strings/stringprintf.h"
     15 #include "base/time/time.h"
     16 #include "chrome/browser/common/cancelable_request.h"
     17 #include "chrome/browser/history/history_service.h"
     18 #include "chrome/browser/history/history_service_factory.h"
     19 #include "chrome/browser/history/history_types.h"
     20 #include "chrome/browser/profiles/profile.h"
     21 #include "chrome/browser/safe_browsing/browser_features.h"
     22 #include "chrome/browser/safe_browsing/client_side_detection_service.h"
     23 #include "chrome/common/safe_browsing/csd.pb.h"
     24 #include "content/public/browser/browser_thread.h"
     25 #include "content/public/browser/navigation_controller.h"
     26 #include "content/public/browser/navigation_entry.h"
     27 #include "content/public/browser/web_contents.h"
     28 #include "content/public/common/page_transition_types.h"
     29 #include "url/gurl.h"
     30 
     31 using content::BrowserThread;
     32 using content::NavigationController;
     33 using content::NavigationEntry;
     34 using content::WebContents;
     35 
     36 namespace safe_browsing {
     37 
     38 const int BrowserFeatureExtractor::kMaxMalwareIPPerRequest = 5;
     39 
     40 BrowseInfo::BrowseInfo() : http_status_code(0) {}
     41 
     42 BrowseInfo::~BrowseInfo() {}
     43 
     44 static void AddFeature(const std::string& feature_name,
     45                        double feature_value,
     46                        ClientPhishingRequest* request) {
     47   DCHECK(request);
     48   ClientPhishingRequest::Feature* feature =
     49       request->add_non_model_feature_map();
     50   feature->set_name(feature_name);
     51   feature->set_value(feature_value);
     52   VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
     53 }
     54 
     55 static void AddMalwareFeature(const std::string& feature_name,
     56                               const std::set<std::string>& meta_infos,
     57                               double feature_value,
     58                               ClientMalwareRequest* request) {
     59   DCHECK(request);
     60   ClientMalwareRequest::Feature* feature =
     61       request->add_feature_map();
     62   feature->set_name(feature_name);
     63   feature->set_value(feature_value);
     64   for (std::set<std::string>::const_iterator it = meta_infos.begin();
     65        it != meta_infos.end(); ++it) {
     66     feature->add_metainfo(*it);
     67   }
     68   VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
     69 }
     70 
     71 static void AddNavigationFeatures(
     72     const std::string& feature_prefix,
     73     const NavigationController& controller,
     74     int index,
     75     const std::vector<GURL>& redirect_chain,
     76     ClientPhishingRequest* request) {
     77   NavigationEntry* entry = controller.GetEntryAtIndex(index);
     78   bool is_secure_referrer = entry->GetReferrer().url.SchemeIsSecure();
     79   if (!is_secure_referrer) {
     80     AddFeature(base::StringPrintf("%s%s=%s",
     81                                   feature_prefix.c_str(),
     82                                   features::kReferrer,
     83                                   entry->GetReferrer().url.spec().c_str()),
     84                1.0,
     85                request);
     86   }
     87   AddFeature(feature_prefix + features::kHasSSLReferrer,
     88              is_secure_referrer ? 1.0 : 0.0,
     89              request);
     90   AddFeature(feature_prefix + features::kPageTransitionType,
     91              static_cast<double>(
     92                  content::PageTransitionStripQualifier(
     93                     entry->GetTransitionType())),
     94              request);
     95   AddFeature(feature_prefix + features::kIsFirstNavigation,
     96              index == 0 ? 1.0 : 0.0,
     97              request);
     98   // Redirect chain should always be at least of size one, as the rendered
     99   // url is the last element in the chain.
    100   if (redirect_chain.empty()) {
    101     NOTREACHED();
    102     return;
    103   }
    104   if (redirect_chain.back() != entry->GetURL()) {
    105     // I originally had this as a DCHECK but I saw a failure once that I
    106     // can't reproduce. It looks like it might be related to the
    107     // navigation controller only keeping a limited number of navigation
    108     // events. For now we'll just attach a feature specifying that this is
    109     // a mismatch and try and figure out what to do with it on the server.
    110     DLOG(WARNING) << "Expected:" << entry->GetURL()
    111                  << " Actual:" << redirect_chain.back();
    112     AddFeature(feature_prefix + features::kRedirectUrlMismatch,
    113                1.0,
    114                request);
    115     return;
    116   }
    117   // We skip the last element since it should just be the current url.
    118   for (size_t i = 0; i < redirect_chain.size() - 1; i++) {
    119     std::string printable_redirect = redirect_chain[i].spec();
    120     if (redirect_chain[i].SchemeIsSecure()) {
    121       printable_redirect = features::kSecureRedirectValue;
    122     }
    123     AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s",
    124                                   feature_prefix.c_str(),
    125                                   features::kRedirect,
    126                                   i,
    127                                   printable_redirect.c_str()),
    128                1.0,
    129                request);
    130   }
    131 }
    132 
    133 BrowserFeatureExtractor::BrowserFeatureExtractor(
    134     WebContents* tab,
    135     ClientSideDetectionService* service)
    136     : tab_(tab),
    137       service_(service),
    138       weak_factory_(this) {
    139   DCHECK(tab);
    140 }
    141 
    142 BrowserFeatureExtractor::~BrowserFeatureExtractor() {
    143   weak_factory_.InvalidateWeakPtrs();
    144   // Delete all the pending extractions (delete callback and request objects).
    145   STLDeleteContainerPairFirstPointers(pending_extractions_.begin(),
    146                                       pending_extractions_.end());
    147 
    148   // Also cancel all the pending history service queries.
    149   HistoryService* history;
    150   bool success = GetHistoryService(&history);
    151   DCHECK(success || pending_queries_.size() == 0);
    152   // Cancel all the pending history lookups and cleanup the memory.
    153   for (PendingQueriesMap::iterator it = pending_queries_.begin();
    154        it != pending_queries_.end(); ++it) {
    155     if (history) {
    156       history->CancelRequest(it->first);
    157     }
    158     ExtractionData& extraction = it->second;
    159     delete extraction.first;  // delete request
    160   }
    161   pending_queries_.clear();
    162 }
    163 
    164 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info,
    165                                               ClientPhishingRequest* request,
    166                                               const DoneCallback& callback) {
    167   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    168   DCHECK(request);
    169   DCHECK(info);
    170   DCHECK_EQ(0U, request->url().find("http:"));
    171   DCHECK(!callback.is_null());
    172   if (callback.is_null()) {
    173     DLOG(ERROR) << "ExtractFeatures called without a callback object";
    174     return;
    175   }
    176 
    177   // Extract features pertaining to this navigation.
    178   const NavigationController& controller = tab_->GetController();
    179   int url_index = -1;
    180   int first_host_index = -1;
    181 
    182   GURL request_url(request->url());
    183   int index = controller.GetCurrentEntryIndex();
    184   // The url that we are extracting features for should already be commited.
    185   DCHECK_NE(index, -1);
    186   for (; index >= 0; index--) {
    187     NavigationEntry* entry = controller.GetEntryAtIndex(index);
    188     if (url_index == -1 && entry->GetURL() == request_url) {
    189       // It's possible that we've been on the on the possibly phishy url before
    190       // in this tab, so make sure that we use the latest navigation for
    191       // features.
    192       // Note that it's possible that the url_index should always be the
    193       // latest entry, but I'm worried about possible races during a navigation
    194       // and transient entries (i.e. interstiatials) so for now we will just
    195       // be cautious.
    196       url_index = index;
    197     } else if (index < url_index) {
    198       if (entry->GetURL().host() == request_url.host()) {
    199         first_host_index = index;
    200       } else {
    201         // We have found the possibly phishing url, but we are no longer on the
    202         // host. No reason to look back any further.
    203         break;
    204       }
    205     }
    206   }
    207 
    208   // Add features pertaining to how we got to
    209   //   1) The candidate url
    210   //   2) The first url on the same host as the candidate url (assuming that
    211   //      it's different from the candidate url).
    212   if (url_index != -1) {
    213     AddNavigationFeatures(
    214         std::string(), controller, url_index, info->url_redirects, request);
    215   }
    216   if (first_host_index != -1) {
    217     AddNavigationFeatures(features::kHostPrefix,
    218                           controller,
    219                           first_host_index,
    220                           info->host_redirects,
    221                           request);
    222   }
    223 
    224   ExtractBrowseInfoFeatures(*info, request);
    225   pending_extractions_[request] = callback;
    226   base::MessageLoop::current()->PostTask(
    227       FROM_HERE,
    228       base::Bind(&BrowserFeatureExtractor::StartExtractFeatures,
    229                  weak_factory_.GetWeakPtr(), request, callback));
    230 }
    231 
    232 void BrowserFeatureExtractor::ExtractMalwareFeatures(
    233     const BrowseInfo* info,
    234     ClientMalwareRequest* request) {
    235   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    236   DCHECK(request);
    237   DCHECK(info);
    238   DCHECK_EQ(0U, request->url().find("http:"));
    239   // get the IPs and urls that match the malware blacklisted IP list.
    240   if (service_) {
    241     int matched_bad_ips = 0;
    242     for (IPUrlMap::const_iterator it = info->ips.begin();
    243          it != info->ips.end(); ++it) {
    244       if (service_->IsBadIpAddress(it->first)) {
    245         AddMalwareFeature(features::kBadIpFetch + it->first,
    246                           it->second, 1.0, request);
    247         ++matched_bad_ips;
    248         // Limit the number of matched bad IPs in one request to control
    249         // the request's size
    250         if (matched_bad_ips >= kMaxMalwareIPPerRequest) {
    251           return;
    252         }
    253       }
    254     }
    255   }
    256 }
    257 
    258 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
    259     const BrowseInfo& info,
    260     ClientPhishingRequest* request) {
    261   if (service_) {
    262     for (IPUrlMap::const_iterator it = info.ips.begin();
    263          it != info.ips.end(); ++it) {
    264       if (service_->IsBadIpAddress(it->first)) {
    265         AddFeature(features::kBadIpFetch + it->first, 1.0, request);
    266       }
    267     }
    268   }
    269   if (info.unsafe_resource.get()) {
    270     // A SafeBrowsing interstitial was shown for the current URL.
    271     AddFeature(features::kSafeBrowsingMaliciousUrl +
    272                info.unsafe_resource->url.spec(),
    273                1.0,
    274                request);
    275     AddFeature(features::kSafeBrowsingOriginalUrl +
    276                info.unsafe_resource->original_url.spec(),
    277                1.0,
    278                request);
    279     AddFeature(features::kSafeBrowsingIsSubresource,
    280                info.unsafe_resource->is_subresource ? 1.0 : 0.0,
    281                request);
    282     AddFeature(features::kSafeBrowsingThreatType,
    283                static_cast<double>(info.unsafe_resource->threat_type),
    284                request);
    285   }
    286   if (info.http_status_code != 0) {
    287     AddFeature(features::kHttpStatusCode, info.http_status_code, request);
    288   }
    289 }
    290 
    291 void BrowserFeatureExtractor::StartExtractFeatures(
    292     ClientPhishingRequest* request,
    293     const DoneCallback& callback) {
    294   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    295   size_t removed = pending_extractions_.erase(request);
    296   DCHECK_EQ(1U, removed);
    297   HistoryService* history;
    298   if (!request || !request->IsInitialized() || !GetHistoryService(&history)) {
    299     callback.Run(false, request);
    300     return;
    301   }
    302   CancelableRequestProvider::Handle handle = history->QueryURL(
    303       GURL(request->url()),
    304       true /* wants_visits */,
    305       &request_consumer_,
    306       base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone,
    307                  base::Unretained(this)));
    308 
    309   StorePendingQuery(handle, request, callback);
    310 }
    311 
    312 void BrowserFeatureExtractor::QueryUrlHistoryDone(
    313     CancelableRequestProvider::Handle handle,
    314     bool success,
    315     const history::URLRow* row,
    316     history::VisitVector* visits) {
    317   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    318   ClientPhishingRequest* request;
    319   DoneCallback callback;
    320   if (!GetPendingQuery(handle, &request, &callback)) {
    321     DLOG(FATAL) << "No pending history query found";
    322     return;
    323   }
    324   DCHECK(request);
    325   DCHECK(!callback.is_null());
    326   if (!success) {
    327     // URL is not found in the history.  In practice this should not
    328     // happen (unless there is a real error) because we just visited
    329     // that URL.
    330     callback.Run(false, request);
    331     return;
    332   }
    333   AddFeature(features::kUrlHistoryVisitCount,
    334              static_cast<double>(row->visit_count()),
    335              request);
    336 
    337   base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1);
    338   int num_visits_24h_ago = 0;
    339   int num_visits_typed = 0;
    340   int num_visits_link = 0;
    341   for (history::VisitVector::const_iterator it = visits->begin();
    342        it != visits->end(); ++it) {
    343     if (!content::PageTransitionIsMainFrame(it->transition)) {
    344       continue;
    345     }
    346     if (it->visit_time < threshold) {
    347       ++num_visits_24h_ago;
    348     }
    349     content::PageTransition transition = content::PageTransitionStripQualifier(
    350         it->transition);
    351     if (transition == content::PAGE_TRANSITION_TYPED) {
    352       ++num_visits_typed;
    353     } else if (transition == content::PAGE_TRANSITION_LINK) {
    354       ++num_visits_link;
    355     }
    356   }
    357   AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo,
    358              static_cast<double>(num_visits_24h_ago),
    359              request);
    360   AddFeature(features::kUrlHistoryTypedCount,
    361              static_cast<double>(num_visits_typed),
    362              request);
    363   AddFeature(features::kUrlHistoryLinkCount,
    364              static_cast<double>(num_visits_link),
    365              request);
    366 
    367   // Issue next history lookup for host visits.
    368   HistoryService* history;
    369   if (!GetHistoryService(&history)) {
    370     callback.Run(false, request);
    371     return;
    372   }
    373   CancelableRequestProvider::Handle next_handle =
    374       history->GetVisibleVisitCountToHost(
    375           GURL(request->url()),
    376           &request_consumer_,
    377           base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone,
    378                      base::Unretained(this)));
    379   StorePendingQuery(next_handle, request, callback);
    380 }
    381 
    382 void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
    383     CancelableRequestProvider::Handle handle,
    384     bool success,
    385     int num_visits,
    386     base::Time first_visit) {
    387   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    388   ClientPhishingRequest* request;
    389   DoneCallback callback;
    390   if (!GetPendingQuery(handle, &request, &callback)) {
    391     DLOG(FATAL) << "No pending history query found";
    392     return;
    393   }
    394   DCHECK(request);
    395   DCHECK(!callback.is_null());
    396   if (!success) {
    397     callback.Run(false, request);
    398     return;
    399   }
    400   SetHostVisitsFeatures(num_visits, first_visit, true, request);
    401 
    402   // Same lookup but for the HTTPS URL.
    403   HistoryService* history;
    404   if (!GetHistoryService(&history)) {
    405     callback.Run(false, request);
    406     return;
    407   }
    408   std::string https_url = request->url();
    409   CancelableRequestProvider::Handle next_handle =
    410       history->GetVisibleVisitCountToHost(
    411           GURL(https_url.replace(0, 5, "https:")),
    412           &request_consumer_,
    413           base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone,
    414                      base::Unretained(this)));
    415   StorePendingQuery(next_handle, request, callback);
    416 }
    417 
    418 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
    419     CancelableRequestProvider::Handle handle,
    420     bool success,
    421     int num_visits,
    422     base::Time first_visit) {
    423   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    424   ClientPhishingRequest* request;
    425   DoneCallback callback;
    426   if (!GetPendingQuery(handle, &request, &callback)) {
    427     DLOG(FATAL) << "No pending history query found";
    428     return;
    429   }
    430   DCHECK(request);
    431   DCHECK(!callback.is_null());
    432   if (!success) {
    433     callback.Run(false, request);
    434     return;
    435   }
    436   SetHostVisitsFeatures(num_visits, first_visit, false, request);
    437   callback.Run(true, request);  // We're done with all the history lookups.
    438 }
    439 
    440 void BrowserFeatureExtractor::SetHostVisitsFeatures(
    441     int num_visits,
    442     base::Time first_visit,
    443     bool is_http_query,
    444     ClientPhishingRequest* request) {
    445   DCHECK(request);
    446   AddFeature(is_http_query ?
    447              features::kHttpHostVisitCount : features::kHttpsHostVisitCount,
    448              static_cast<double>(num_visits),
    449              request);
    450   if (num_visits > 0) {
    451     AddFeature(
    452         is_http_query ?
    453         features::kFirstHttpHostVisitMoreThan24hAgo :
    454         features::kFirstHttpsHostVisitMoreThan24hAgo,
    455         (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
    456         1.0 : 0.0,
    457         request);
    458   }
    459 }
    460 
    461 void BrowserFeatureExtractor::StorePendingQuery(
    462     CancelableRequestProvider::Handle handle,
    463     ClientPhishingRequest* request,
    464     const DoneCallback& callback) {
    465   DCHECK_EQ(0U, pending_queries_.count(handle));
    466   pending_queries_[handle] = std::make_pair(request, callback);
    467 }
    468 
    469 bool BrowserFeatureExtractor::GetPendingQuery(
    470     CancelableRequestProvider::Handle handle,
    471     ClientPhishingRequest** request,
    472     DoneCallback* callback) {
    473   PendingQueriesMap::iterator it = pending_queries_.find(handle);
    474   DCHECK(it != pending_queries_.end());
    475   if (it != pending_queries_.end()) {
    476     *request = it->second.first;
    477     *callback = it->second.second;
    478     pending_queries_.erase(it);
    479     return true;
    480   }
    481   return false;
    482 }
    483 
    484 bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) {
    485   *history = NULL;
    486   if (tab_ && tab_->GetBrowserContext()) {
    487     Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext());
    488     *history = HistoryServiceFactory::GetForProfile(profile,
    489                                                     Profile::EXPLICIT_ACCESS);
    490     if (*history) {
    491       return true;
    492     }
    493   }
    494   VLOG(2) << "Unable to query history.  No history service available.";
    495   return false;
    496 }
    497 
    498 }  // namespace safe_browsing
    499