Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h"
      6 
      7 #include <map>
      8 #include <utility>
      9 
     10 #include "base/bind.h"
     11 #include "base/bind_helpers.h"
     12 #include "base/format_macros.h"
     13 #include "base/stl_util.h"
     14 #include "base/strings/stringprintf.h"
     15 #include "base/time/time.h"
     16 #include "chrome/browser/common/cancelable_request.h"
     17 #include "chrome/browser/history/history_service.h"
     18 #include "chrome/browser/history/history_service_factory.h"
     19 #include "chrome/browser/history/history_types.h"
     20 #include "chrome/browser/profiles/profile.h"
     21 #include "chrome/browser/safe_browsing/browser_features.h"
     22 #include "chrome/browser/safe_browsing/client_side_detection_host.h"
     23 #include "chrome/browser/safe_browsing/database_manager.h"
     24 #include "chrome/common/safe_browsing/csd.pb.h"
     25 #include "content/public/browser/browser_thread.h"
     26 #include "content/public/browser/navigation_controller.h"
     27 #include "content/public/browser/navigation_entry.h"
     28 #include "content/public/browser/web_contents.h"
     29 #include "content/public/common/page_transition_types.h"
     30 #include "url/gurl.h"
     31 
     32 using content::BrowserThread;
     33 using content::NavigationController;
     34 using content::NavigationEntry;
     35 using content::WebContents;
     36 
     37 namespace safe_browsing {
     38 
     39 namespace {
     40 
     41 const int kMaxMalwareIPPerRequest = 5;
     42 
     43 void FilterBenignIpsOnIOThread(
     44     scoped_refptr<SafeBrowsingDatabaseManager> database_manager,
     45     IPUrlMap* ips) {
     46   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
     47   for (IPUrlMap::iterator it = ips->begin(); it != ips->end();) {
     48     if (!database_manager.get() ||
     49         !database_manager->MatchMalwareIP(it->first)) {
     50       // it++ here returns a copy of the old iterator and passes it to erase.
     51       ips->erase(it++);
     52     } else {
     53       ++it;
     54     }
     55   }
     56 }
     57 }  // namespace
     58 
     59 IPUrlInfo::IPUrlInfo(const std::string& url,
     60                      const std::string& method,
     61                      const std::string& referrer,
     62                      const ResourceType::Type& resource_type)
     63       : url(url),
     64         method(method),
     65         referrer(referrer),
     66         resource_type(resource_type) {
     67 }
     68 
     69 IPUrlInfo::~IPUrlInfo() {}
     70 
     71 BrowseInfo::BrowseInfo() : http_status_code(0) {}
     72 
     73 BrowseInfo::~BrowseInfo() {}
     74 
     75 static void AddFeature(const std::string& feature_name,
     76                        double feature_value,
     77                        ClientPhishingRequest* request) {
     78   DCHECK(request);
     79   ClientPhishingRequest::Feature* feature =
     80       request->add_non_model_feature_map();
     81   feature->set_name(feature_name);
     82   feature->set_value(feature_value);
     83   VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
     84 }
     85 
     86 static void AddMalwareIpUrlInfo(const std::string& ip,
     87                                 const std::vector<IPUrlInfo>& meta_infos,
     88                                 ClientMalwareRequest* request) {
     89   DCHECK(request);
     90   for (std::vector<IPUrlInfo>::const_iterator it = meta_infos.begin();
     91        it != meta_infos.end(); ++it) {
     92     ClientMalwareRequest::UrlInfo* urlinfo =
     93         request->add_bad_ip_url_info();
     94     // We add the information about url on the bad ip.
     95     urlinfo->set_ip(ip);
     96     urlinfo->set_url(it->url);
     97     urlinfo->set_method(it->method);
     98     urlinfo->set_referrer(it->referrer);
     99     urlinfo->set_resource_type(static_cast<int>(it->resource_type));
    100   }
    101   DVLOG(2) << "Added url info for bad ip: " << ip;
    102 }
    103 
    104 static void AddNavigationFeatures(
    105     const std::string& feature_prefix,
    106     const NavigationController& controller,
    107     int index,
    108     const std::vector<GURL>& redirect_chain,
    109     ClientPhishingRequest* request) {
    110   NavigationEntry* entry = controller.GetEntryAtIndex(index);
    111   bool is_secure_referrer = entry->GetReferrer().url.SchemeIsSecure();
    112   if (!is_secure_referrer) {
    113     AddFeature(base::StringPrintf("%s%s=%s",
    114                                   feature_prefix.c_str(),
    115                                   features::kReferrer,
    116                                   entry->GetReferrer().url.spec().c_str()),
    117                1.0,
    118                request);
    119   }
    120   AddFeature(feature_prefix + features::kHasSSLReferrer,
    121              is_secure_referrer ? 1.0 : 0.0,
    122              request);
    123   AddFeature(feature_prefix + features::kPageTransitionType,
    124              static_cast<double>(
    125                  content::PageTransitionStripQualifier(
    126                     entry->GetTransitionType())),
    127              request);
    128   AddFeature(feature_prefix + features::kIsFirstNavigation,
    129              index == 0 ? 1.0 : 0.0,
    130              request);
    131   // Redirect chain should always be at least of size one, as the rendered
    132   // url is the last element in the chain.
    133   if (redirect_chain.empty()) {
    134     NOTREACHED();
    135     return;
    136   }
    137   if (redirect_chain.back() != entry->GetURL()) {
    138     // I originally had this as a DCHECK but I saw a failure once that I
    139     // can't reproduce. It looks like it might be related to the
    140     // navigation controller only keeping a limited number of navigation
    141     // events. For now we'll just attach a feature specifying that this is
    142     // a mismatch and try and figure out what to do with it on the server.
    143     DLOG(WARNING) << "Expected:" << entry->GetURL()
    144                  << " Actual:" << redirect_chain.back();
    145     AddFeature(feature_prefix + features::kRedirectUrlMismatch,
    146                1.0,
    147                request);
    148     return;
    149   }
    150   // We skip the last element since it should just be the current url.
    151   for (size_t i = 0; i < redirect_chain.size() - 1; i++) {
    152     std::string printable_redirect = redirect_chain[i].spec();
    153     if (redirect_chain[i].SchemeIsSecure()) {
    154       printable_redirect = features::kSecureRedirectValue;
    155     }
    156     AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s",
    157                                   feature_prefix.c_str(),
    158                                   features::kRedirect,
    159                                   i,
    160                                   printable_redirect.c_str()),
    161                1.0,
    162                request);
    163   }
    164 }
    165 
    166 BrowserFeatureExtractor::BrowserFeatureExtractor(
    167     WebContents* tab,
    168     ClientSideDetectionHost* host)
    169     : tab_(tab),
    170       host_(host),
    171       weak_factory_(this) {
    172   DCHECK(tab);
    173 }
    174 
    175 BrowserFeatureExtractor::~BrowserFeatureExtractor() {
    176   weak_factory_.InvalidateWeakPtrs();
    177   // Delete all the pending extractions (delete callback and request objects).
    178   STLDeleteContainerPairFirstPointers(pending_extractions_.begin(),
    179                                       pending_extractions_.end());
    180 
    181   // Also cancel all the pending history service queries.
    182   HistoryService* history;
    183   bool success = GetHistoryService(&history);
    184   DCHECK(success || pending_queries_.size() == 0);
    185   // Cancel all the pending history lookups and cleanup the memory.
    186   for (PendingQueriesMap::iterator it = pending_queries_.begin();
    187        it != pending_queries_.end(); ++it) {
    188     if (history) {
    189       history->CancelRequest(it->first);
    190     }
    191     ExtractionData& extraction = it->second;
    192     delete extraction.first;  // delete request
    193   }
    194   pending_queries_.clear();
    195 }
    196 
    197 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info,
    198                                               ClientPhishingRequest* request,
    199                                               const DoneCallback& callback) {
    200   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    201   DCHECK(request);
    202   DCHECK(info);
    203   DCHECK_EQ(0U, request->url().find("http:"));
    204   DCHECK(!callback.is_null());
    205   // Extract features pertaining to this navigation.
    206   const NavigationController& controller = tab_->GetController();
    207   int url_index = -1;
    208   int first_host_index = -1;
    209 
    210   GURL request_url(request->url());
    211   int index = controller.GetCurrentEntryIndex();
    212   // The url that we are extracting features for should already be commited.
    213   DCHECK_NE(index, -1);
    214   for (; index >= 0; index--) {
    215     NavigationEntry* entry = controller.GetEntryAtIndex(index);
    216     if (url_index == -1 && entry->GetURL() == request_url) {
    217       // It's possible that we've been on the on the possibly phishy url before
    218       // in this tab, so make sure that we use the latest navigation for
    219       // features.
    220       // Note that it's possible that the url_index should always be the
    221       // latest entry, but I'm worried about possible races during a navigation
    222       // and transient entries (i.e. interstiatials) so for now we will just
    223       // be cautious.
    224       url_index = index;
    225     } else if (index < url_index) {
    226       if (entry->GetURL().host() == request_url.host()) {
    227         first_host_index = index;
    228       } else {
    229         // We have found the possibly phishing url, but we are no longer on the
    230         // host. No reason to look back any further.
    231         break;
    232       }
    233     }
    234   }
    235 
    236   // Add features pertaining to how we got to
    237   //   1) The candidate url
    238   //   2) The first url on the same host as the candidate url (assuming that
    239   //      it's different from the candidate url).
    240   if (url_index != -1) {
    241     AddNavigationFeatures(
    242         std::string(), controller, url_index, info->url_redirects, request);
    243   }
    244   if (first_host_index != -1) {
    245     AddNavigationFeatures(features::kHostPrefix,
    246                           controller,
    247                           first_host_index,
    248                           info->host_redirects,
    249                           request);
    250   }
    251 
    252   ExtractBrowseInfoFeatures(*info, request);
    253   pending_extractions_[request] = callback;
    254   base::MessageLoop::current()->PostTask(
    255       FROM_HERE,
    256       base::Bind(&BrowserFeatureExtractor::StartExtractFeatures,
    257                  weak_factory_.GetWeakPtr(), request, callback));
    258 }
    259 
    260 void BrowserFeatureExtractor::ExtractMalwareFeatures(
    261     BrowseInfo* info,
    262     ClientMalwareRequest* request,
    263     const MalwareDoneCallback& callback) {
    264   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    265   DCHECK_EQ(0U, request->url().find("http:"));
    266   DCHECK(!callback.is_null());
    267 
    268   // Grab the IPs because they might go away before we're done
    269   // checking them against the IP blacklist on the IO thread.
    270   scoped_ptr<IPUrlMap> ips(new IPUrlMap);
    271   ips->swap(info->ips);
    272 
    273   IPUrlMap* ips_ptr = ips.get();
    274 
    275   // The API doesn't take a scoped_ptr because the API gets mocked and we
    276   // cannot mock an API that takes scoped_ptr as arguments.
    277   scoped_ptr<ClientMalwareRequest> req(request);
    278 
    279   // IP blacklist lookups have to happen on the IO thread.
    280   BrowserThread::PostTaskAndReply(
    281       BrowserThread::IO,
    282       FROM_HERE,
    283       base::Bind(&FilterBenignIpsOnIOThread,
    284                  host_->database_manager(),
    285                  ips_ptr),
    286       base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures,
    287                  weak_factory_.GetWeakPtr(),
    288                  base::Passed(&ips), callback, base::Passed(&req)));
    289 }
    290 
    291 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
    292     const BrowseInfo& info,
    293     ClientPhishingRequest* request) {
    294   if (info.unsafe_resource.get()) {
    295     // A SafeBrowsing interstitial was shown for the current URL.
    296     AddFeature(features::kSafeBrowsingMaliciousUrl +
    297                info.unsafe_resource->url.spec(),
    298                1.0,
    299                request);
    300     AddFeature(features::kSafeBrowsingOriginalUrl +
    301                info.unsafe_resource->original_url.spec(),
    302                1.0,
    303                request);
    304     AddFeature(features::kSafeBrowsingIsSubresource,
    305                info.unsafe_resource->is_subresource ? 1.0 : 0.0,
    306                request);
    307     AddFeature(features::kSafeBrowsingThreatType,
    308                static_cast<double>(info.unsafe_resource->threat_type),
    309                request);
    310   }
    311   if (info.http_status_code != 0) {
    312     AddFeature(features::kHttpStatusCode, info.http_status_code, request);
    313   }
    314 }
    315 
    316 void BrowserFeatureExtractor::StartExtractFeatures(
    317     ClientPhishingRequest* request,
    318     const DoneCallback& callback) {
    319   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    320   size_t removed = pending_extractions_.erase(request);
    321   DCHECK_EQ(1U, removed);
    322   HistoryService* history;
    323   if (!request || !request->IsInitialized() || !GetHistoryService(&history)) {
    324     callback.Run(false, request);
    325     return;
    326   }
    327   CancelableRequestProvider::Handle handle = history->QueryURL(
    328       GURL(request->url()),
    329       true /* wants_visits */,
    330       &request_consumer_,
    331       base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone,
    332                  base::Unretained(this)));
    333 
    334   StorePendingQuery(handle, request, callback);
    335 }
    336 
    337 void BrowserFeatureExtractor::QueryUrlHistoryDone(
    338     CancelableRequestProvider::Handle handle,
    339     bool success,
    340     const history::URLRow* row,
    341     history::VisitVector* visits) {
    342   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    343   ClientPhishingRequest* request;
    344   DoneCallback callback;
    345   if (!GetPendingQuery(handle, &request, &callback)) {
    346     DLOG(FATAL) << "No pending history query found";
    347     return;
    348   }
    349   DCHECK(request);
    350   DCHECK(!callback.is_null());
    351   if (!success) {
    352     // URL is not found in the history.  In practice this should not
    353     // happen (unless there is a real error) because we just visited
    354     // that URL.
    355     callback.Run(false, request);
    356     return;
    357   }
    358   AddFeature(features::kUrlHistoryVisitCount,
    359              static_cast<double>(row->visit_count()),
    360              request);
    361 
    362   base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1);
    363   int num_visits_24h_ago = 0;
    364   int num_visits_typed = 0;
    365   int num_visits_link = 0;
    366   for (history::VisitVector::const_iterator it = visits->begin();
    367        it != visits->end(); ++it) {
    368     if (!content::PageTransitionIsMainFrame(it->transition)) {
    369       continue;
    370     }
    371     if (it->visit_time < threshold) {
    372       ++num_visits_24h_ago;
    373     }
    374     content::PageTransition transition = content::PageTransitionStripQualifier(
    375         it->transition);
    376     if (transition == content::PAGE_TRANSITION_TYPED) {
    377       ++num_visits_typed;
    378     } else if (transition == content::PAGE_TRANSITION_LINK) {
    379       ++num_visits_link;
    380     }
    381   }
    382   AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo,
    383              static_cast<double>(num_visits_24h_ago),
    384              request);
    385   AddFeature(features::kUrlHistoryTypedCount,
    386              static_cast<double>(num_visits_typed),
    387              request);
    388   AddFeature(features::kUrlHistoryLinkCount,
    389              static_cast<double>(num_visits_link),
    390              request);
    391 
    392   // Issue next history lookup for host visits.
    393   HistoryService* history;
    394   if (!GetHistoryService(&history)) {
    395     callback.Run(false, request);
    396     return;
    397   }
    398   CancelableRequestProvider::Handle next_handle =
    399       history->GetVisibleVisitCountToHost(
    400           GURL(request->url()),
    401           &request_consumer_,
    402           base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone,
    403                      base::Unretained(this)));
    404   StorePendingQuery(next_handle, request, callback);
    405 }
    406 
    407 void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
    408     CancelableRequestProvider::Handle handle,
    409     bool success,
    410     int num_visits,
    411     base::Time first_visit) {
    412   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    413   ClientPhishingRequest* request;
    414   DoneCallback callback;
    415   if (!GetPendingQuery(handle, &request, &callback)) {
    416     DLOG(FATAL) << "No pending history query found";
    417     return;
    418   }
    419   DCHECK(request);
    420   DCHECK(!callback.is_null());
    421   if (!success) {
    422     callback.Run(false, request);
    423     return;
    424   }
    425   SetHostVisitsFeatures(num_visits, first_visit, true, request);
    426 
    427   // Same lookup but for the HTTPS URL.
    428   HistoryService* history;
    429   if (!GetHistoryService(&history)) {
    430     callback.Run(false, request);
    431     return;
    432   }
    433   std::string https_url = request->url();
    434   CancelableRequestProvider::Handle next_handle =
    435       history->GetVisibleVisitCountToHost(
    436           GURL(https_url.replace(0, 5, "https:")),
    437           &request_consumer_,
    438           base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone,
    439                      base::Unretained(this)));
    440   StorePendingQuery(next_handle, request, callback);
    441 }
    442 
    443 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
    444     CancelableRequestProvider::Handle handle,
    445     bool success,
    446     int num_visits,
    447     base::Time first_visit) {
    448   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    449   ClientPhishingRequest* request;
    450   DoneCallback callback;
    451   if (!GetPendingQuery(handle, &request, &callback)) {
    452     DLOG(FATAL) << "No pending history query found";
    453     return;
    454   }
    455   DCHECK(request);
    456   DCHECK(!callback.is_null());
    457   if (!success) {
    458     callback.Run(false, request);
    459     return;
    460   }
    461   SetHostVisitsFeatures(num_visits, first_visit, false, request);
    462   callback.Run(true, request);  // We're done with all the history lookups.
    463 }
    464 
    465 void BrowserFeatureExtractor::SetHostVisitsFeatures(
    466     int num_visits,
    467     base::Time first_visit,
    468     bool is_http_query,
    469     ClientPhishingRequest* request) {
    470   DCHECK(request);
    471   AddFeature(is_http_query ?
    472              features::kHttpHostVisitCount : features::kHttpsHostVisitCount,
    473              static_cast<double>(num_visits),
    474              request);
    475   if (num_visits > 0) {
    476     AddFeature(
    477         is_http_query ?
    478         features::kFirstHttpHostVisitMoreThan24hAgo :
    479         features::kFirstHttpsHostVisitMoreThan24hAgo,
    480         (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
    481         1.0 : 0.0,
    482         request);
    483   }
    484 }
    485 
    486 void BrowserFeatureExtractor::StorePendingQuery(
    487     CancelableRequestProvider::Handle handle,
    488     ClientPhishingRequest* request,
    489     const DoneCallback& callback) {
    490   DCHECK_EQ(0U, pending_queries_.count(handle));
    491   pending_queries_[handle] = std::make_pair(request, callback);
    492 }
    493 
    494 bool BrowserFeatureExtractor::GetPendingQuery(
    495     CancelableRequestProvider::Handle handle,
    496     ClientPhishingRequest** request,
    497     DoneCallback* callback) {
    498   PendingQueriesMap::iterator it = pending_queries_.find(handle);
    499   DCHECK(it != pending_queries_.end());
    500   if (it != pending_queries_.end()) {
    501     *request = it->second.first;
    502     *callback = it->second.second;
    503     pending_queries_.erase(it);
    504     return true;
    505   }
    506   return false;
    507 }
    508 
    509 bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) {
    510   *history = NULL;
    511   if (tab_ && tab_->GetBrowserContext()) {
    512     Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext());
    513     *history = HistoryServiceFactory::GetForProfile(profile,
    514                                                     Profile::EXPLICIT_ACCESS);
    515     if (*history) {
    516       return true;
    517     }
    518   }
    519   VLOG(2) << "Unable to query history.  No history service available.";
    520   return false;
    521 }
    522 
    523 void BrowserFeatureExtractor::FinishExtractMalwareFeatures(
    524     scoped_ptr<IPUrlMap> bad_ips,
    525     MalwareDoneCallback callback,
    526     scoped_ptr<ClientMalwareRequest> request) {
    527   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    528   int matched_bad_ips = 0;
    529   for (IPUrlMap::const_iterator it = bad_ips->begin();
    530        it != bad_ips->end(); ++it) {
    531     AddMalwareIpUrlInfo(it->first, it->second, request.get());
    532     ++matched_bad_ips;
    533     // Limit the number of matched bad IPs in one request to control
    534     // the request's size
    535     if (matched_bad_ips >= kMaxMalwareIPPerRequest) {
    536       break;
    537     }
    538   }
    539   callback.Run(true, request.Pass());
    540 }
    541 
    542 }  // namespace safe_browsing
    543