Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // BrowserFeatureExtractor computes various browser features for client-side
      6 // phishing detection.  For now it does a bunch of lookups in the history
      7 // service to see whether a particular URL has been visited before by the
      8 // user.
      9 
     10 #ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     11 #define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     12 
     13 #include <map>
     14 #include <set>
     15 #include <string>
     16 #include <utility>
     17 #include <vector>
     18 
     19 #include "base/basictypes.h"
     20 #include "base/callback.h"
     21 #include "base/containers/hash_tables.h"
     22 #include "base/memory/scoped_ptr.h"
     23 #include "base/sequenced_task_runner_helpers.h"
     24 #include "base/task/cancelable_task_tracker.h"
     25 #include "base/time/time.h"
     26 #include "chrome/browser/common/cancelable_request.h"
     27 #include "chrome/browser/history/history_types.h"
     28 #include "chrome/browser/safe_browsing/safe_browsing_service.h"
     29 #include "chrome/browser/safe_browsing/ui_manager.h"
     30 #include "url/gurl.h"
     31 #include "webkit/common/resource_type.h"
     32 
     33 
     34 class HistoryService;
     35 
     36 namespace content {
     37 class WebContents;
     38 }
     39 
     40 namespace safe_browsing {
     41 class ClientMalwareRequest;
     42 class ClientPhishingRequest;
     43 class ClientSideDetectionHost;
     44 
     45 struct IPUrlInfo {
          // One request record.  Vectors of these records are the mapped values of
          // IPUrlMap, which is declared right after this struct.

     46   // The url on the bad IP address.
     47   std::string url;
          // NOTE(review): presumably the HTTP method used to request |url| --
          // confirm against the code that fills this struct in.
     48   std::string method;
          // NOTE(review): presumably the referrer sent with the request for |url| --
          // confirm against the code that fills this struct in.
     49   std::string referrer;
          // Resource type of this entry, expressed as a ResourceType::Type value.
     50   ResourceType::Type resource_type;
     51 
          // The parameters mirror the four fields above.  The definition is not in
          // this header; presumably each argument is copied into the member of the
          // same name -- confirm in the .cc file.
     52   IPUrlInfo(const std::string& url,
     53             const std::string& method,
     54             const std::string& referrer,
     55             const ResourceType::Type& resource_type);
     56   ~IPUrlInfo();
     57 };
     58 
        // Maps a string key to the IPUrlInfo records collected under that key.
        // According to the comment on BrowseInfo::ips, the keys are IPv4 and IPv6
        // addresses.
     59 typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap;
     60 
     61 struct BrowseInfo {
          // Navigation data that BrowserFeatureExtractor::ExtractFeatures() and
          // BrowserFeatureExtractor::ExtractMalwareFeatures() receive through their
          // |info| parameter.

     62   // The URL we're currently browsing.
     63   GURL url;
     64 
     65   // List of IPv4 and IPv6 addresses from which content was requested
     66   // together with the hosts on it, while browsing to the |url|.
          // IPUrlMap is a std::map from a string key to a vector of IPUrlInfo
          // records.
     67   IPUrlMap ips;
     68 
     69   // If a SafeBrowsing interstitial was shown for the current URL
     70   // this will contain the UnsafeResource struct for that URL.
          // NOTE(review): presumably left null when no interstitial was shown --
          // confirm against the code that populates BrowseInfo.
     71   scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
     72 
     73   // List of redirects that lead to the first page on the current host and
     74   // the current url respectively. These may be the same if the current url
     75   // is the first page on its host.
     76   std::vector<GURL> host_redirects;
     77   std::vector<GURL> url_redirects;
     78 
     79   // URL of the referrer of this URL load.
     80   GURL referrer;
     81 
     82   // The HTTP status code from this navigation.
     83   int http_status_code;
     84 
     85   // The page ID of the navigation.  This comes from FrameNavigateParams.
     86   int32 page_id;
     87 
          // Both are only declared here; their definitions live outside this header.
     88   BrowseInfo();
     89   ~BrowseInfo();
     90 };
     91 
     92 // All methods of this class must be called on the UI thread (including
     93 // the constructor).
     94 class BrowserFeatureExtractor {
     95  public:
     96   // Called when feature extraction is done.  The first argument will be
     97   // true iff feature extraction succeeded.  The second argument is the
     98   // phishing request which was modified by the feature extractor.  The
     99   // DoneCallback takes ownership of the request object.
    100   typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback;
          // Malware counterpart of DoneCallback.  The request is handed over wrapped
          // in a scoped_ptr.  NOTE(review): the bool is presumably the same
          // "extraction succeeded" flag as in DoneCallback -- confirm in the .cc.
    101   typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)>
    102       MalwareDoneCallback;
    103 
    104   // The caller keeps ownership of the tab and host objects and is
    105   // responsible for ensuring that they stay valid for the entire
    106   // lifetime of this object.
    107   BrowserFeatureExtractor(content::WebContents* tab,
    108                           ClientSideDetectionHost* host);
    109 
    110   // The destructor will cancel any pending requests.
    111   virtual ~BrowserFeatureExtractor();
    112 
    113   // Begins extraction of the browser features.  We take ownership
    114   // of the request object until |callback| is called (see DoneCallback above)
    115   // and will write the extracted features to the feature map.  Once the
    116   // feature extraction is complete, |callback| is run on the UI thread.  We
    117   // take ownership of the |callback| object.  |info| need not stay valid
    118   // after ExtractFeatures returns.  This method must run on the UI thread.
    119   virtual void ExtractFeatures(const BrowseInfo* info,
    120                                ClientPhishingRequest* request,
    121                                const DoneCallback& callback);
    122 
    123   // Begins extraction of the malware related features.  We take ownership
    124   // of the request object until |callback| is called.  Once feature extraction
    125   // is complete, |callback| will run on the UI thread.  |info| is not expected
    126   // to stay valid after ExtractMalwareFeatures returns.  All IPs stored in
    127   // |info| will be cleared by calling this function.
    128   virtual void ExtractMalwareFeatures(BrowseInfo* info,
    129                                       ClientMalwareRequest* request,
    130                                       const MalwareDoneCallback& callback);
    131 
    132  private:
    133   friend class base::DeleteHelper<BrowserFeatureExtractor>;
          // A phishing request paired with the callback registered for it.
    134   typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData;
          // ExtractionData entries keyed by the handle of a cancelable request.
    135   typedef std::map<CancelableRequestProvider::Handle,
    136                    ExtractionData> PendingQueriesMap;
    137 
    138   // Synchronous browser feature extraction.
    139   void ExtractBrowseInfoFeatures(const BrowseInfo& info,
    140                                  ClientPhishingRequest* request);
    141 
    142   // Actually starts feature extraction (does the real work).
    143   void StartExtractFeatures(ClientPhishingRequest* request,
    144                             const DoneCallback& callback);
    145 
    146   // HistoryService callback which is called when we're done querying URL visits
    147   // in the history.
          // Unlike the two host-visit callbacks below, this one receives the request
          // and callback directly as arguments instead of a request handle.
    148   void QueryUrlHistoryDone(scoped_ptr<ClientPhishingRequest> request,
    149                            const DoneCallback& callback,
    150                            bool success,
    151                            const history::URLRow& row,
    152                            const history::VisitVector& visits);
    153 
    154   // HistoryService callback which is called when we're done querying HTTP host
    155   // visits in the history.
    156   void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle,
    157                                bool success,
    158                                int num_visits,
    159                                base::Time first_visit);
    160 
    161   // HistoryService callback which is called when we're done querying HTTPS host
    162   // visits in the history.
    163   void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle,
    164                                 bool success,
    165                                 int num_visits,
    166                                 base::Time first_visit);
    167 
    168   // Helper function which sets the host history features given the
    169   // number of host visits and the time of the first host visit.  Set
    170   // |is_http_query| to true if the URL scheme is HTTP and to false if
    171   // the scheme is HTTPS.
    172   void SetHostVisitsFeatures(int num_visits,
    173                              base::Time first_visit,
    174                              bool is_http_query,
    175                              ClientPhishingRequest* request);
    176 
    177   // Helper function which stores the request and callback while the history
    178   // query is being processed.
    179   void StorePendingQuery(CancelableRequestProvider::Handle handle,
    180                          ClientPhishingRequest* request,
    181                          const DoneCallback& callback);
    182 
    183   // Helper function which is the counterpart of StorePendingQuery.  If there
    184   // is a pending query for the given handle it will return true and set both
    185   // the request and callback pointers.  Otherwise, it will return false.
    186   bool GetPendingQuery(CancelableRequestProvider::Handle handle,
    187                        ClientPhishingRequest** request,
    188                        DoneCallback* callback);
    189 
    190   // Helper function which gets the history service if possible.  If the pointer
    191   // is set it will return true and false otherwise.
    192   bool GetHistoryService(HistoryService** history);
    193 
    194   // Helper function which is called when we're done filtering out benign IPs
    195   // on the IO thread.  This function is called on the UI thread.
    196   void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips,
    197                                     MalwareDoneCallback callback,
    198                                     scoped_ptr<ClientMalwareRequest> request);
    199 
          // Not owned.  The caller of the constructor keeps both objects alive for
          // the lifetime of this extractor (see the constructor comment above).
    200   content::WebContents* tab_;
    201   ClientSideDetectionHost* host_;
          // NOTE(review): presumably the consumer under which the handle-based
          // history queries are issued -- confirm in the .cc file.
    202   CancelableRequestConsumer request_consumer_;
          // NOTE(review): presumably tracks the task-based history queries so they
          // can be cancelled -- confirm in the .cc file.
    203   base::CancelableTaskTracker cancelable_task_tracker_;
          // Factory for weak pointers to this object.
    204   base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
    205 
    206   // Map of pending extractions (i.e. extractions for which ExtractFeatures
    207   // was called but not StartExtractFeatures), from request to its callback.
    208   std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_;
    209 
    210   // Pending queries (i.e., where history->Query...() was called but the
    211   // history callback hasn't been invoked yet), keyed by request handle.
    212   PendingQueriesMap pending_queries_;
    213 
    214   DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
    215 };
    216 
    217 }  // namespace safe_browsing
    218 #endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
    219