Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // BrowserFeatureExtractor computes various browser features for client-side
      6 // phishing detection.  For now it does a bunch of lookups in the history
      7 // service to see whether a particular URL has been visited before by the
      8 // user.
      9 
     10 #ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     11 #define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     12 
     13 #include <map>
     14 #include <set>
     15 #include <string>
     16 #include <utility>
     17 #include <vector>
     18 
     19 #include "base/basictypes.h"
     20 #include "base/callback.h"
     21 #include "base/containers/hash_tables.h"
     22 #include "base/memory/scoped_ptr.h"
     23 #include "base/sequenced_task_runner_helpers.h"
     24 #include "base/time/time.h"
     25 #include "chrome/browser/common/cancelable_request.h"
     26 #include "chrome/browser/history/history_types.h"
     27 #include "chrome/browser/safe_browsing/safe_browsing_service.h"
     28 #include "chrome/browser/safe_browsing/ui_manager.h"
     29 #include "url/gurl.h"
     30 #include "webkit/common/resource_type.h"
     31 
     32 
     33 class HistoryService;
     34 
     35 namespace content {
     36 class WebContents;
     37 }
     38 
     39 namespace safe_browsing {
     40 class ClientMalwareRequest;
     41 class ClientPhishingRequest;
     42 class ClientSideDetectionHost;
     43 
     44 struct IPUrlInfo {  // Describes one URL that was found on a bad IP address.
     45   // The url on the bad IP address.
     46   std::string url;
     47   std::string method;  // Presumably the HTTP request method; TODO confirm.
     48   std::string referrer;  // Presumably the referrer of the request; TODO confirm.
     49   ResourceType::Type resource_type;  // Kind of resource that |url| refers to.
     50   // One parameter per field above; presumably copied verbatim (TODO confirm).
     51   IPUrlInfo(const std::string& url,
     52             const std::string& method,
     53             const std::string& referrer,
     54             const ResourceType::Type& resource_type);
     55   ~IPUrlInfo();
     56 };
     57 
     58 typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap;  // Key is presumably the IP address string (see BrowseInfo::ips); TODO confirm.
     59 
     60 struct BrowseInfo {  // Per-navigation data handed to BrowserFeatureExtractor.
     61   // List of IPv4 and IPv6 addresses from which content was requested
     62   // together with the URLs requested from each, while browsing to the |url|.
     63   IPUrlMap ips;
     64 
     65   // If a SafeBrowsing interstitial was shown for the current URL
     66   // this will contain the UnsafeResource struct for that URL.
     67   scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
     68 
     69   // List of redirects that lead to the first page on the current host and
     70   // the current url respectively. These may be the same if the current url
     71   // is the first page on its host.
     72   std::vector<GURL> host_redirects;
     73   std::vector<GURL> url_redirects;
     74 
     75   // URL of the referrer of this URL load.
     76   GURL referrer;
     77 
     78   // The HTTP status code from this navigation.
     79   int http_status_code;
     80   // NOTE(review): |http_status_code| is presumably set by BrowseInfo(); confirm.
     81   BrowseInfo();
     82   ~BrowseInfo();
     83 };
     84 
     85 // All methods of this class must be called on the UI thread (including
     86 // the constructor).
     87 class BrowserFeatureExtractor {
     88  public:
     89   // Called when feature extraction is done.  The first argument will be
     90   // true iff feature extraction succeeded.  The second argument is the
     91   // phishing request which was modified by the feature extractor.  The
     92   // DoneCallback takes ownership of the request object.
     93   typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback;
     94   typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)>
     95       MalwareDoneCallback;  // Malware variant; request is passed as a scoped_ptr.
     96 
     97   // The caller keeps ownership of the tab and host objects and is
     98   // responsible for ensuring that they stay valid for the entire
     99   // lifetime of this object.
    100   BrowserFeatureExtractor(content::WebContents* tab,
    101                           ClientSideDetectionHost* host);
    102 
    103   // The destructor will cancel any pending requests.
    104   virtual ~BrowserFeatureExtractor();
    105 
    106   // Begins extraction of the browser features.  We take ownership
    107   // of the request object until |callback| is called (see DoneCallback above)
    108   // and will write the extracted features to the feature map.  Once the
    109   // feature extraction is complete, |callback| is run on the UI thread.  We
    110   // take ownership of the |callback| object.  |info| may not be valid after
    111   // ExtractFeatures returns.  This method must run on the UI thread.
    112   virtual void ExtractFeatures(const BrowseInfo* info,
    113                                ClientPhishingRequest* request,
    114                                const DoneCallback& callback);
    115 
    116   // Begins extraction of the malware related features.  We take ownership
    117   // of the request object until |callback| is called.  Once feature extraction
    118   // is complete, |callback| will run on the UI thread.  |info| is not expected
    119   // to stay valid after ExtractMalwareFeatures returns.  All IPs stored in
    120   // |info| will be cleared by calling this function.
    121   virtual void ExtractMalwareFeatures(BrowseInfo* info,
    122                                       ClientMalwareRequest* request,
    123                                       const MalwareDoneCallback& callback);
    124 
    125  private:
    126   friend class base::DeleteHelper<BrowserFeatureExtractor>;
    127   typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData;
    128   typedef std::map<CancelableRequestProvider::Handle,
    129                    ExtractionData> PendingQueriesMap;
    130 
    131   // Synchronous browser feature extraction.
    132   void ExtractBrowseInfoFeatures(const BrowseInfo& info,
    133                                  ClientPhishingRequest* request);
    134 
    135   // Actually starts feature extraction (does the real work).
    136   void StartExtractFeatures(ClientPhishingRequest* request,
    137                             const DoneCallback& callback);
    138 
    139   // HistoryService callback which is called when we're done querying URL visits
    140   // in the history.
    141   void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle,
    142                            bool success,
    143                            const history::URLRow* row,
    144                            history::VisitVector* visits);
    145 
    146   // HistoryService callback which is called when we're done querying HTTP host
    147   // visits in the history.
    148   void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle,
    149                                bool success,
    150                                int num_visits,
    151                                base::Time first_visit);
    152 
    153   // HistoryService callback which is called when we're done querying HTTPS host
    154   // visits in the history.
    155   void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle,
    156                                 bool success,
    157                                 int num_visits,
    158                                 base::Time first_visit);
    159 
    160   // Helper function which sets the host history features given the
    161   // number of host visits and the time of the first host visit.  Set
    162   // |is_http_query| to true if the URL scheme is HTTP and to false if
    163   // the scheme is HTTPS.
    164   void SetHostVisitsFeatures(int num_visits,
    165                              base::Time first_visit,
    166                              bool is_http_query,
    167                              ClientPhishingRequest* request);
    168 
    169   // Helper function which stores the request and callback while the history
    170   // query is being processed.
    171   void StorePendingQuery(CancelableRequestProvider::Handle handle,
    172                          ClientPhishingRequest* request,
    173                          const DoneCallback& callback);
    174 
    175   // Helper function which is the counterpart of StorePendingQuery.  If there
    176   // is a pending query for the given handle it will return true and set both
    177   // the |request| and |callback| pointers.  Otherwise, it will return false.
    178   bool GetPendingQuery(CancelableRequestProvider::Handle handle,
    179                        ClientPhishingRequest** request,
    180                        DoneCallback* callback);
    181 
    182   // Helper function which gets the history service if possible.  If the pointer
    183   // is set it will return true and false otherwise.
    184   bool GetHistoryService(HistoryService** history);
    185 
    186   // Helper function which is called when we're done filtering out benign IPs
    187   // on the IO thread.  This function is called on the UI thread.
    188   void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips,
    189                                     MalwareDoneCallback callback,
    190                                     scoped_ptr<ClientMalwareRequest> request);
    191 
    192   content::WebContents* tab_;  // Not owned; the caller keeps it valid.
    193   ClientSideDetectionHost* host_;  // Not owned; the caller keeps it valid.
    194   CancelableRequestConsumer request_consumer_;
    195   base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
    196 
    197   // Set of pending extractions (i.e. extractions for which ExtractFeatures was
    198   // called but not StartExtractFeatures).
    199   std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_;
    200 
    201   // Set of pending queries (i.e., where history->Query...() was called but
    202   // the history callback hasn't been invoked yet).
    203   PendingQueriesMap pending_queries_;
    204 
    205   DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
    206 };
    207 
    208 }  // namespace safe_browsing
    209 #endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
    210