Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // BrowserFeatureExtractor computes various browser features for client-side
      6 // phishing detection.  For now it does a bunch of lookups in the history
      7 // service to see whether a particular URL has been visited before by the
      8 // user.
      9 
     10 #ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     11 #define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     12 
     13 #include <map>
     14 #include <set>
     15 #include <string>
     16 #include <utility>
     17 #include <vector>
     18 
     19 #include "base/basictypes.h"
     20 #include "base/callback.h"
     21 #include "base/containers/hash_tables.h"
     22 #include "base/memory/scoped_ptr.h"
     23 #include "base/task/cancelable_task_tracker.h"
     24 #include "base/time/time.h"
     25 #include "chrome/browser/safe_browsing/safe_browsing_service.h"
     26 #include "chrome/browser/safe_browsing/ui_manager.h"
     27 #include "components/history/core/browser/history_types.h"
     28 #include "content/public/common/resource_type.h"
     29 #include "url/gurl.h"
     30 
     31 
     32 class HistoryService;
     33 
     34 namespace content {
     35 class WebContents;
     36 }
     37 
     38 namespace safe_browsing {
     39 class ClientMalwareRequest;
     40 class ClientPhishingRequest;
     41 class ClientSideDetectionHost;
     42 
     43 struct IPUrlInfo {
          // Plain record describing one URL that was requested from a given IP
          // address, together with some metadata about that request.  Instances
          // are stored per IP in an IPUrlMap.
     44   // The url on the bad IP address.
     45   std::string url;
          // Presumably the HTTP request method used to fetch |url| -- TODO confirm
          // against the code that fills this struct in.
     46   std::string method;
          // Presumably the referrer of the request for |url| -- TODO confirm.
     47   std::string referrer;
          // The content::ResourceType associated with the request for |url|.
     48   content::ResourceType resource_type;
     49 
          // Takes one argument per field above.  The definition is out of line;
          // presumably each argument is copied into the same-named member --
          // verify in the .cc file.
     50   IPUrlInfo(const std::string& url,
     51             const std::string& method,
     52             const std::string& referrer,
     53             const content::ResourceType& resource_type);
     54   ~IPUrlInfo();
     55 };
     56 
        // Maps a string key to the list of IPUrlInfo records recorded for it.
        // Presumably the key is the textual IP address (see BrowseInfo::ips) --
        // TODO confirm against the code that populates the map.
     57 typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap;
     58 
     59 struct BrowseInfo {
          // Collects per-navigation information that is handed to
          // BrowserFeatureExtractor (see ExtractFeatures and
          // ExtractMalwareFeatures, which both take a BrowseInfo pointer).
     60   // The URL we're currently browsing.
     61   GURL url;
     62 
     63   // List of IPv4 and IPv6 addresses from which content was requested
     64   // while browsing to the |url|, together with the URLs that were
          // requested from each address (one IPUrlInfo per request).
     65   IPUrlMap ips;
     66 
     67   // If a SafeBrowsing interstitial was shown for the current URL
     68   // this will contain the UnsafeResource struct for that URL.
          // Presumably null when no interstitial was shown -- TODO confirm.
     69   scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
     70 
     71   // List of redirects that lead to the first page on the current host and
     72   // the current url respectively. These may be the same if the current url
     73   // is the first page on its host.
     74   std::vector<GURL> host_redirects;
     75   std::vector<GURL> url_redirects;
     76 
     77   // URL of the referrer of this URL load.
     78   GURL referrer;
     79 
     80   // The HTTP status code from this navigation.
          // NOTE(review): no in-class initializer; confirm that the out-of-line
          // constructor sets this field.
     81   int http_status_code;
     82 
     83   // The page ID of the navigation.  This comes from FrameNavigateParams.
          // NOTE(review): no in-class initializer; confirm that the out-of-line
          // constructor sets this field.
     84   int32 page_id;
     85 
          // Both are defined out of line.
     86   BrowseInfo();
     87   ~BrowseInfo();
     88 };
     89 
     90 // All methods of this class must be called on the UI thread (including
     91 // the constructor).
     92 class BrowserFeatureExtractor {
     93  public:
     94   // Called when feature extraction is done.  The first argument will be
     95   // true iff feature extraction succeeded.  The second argument is the
     96   // phishing request which was modified by the feature extractor.  The
     97   // DoneCallback takes ownership of the request object.
     98   typedef base::Callback<void(bool, scoped_ptr<ClientPhishingRequest>)>
     99       DoneCallback;
          // Same shape as DoneCallback, but carries a ClientMalwareRequest.
          // Presumably the bool likewise reports whether extraction succeeded --
          // TODO confirm against the implementation.
    100   typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)>
    101       MalwareDoneCallback;
    102 
    103   // The caller keeps ownership of the tab and host objects and is
    104   // responsible for ensuring that they stay valid for the entire
    105   // lifetime of this object.
    106   BrowserFeatureExtractor(content::WebContents* tab,
    107                           ClientSideDetectionHost* host);
    108 
    109   // The destructor will cancel any pending requests.
    110   virtual ~BrowserFeatureExtractor();
    111 
    112   // Begins extraction of the browser features.  We take ownership
    113   // of the request object until |callback| is called (see DoneCallback above)
    114   // and will write the extracted features to the feature map.  Once the
    115   // feature extraction is complete, |callback| is run on the UI thread.  We
    116   // take ownership of the |callback| object.  |info| is only guaranteed to
    117   // be valid until ExtractFeatures returns.  This method must run on the UI
          // thread.
          // Note that ownership of |request| is transferred through a raw pointer,
          // so the caller must not delete it after making this call.
    118   virtual void ExtractFeatures(const BrowseInfo* info,
    119                                ClientPhishingRequest* request,
    120                                const DoneCallback& callback);
    121 
    122   // Begins extraction of the malware related features.  We take ownership
    123   // of the request object until |callback| is called.  Once feature extraction
    124   // is complete, |callback| will run on the UI thread.  |info| is not expected
    125   // to stay valid after ExtractMalwareFeatures returns.  All IPs stored in
    126   // |info| will be cleared by calling this function (which is why |info| is
          // taken as a non-const pointer here).
    127   virtual void ExtractMalwareFeatures(BrowseInfo* info,
    128                                       ClientMalwareRequest* request,
    129                                       const MalwareDoneCallback& callback);
    130 
    131  private:
    132   // Synchronous browser feature extraction.
          // Writes into |request|; |info| is read-only.
    133   void ExtractBrowseInfoFeatures(const BrowseInfo& info,
    134                                  ClientPhishingRequest* request);
    135 
    136   // Actually starts feature extraction (does the real work).
          // Takes ownership of |request| via scoped_ptr.
    137   void StartExtractFeatures(scoped_ptr<ClientPhishingRequest> request,
    138                             const DoneCallback& callback);
    139 
    140   // HistoryService callback which is called when we're done querying URL visits
    141   // in the history.
          // |request| and |callback| are the values passed along from the
          // extraction in progress; |success|, |row| and |visits| are the query
          // results.
    142   void QueryUrlHistoryDone(scoped_ptr<ClientPhishingRequest> request,
    143                            const DoneCallback& callback,
    144                            bool success,
    145                            const history::URLRow& row,
    146                            const history::VisitVector& visits);
    147 
    148   // HistoryService callback which is called when we're done querying HTTP host
    149   // visits in the history.
    150   void QueryHttpHostVisitsDone(scoped_ptr<ClientPhishingRequest> request,
    151                                const DoneCallback& callback,
    152                                bool success,
    153                                int num_visits,
    154                                base::Time first_visit);
    155 
    156   // HistoryService callback which is called when we're done querying HTTPS host
    157   // visits in the history.
    158   void QueryHttpsHostVisitsDone(scoped_ptr<ClientPhishingRequest> request,
    159                                 const DoneCallback& callback,
    160                                 bool success,
    161                                 int num_visits,
    162                                 base::Time first_visit);
    163 
    164   // Helper function which sets the host history features given the
    165   // number of host visits and the time of the first host visit.  Set
    166   // |is_http_query| to true if the URL scheme is HTTP and to false if
    167   // the scheme is HTTPS.
    168   void SetHostVisitsFeatures(int num_visits,
    169                              base::Time first_visit,
    170                              bool is_http_query,
    171                              ClientPhishingRequest* request);
    172 
    173   // Helper function which gets the history service if possible.  Returns
    174   // true if |*history| was set and false otherwise.
    175   bool GetHistoryService(HistoryService** history);
    176 
    177   // Helper function which is called when we're done filtering out benign IPs
    178   // on the IO thread.  This function is called on the UI thread.
          // Takes ownership of |bad_ips| and |request| via scoped_ptr.
    179   void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips,
    180                                     MalwareDoneCallback callback,
    181                                     scoped_ptr<ClientMalwareRequest> request);
    182 
          // Not owned; the caller of the constructor keeps both objects alive for
          // the lifetime of this extractor (see the constructor comment).
    183   content::WebContents* tab_;
    184   ClientSideDetectionHost* host_;
          // Presumably tracks the outstanding history queries so that they can be
          // cancelled on destruction -- TODO confirm in the .cc file.
    185   base::CancelableTaskTracker cancelable_task_tracker_;
          // NOTE(review): base::WeakPtrFactory is used here, but this header does
          // not include base/memory/weak_ptr.h directly; it is presumably pulled
          // in transitively by one of the includes above -- confirm.
    186   base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
    187 
    188   DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
    189 };
    190 
    191 }  // namespace safe_browsing
    192 #endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
    193