Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // BrowserFeatureExtractor computes various browser features for client-side
      6 // phishing detection.  For now it does a bunch of lookups in the history
      7 // service to see whether a particular URL has been visited before by the
      8 // user.
      9 
     10 #ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     11 #define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
     12 
     13 #include <map>
     14 #include <set>
     15 #include <string>
     16 #include <utility>
     17 #include <vector>
     18 
     19 #include "base/basictypes.h"
     20 #include "base/callback.h"
     21 #include "base/containers/hash_tables.h"
     22 #include "base/memory/scoped_ptr.h"
     23 #include "base/sequenced_task_runner_helpers.h"
     24 #include "base/time/time.h"
     25 #include "chrome/browser/common/cancelable_request.h"
     26 #include "chrome/browser/history/history_types.h"
     27 #include "chrome/browser/safe_browsing/safe_browsing_service.h"
     28 #include "chrome/browser/safe_browsing/ui_manager.h"
     29 #include "url/gurl.h"
     30 
     31 class HistoryService;
     32 
     33 namespace content {
     34 class WebContents;
     35 }
     36 
     37 namespace safe_browsing {
     38 class ClientMalwareRequest;
     39 class ClientPhishingRequest;
     40 class ClientSideDetectionService;
     41 
     42 typedef std::map<std::string, std::set<std::string> > IPUrlMap;  // Keyed by IP.
     43 
     44 struct BrowseInfo {
     45   // IPv4 and IPv6 addresses from which content was requested while browsing
     46   // to the |url|, each stored together with the hosts on that address.
     47   IPUrlMap ips;
     48 
     49   // If a SafeBrowsing interstitial was shown for the current URL,
     50   // this will contain the UnsafeResource struct for that URL.
     51   scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;
     52 
     53   // Redirects that lead to the first page on the current host
     54   // (|host_redirects|) and to the current url (|url_redirects|). These may be
     55   // the same if the current url is the first page on its host.
     56   std::vector<GURL> host_redirects;
     57   std::vector<GURL> url_redirects;
     58 
     59   // The HTTP status code from this navigation.
     60   int http_status_code;
     61 
     62   BrowseInfo();
     63   ~BrowseInfo();
     64 };
     65 
     66 // All methods of this class must be called on the UI thread (including
     67 // the constructor).
     68 class BrowserFeatureExtractor {
     69  public:
     70   // Called when feature extraction is done.  The first argument will be
     71   // true iff feature extraction succeeded.  The second argument is the
     72   // phishing request which was modified by the feature extractor.  The
     73   // DoneCallback takes ownership of the request object.
     74   typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback;
     75   typedef base::Callback<void(bool, ClientMalwareRequest*)> MalwareDoneCallback;
     76 
     77   // The caller keeps ownership of the tab and service objects and is
     78   // responsible for ensuring that they stay valid for the entire
     79   // lifetime of this object.
     80   BrowserFeatureExtractor(content::WebContents* tab,
     81                           ClientSideDetectionService* service);
     82 
     83   // The destructor will cancel any pending requests.
     84   virtual ~BrowserFeatureExtractor();
     85 
     86   // Begins extraction of the browser features.  We take ownership
     87   // of the request object until |callback| is called (see DoneCallback above)
     88   // and will write the extracted features to the feature map.  Once the
     89   // feature extraction is complete, |callback| is run on the UI thread.  We
     90   // take ownership of the |callback| object.  |info| need not remain valid
     91   // after ExtractFeatures returns.  This method must run on the UI thread.
     92   virtual void ExtractFeatures(const BrowseInfo* info,
     93                                ClientPhishingRequest* request,
     94                                const DoneCallback& callback);
     95 
     96   // Extract the malware related features. The request object is owned by the
     97   // caller.
     98   virtual void ExtractMalwareFeatures(const BrowseInfo* info,
     99                                       ClientMalwareRequest* request);
    100 
    101  private:
    102   friend class base::DeleteHelper<BrowserFeatureExtractor>;
    103   typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData;
    104   typedef std::map<CancelableRequestProvider::Handle,
    105                    ExtractionData> PendingQueriesMap;
    106 
    107   // Synchronous browser feature extraction.
    108   void ExtractBrowseInfoFeatures(const BrowseInfo& info,
    109                                  ClientPhishingRequest* request);
    110 
    111   // Actually starts feature extraction (does the real work).
    112   void StartExtractFeatures(ClientPhishingRequest* request,
    113                             const DoneCallback& callback);
    114 
    115   // HistoryService callback which is called when we're done querying URL visits
    116   // in the history.
    117   void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle,
    118                            bool success,
    119                            const history::URLRow* row,
    120                            history::VisitVector* visits);
    121 
    122   // HistoryService callback which is called when we're done querying HTTP host
    123   // visits in the history.
    124   void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle,
    125                                bool success,
    126                                int num_visits,
    127                                base::Time first_visit);
    128 
    129   // HistoryService callback which is called when we're done querying HTTPS host
    130   // visits in the history.
    131   void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle,
    132                                 bool success,
    133                                 int num_visits,
    134                                 base::Time first_visit);
    135 
    136   // Helper function which sets the host history features given the
    137   // number of host visits and the time of the first host visit.  Set
    138   // |is_http_query| to true if the URL scheme is HTTP and to false if
    139   // the scheme is HTTPS.
    140   void SetHostVisitsFeatures(int num_visits,
    141                              base::Time first_visit,
    142                              bool is_http_query,
    143                              ClientPhishingRequest* request);
    144 
    145   // Helper function which stores the request and callback while the history
    146   // query is being processed.
    147   void StorePendingQuery(CancelableRequestProvider::Handle handle,
    148                          ClientPhishingRequest* request,
    149                          const DoneCallback& callback);
    150 
    151   // Helper function which is the counterpart of StorePendingQuery.  If there
    152   // is a pending query for the given handle it will return true and set both
    153   // the request and callback pointers.  Otherwise, it will return false.
    154   bool GetPendingQuery(CancelableRequestProvider::Handle handle,
    155                        ClientPhishingRequest** request,
    156                        DoneCallback* callback);
    157 
    158   // Helper function which gets the history service if possible.  If the pointer
    159   // is set it will return true and false otherwise.
    160   bool GetHistoryService(HistoryService** history);
    161 
    162   content::WebContents* tab_;
    163   ClientSideDetectionService* service_;
    164   CancelableRequestConsumer request_consumer_;
          // NOTE(review): base/memory/weak_ptr.h is not included directly by this
          // header; WeakPtrFactory presumably arrives via another include (confirm).
    165   base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;
    166 
    167   // Pending extractions (i.e. extractions for which ExtractFeatures was
    168   // called but not StartExtractFeatures), mapping request to callback.
    169   std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_;
    170 
    171   // Pending queries (i.e., where history->Query...() was called but the
    172   // history callback hasn't been invoked yet), keyed by request handle.
    173   PendingQueriesMap pending_queries_;
    174 
    175   // Max number of malware IPs that can be sent in one malware request.
    176   static const int kMaxMalwareIPPerRequest;
    177 
    178   DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
    179 };
    180 
    181 }  // namespace safe_browsing
    182 #endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
    183