1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // BrowserFeatureExtractor computes various browser features for client-side 6 // phishing detection. For now it does a bunch of lookups in the history 7 // service to see whether a particular URL has been visited before by the 8 // user. 9 10 #ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 11 #define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 12 13 #include <map> 14 #include <set> 15 #include <string> 16 #include <utility> 17 #include <vector> 18 19 #include "base/basictypes.h" 20 #include "base/callback.h" 21 #include "base/containers/hash_tables.h" 22 #include "base/memory/scoped_ptr.h" 23 #include "base/sequenced_task_runner_helpers.h" 24 #include "base/time/time.h" 25 #include "chrome/browser/common/cancelable_request.h" 26 #include "chrome/browser/history/history_types.h" 27 #include "chrome/browser/safe_browsing/safe_browsing_service.h" 28 #include "chrome/browser/safe_browsing/ui_manager.h" 29 #include "url/gurl.h" 30 31 class HistoryService; 32 33 namespace content { 34 class WebContents; 35 } 36 37 namespace safe_browsing { 38 class ClientMalwareRequest; 39 class ClientPhishingRequest; 40 class ClientSideDetectionService; 41 42 typedef std::map<std::string, std::set<std::string> > IPUrlMap; 43 44 struct BrowseInfo { 45 // List of IPv4 and IPv6 addresses from which content was requested 46 // together with the hosts on it, while browsing to the |url|. 47 IPUrlMap ips; 48 49 // If a SafeBrowsing interstitial was shown for the current URL 50 // this will contain the UnsafeResource struct for that URL. 51 scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource; 52 53 // List of redirects that lead to the first page on the current host and 54 // the current url respectively. These may be the same if the current url 55 // is the first page on its host. 56 std::vector<GURL> host_redirects; 57 std::vector<GURL> url_redirects; 58 59 // The HTTP status code from this navigation. 60 int http_status_code; 61 62 BrowseInfo(); 63 ~BrowseInfo(); 64 }; 65 66 // All methods of this class must be called on the UI thread (including 67 // the constructor). 68 class BrowserFeatureExtractor { 69 public: 70 // Called when feature extraction is done. The first argument will be 71 // true iff feature extraction succeeded. The second argument is the 72 // phishing request which was modified by the feature extractor. The 73 // DoneCallback takes ownership of the request object. 74 typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback; 75 typedef base::Callback<void(bool, ClientMalwareRequest*)> MalwareDoneCallback; 76 77 // The caller keeps ownership of the tab and service objects and is 78 // responsible for ensuring that they stay valid for the entire 79 // lifetime of this object. 80 BrowserFeatureExtractor(content::WebContents* tab, 81 ClientSideDetectionService* service); 82 83 // The destructor will cancel any pending requests. 84 virtual ~BrowserFeatureExtractor(); 85 86 // Begins extraction of the browser features. We take ownership 87 // of the request object until |callback| is called (see DoneCallback above) 88 // and will write the extracted features to the feature map. Once the 89 // feature extraction is complete, |callback| is run on the UI thread. We 90 // take ownership of the |callback| object. |info| may not be valid after 91 // ExtractFeatures returns. This method must run on the UI thread. 92 virtual void ExtractFeatures(const BrowseInfo* info, 93 ClientPhishingRequest* request, 94 const DoneCallback& callback); 95 96 // Extract the malware related features. The request object is owned by the 97 // caller. 98 virtual void ExtractMalwareFeatures(const BrowseInfo* info, 99 ClientMalwareRequest* request); 100 101 private: 102 friend class base::DeleteHelper<BrowserFeatureExtractor>; 103 typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData; 104 typedef std::map<CancelableRequestProvider::Handle, 105 ExtractionData> PendingQueriesMap; 106 107 // Synchronous browser feature extraction. 108 void ExtractBrowseInfoFeatures(const BrowseInfo& info, 109 ClientPhishingRequest* request); 110 111 // Actually starts feature extraction (does the real work). 112 void StartExtractFeatures(ClientPhishingRequest* request, 113 const DoneCallback& callback); 114 115 // HistoryService callback which is called when we're done querying URL visits 116 // in the history. 117 void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle, 118 bool success, 119 const history::URLRow* row, 120 history::VisitVector* visits); 121 122 // HistoryService callback which is called when we're done querying HTTP host 123 // visits in the history. 124 void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle, 125 bool success, 126 int num_visits, 127 base::Time first_visit); 128 129 // HistoryService callback which is called when we're done querying HTTPS host 130 // visits in the history. 131 void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle, 132 bool success, 133 int num_visits, 134 base::Time first_visit); 135 136 // Helper function which sets the host history features given the 137 // number of host visits and the time of the fist host visit. Set 138 // |is_http_query| to true if the URL scheme is HTTP and to false if 139 // the scheme is HTTPS. 140 void SetHostVisitsFeatures(int num_visits, 141 base::Time first_visit, 142 bool is_http_query, 143 ClientPhishingRequest* request); 144 145 // Helper function which stores the request and callback while the history 146 // query is being processed. 147 void StorePendingQuery(CancelableRequestProvider::Handle handle, 148 ClientPhishingRequest* request, 149 const DoneCallback& callback); 150 151 // Helper function which is the counterpart of StorePendingQuery. If there 152 // is a pending query for the given handle it will return false and set both 153 // the request and cb pointers. Otherwise, it will return false. 154 bool GetPendingQuery(CancelableRequestProvider::Handle handle, 155 ClientPhishingRequest** request, 156 DoneCallback* callback); 157 158 // Helper function which gets the history server if possible. If the pointer 159 // is set it will return true and false otherwise. 160 bool GetHistoryService(HistoryService** history); 161 162 content::WebContents* tab_; 163 ClientSideDetectionService* service_; 164 CancelableRequestConsumer request_consumer_; 165 base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_; 166 167 // Set of pending extractions (i.e. extractions for which ExtractFeatures was 168 // called but not StartExtractFeatures). 169 std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_; 170 171 // Set of pending queries (i.e., where history->Query...() was called but 172 // the history callback hasn't been invoked yet). 173 PendingQueriesMap pending_queries_; 174 175 // Max number of malware IPs can be sent in one malware request 176 static const int kMaxMalwareIPPerRequest; 177 178 DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor); 179 }; 180 181 } // namespace safe_browsing 182 #endif // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 183