1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // BrowserFeatureExtractor computes various browser features for client-side 6 // phishing detection. For now it does a bunch of lookups in the history 7 // service to see whether a particular URL has been visited before by the 8 // user. 9 10 #ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 11 #define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 12 13 #include <map> 14 #include <set> 15 #include <string> 16 #include <utility> 17 #include <vector> 18 19 #include "base/basictypes.h" 20 #include "base/callback.h" 21 #include "base/containers/hash_tables.h" 22 #include "base/memory/scoped_ptr.h" 23 #include "base/sequenced_task_runner_helpers.h" 24 #include "base/task/cancelable_task_tracker.h" 25 #include "base/time/time.h" 26 #include "chrome/browser/common/cancelable_request.h" 27 #include "chrome/browser/history/history_types.h" 28 #include "chrome/browser/safe_browsing/safe_browsing_service.h" 29 #include "chrome/browser/safe_browsing/ui_manager.h" 30 #include "url/gurl.h" 31 #include "webkit/common/resource_type.h" 32 33 34 class HistoryService; 35 36 namespace content { 37 class WebContents; 38 } 39 40 namespace safe_browsing { 41 class ClientMalwareRequest; 42 class ClientPhishingRequest; 43 class ClientSideDetectionHost; 44 45 struct IPUrlInfo { 46 // The url on the bad IP address. 47 std::string url; 48 std::string method; 49 std::string referrer; 50 ResourceType::Type resource_type; 51 52 IPUrlInfo(const std::string& url, 53 const std::string& method, 54 const std::string& referrer, 55 const ResourceType::Type& resource_type); 56 ~IPUrlInfo(); 57 }; 58 59 typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap; 60 61 struct BrowseInfo { 62 // The URL we're currently browsing. 63 GURL url; 64 65 // List of IPv4 and IPv6 addresses from which content was requested 66 // together with the hosts on it, while browsing to the |url|. 67 IPUrlMap ips; 68 69 // If a SafeBrowsing interstitial was shown for the current URL 70 // this will contain the UnsafeResource struct for that URL. 71 scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource; 72 73 // List of redirects that lead to the first page on the current host and 74 // the current url respectively. These may be the same if the current url 75 // is the first page on its host. 76 std::vector<GURL> host_redirects; 77 std::vector<GURL> url_redirects; 78 79 // URL of the referrer of this URL load. 80 GURL referrer; 81 82 // The HTTP status code from this navigation. 83 int http_status_code; 84 85 // The page ID of the navigation. This comes from FrameNavigateParams. 86 int32 page_id; 87 88 BrowseInfo(); 89 ~BrowseInfo(); 90 }; 91 92 // All methods of this class must be called on the UI thread (including 93 // the constructor). 94 class BrowserFeatureExtractor { 95 public: 96 // Called when feature extraction is done. The first argument will be 97 // true iff feature extraction succeeded. The second argument is the 98 // phishing request which was modified by the feature extractor. The 99 // DoneCallback takes ownership of the request object. 100 typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback; 101 typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)> 102 MalwareDoneCallback; 103 104 // The caller keeps ownership of the tab and host objects and is 105 // responsible for ensuring that they stay valid for the entire 106 // lifetime of this object. 107 BrowserFeatureExtractor(content::WebContents* tab, 108 ClientSideDetectionHost* host); 109 110 // The destructor will cancel any pending requests. 111 virtual ~BrowserFeatureExtractor(); 112 113 // Begins extraction of the browser features. We take ownership 114 // of the request object until |callback| is called (see DoneCallback above) 115 // and will write the extracted features to the feature map. Once the 116 // feature extraction is complete, |callback| is run on the UI thread. We 117 // take ownership of the |callback| object. |info| may not be valid after 118 // ExtractFeatures returns. This method must run on the UI thread. 119 virtual void ExtractFeatures(const BrowseInfo* info, 120 ClientPhishingRequest* request, 121 const DoneCallback& callback); 122 123 // Begins extraction of the malware related features. We take ownership 124 // of the request object until |callback| is called. Once feature extraction 125 // is complete, |callback| will run on the UI thread. |info| is not expected 126 // to stay valid after ExtractMalwareFeatures returns. All IPs stored in 127 // |info| will be cleared by calling this function. 128 virtual void ExtractMalwareFeatures(BrowseInfo* info, 129 ClientMalwareRequest* request, 130 const MalwareDoneCallback& callback); 131 132 private: 133 friend class base::DeleteHelper<BrowserFeatureExtractor>; 134 typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData; 135 typedef std::map<CancelableRequestProvider::Handle, 136 ExtractionData> PendingQueriesMap; 137 138 // Synchronous browser feature extraction. 139 void ExtractBrowseInfoFeatures(const BrowseInfo& info, 140 ClientPhishingRequest* request); 141 142 // Actually starts feature extraction (does the real work). 143 void StartExtractFeatures(ClientPhishingRequest* request, 144 const DoneCallback& callback); 145 146 // HistoryService callback which is called when we're done querying URL visits 147 // in the history. 148 void QueryUrlHistoryDone(scoped_ptr<ClientPhishingRequest> request, 149 const DoneCallback& callback, 150 bool success, 151 const history::URLRow& row, 152 const history::VisitVector& visits); 153 154 // HistoryService callback which is called when we're done querying HTTP host 155 // visits in the history. 156 void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle, 157 bool success, 158 int num_visits, 159 base::Time first_visit); 160 161 // HistoryService callback which is called when we're done querying HTTPS host 162 // visits in the history. 163 void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle, 164 bool success, 165 int num_visits, 166 base::Time first_visit); 167 168 // Helper function which sets the host history features given the 169 // number of host visits and the time of the fist host visit. Set 170 // |is_http_query| to true if the URL scheme is HTTP and to false if 171 // the scheme is HTTPS. 172 void SetHostVisitsFeatures(int num_visits, 173 base::Time first_visit, 174 bool is_http_query, 175 ClientPhishingRequest* request); 176 177 // Helper function which stores the request and callback while the history 178 // query is being processed. 179 void StorePendingQuery(CancelableRequestProvider::Handle handle, 180 ClientPhishingRequest* request, 181 const DoneCallback& callback); 182 183 // Helper function which is the counterpart of StorePendingQuery. If there 184 // is a pending query for the given handle it will return false and set both 185 // the request and cb pointers. Otherwise, it will return false. 186 bool GetPendingQuery(CancelableRequestProvider::Handle handle, 187 ClientPhishingRequest** request, 188 DoneCallback* callback); 189 190 // Helper function which gets the history server if possible. If the pointer 191 // is set it will return true and false otherwise. 192 bool GetHistoryService(HistoryService** history); 193 194 // Helper function which is called when we're done filtering out benign IPs 195 // on the IO thread. This function is called on the UI thread. 196 void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips, 197 MalwareDoneCallback callback, 198 scoped_ptr<ClientMalwareRequest> request); 199 200 content::WebContents* tab_; 201 ClientSideDetectionHost* host_; 202 CancelableRequestConsumer request_consumer_; 203 base::CancelableTaskTracker cancelable_task_tracker_; 204 base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_; 205 206 // Set of pending extractions (i.e. extractions for which ExtractFeatures was 207 // called but not StartExtractFeatures). 208 std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_; 209 210 // Set of pending queries (i.e., where history->Query...() was called but 211 // the history callback hasn't been invoked yet). 212 PendingQueriesMap pending_queries_; 213 214 DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor); 215 }; 216 217 } // namespace safe_browsing 218 #endif // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 219