1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // BrowserFeatureExtractor computes various browser features for client-side 6 // phishing detection. For now it does a bunch of lookups in the history 7 // service to see whether a particular URL has been visited before by the 8 // user. 9 10 #ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 11 #define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 12 13 #include <map> 14 #include <set> 15 #include <string> 16 #include <utility> 17 #include <vector> 18 19 #include "base/basictypes.h" 20 #include "base/callback.h" 21 #include "base/containers/hash_tables.h" 22 #include "base/memory/scoped_ptr.h" 23 #include "base/sequenced_task_runner_helpers.h" 24 #include "base/time/time.h" 25 #include "chrome/browser/common/cancelable_request.h" 26 #include "chrome/browser/history/history_types.h" 27 #include "chrome/browser/safe_browsing/safe_browsing_service.h" 28 #include "chrome/browser/safe_browsing/ui_manager.h" 29 #include "url/gurl.h" 30 #include "webkit/common/resource_type.h" 31 32 33 class HistoryService; 34 35 namespace content { 36 class WebContents; 37 } 38 39 namespace safe_browsing { 40 class ClientMalwareRequest; 41 class ClientPhishingRequest; 42 class ClientSideDetectionHost; 43 44 struct IPUrlInfo { 45 // The url on the bad IP address. 46 std::string url; 47 std::string method; 48 std::string referrer; 49 ResourceType::Type resource_type; 50 51 IPUrlInfo(const std::string& url, 52 const std::string& method, 53 const std::string& referrer, 54 const ResourceType::Type& resource_type); 55 ~IPUrlInfo(); 56 }; 57 58 typedef std::map<std::string, std::vector<IPUrlInfo> > IPUrlMap; 59 60 struct BrowseInfo { 61 // List of IPv4 and IPv6 addresses from which content was requested 62 // together with the hosts on it, while browsing to the |url|. 63 IPUrlMap ips; 64 65 // If a SafeBrowsing interstitial was shown for the current URL 66 // this will contain the UnsafeResource struct for that URL. 67 scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource; 68 69 // List of redirects that lead to the first page on the current host and 70 // the current url respectively. These may be the same if the current url 71 // is the first page on its host. 72 std::vector<GURL> host_redirects; 73 std::vector<GURL> url_redirects; 74 75 // URL of the referrer of this URL load. 76 GURL referrer; 77 78 // The HTTP status code from this navigation. 79 int http_status_code; 80 81 BrowseInfo(); 82 ~BrowseInfo(); 83 }; 84 85 // All methods of this class must be called on the UI thread (including 86 // the constructor). 87 class BrowserFeatureExtractor { 88 public: 89 // Called when feature extraction is done. The first argument will be 90 // true iff feature extraction succeeded. The second argument is the 91 // phishing request which was modified by the feature extractor. The 92 // DoneCallback takes ownership of the request object. 93 typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback; 94 typedef base::Callback<void(bool, scoped_ptr<ClientMalwareRequest>)> 95 MalwareDoneCallback; 96 97 // The caller keeps ownership of the tab and host objects and is 98 // responsible for ensuring that they stay valid for the entire 99 // lifetime of this object. 100 BrowserFeatureExtractor(content::WebContents* tab, 101 ClientSideDetectionHost* host); 102 103 // The destructor will cancel any pending requests. 104 virtual ~BrowserFeatureExtractor(); 105 106 // Begins extraction of the browser features. We take ownership 107 // of the request object until |callback| is called (see DoneCallback above) 108 // and will write the extracted features to the feature map. Once the 109 // feature extraction is complete, |callback| is run on the UI thread. We 110 // take ownership of the |callback| object. |info| may not be valid after 111 // ExtractFeatures returns. This method must run on the UI thread. 112 virtual void ExtractFeatures(const BrowseInfo* info, 113 ClientPhishingRequest* request, 114 const DoneCallback& callback); 115 116 // Begins extraction of the malware related features. We take ownership 117 // of the request object until |callback| is called. Once feature extraction 118 // is complete, |callback| will run on the UI thread. |info| is not expected 119 // to stay valid after ExtractMalwareFeatures returns. All IPs stored in 120 // |info| will be cleared by calling this function. 121 virtual void ExtractMalwareFeatures(BrowseInfo* info, 122 ClientMalwareRequest* request, 123 const MalwareDoneCallback& callback); 124 125 private: 126 friend class base::DeleteHelper<BrowserFeatureExtractor>; 127 typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData; 128 typedef std::map<CancelableRequestProvider::Handle, 129 ExtractionData> PendingQueriesMap; 130 131 // Synchronous browser feature extraction. 132 void ExtractBrowseInfoFeatures(const BrowseInfo& info, 133 ClientPhishingRequest* request); 134 135 // Actually starts feature extraction (does the real work). 136 void StartExtractFeatures(ClientPhishingRequest* request, 137 const DoneCallback& callback); 138 139 // HistoryService callback which is called when we're done querying URL visits 140 // in the history. 141 void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle, 142 bool success, 143 const history::URLRow* row, 144 history::VisitVector* visits); 145 146 // HistoryService callback which is called when we're done querying HTTP host 147 // visits in the history. 148 void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle, 149 bool success, 150 int num_visits, 151 base::Time first_visit); 152 153 // HistoryService callback which is called when we're done querying HTTPS host 154 // visits in the history. 155 void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle, 156 bool success, 157 int num_visits, 158 base::Time first_visit); 159 160 // Helper function which sets the host history features given the 161 // number of host visits and the time of the fist host visit. Set 162 // |is_http_query| to true if the URL scheme is HTTP and to false if 163 // the scheme is HTTPS. 164 void SetHostVisitsFeatures(int num_visits, 165 base::Time first_visit, 166 bool is_http_query, 167 ClientPhishingRequest* request); 168 169 // Helper function which stores the request and callback while the history 170 // query is being processed. 171 void StorePendingQuery(CancelableRequestProvider::Handle handle, 172 ClientPhishingRequest* request, 173 const DoneCallback& callback); 174 175 // Helper function which is the counterpart of StorePendingQuery. If there 176 // is a pending query for the given handle it will return false and set both 177 // the request and cb pointers. Otherwise, it will return false. 178 bool GetPendingQuery(CancelableRequestProvider::Handle handle, 179 ClientPhishingRequest** request, 180 DoneCallback* callback); 181 182 // Helper function which gets the history server if possible. If the pointer 183 // is set it will return true and false otherwise. 184 bool GetHistoryService(HistoryService** history); 185 186 // Helper function which is called when we're done filtering out benign IPs 187 // on the IO thread. This function is called on the UI thread. 188 void FinishExtractMalwareFeatures(scoped_ptr<IPUrlMap> bad_ips, 189 MalwareDoneCallback callback, 190 scoped_ptr<ClientMalwareRequest> request); 191 192 content::WebContents* tab_; 193 ClientSideDetectionHost* host_; 194 CancelableRequestConsumer request_consumer_; 195 base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_; 196 197 // Set of pending extractions (i.e. extractions for which ExtractFeatures was 198 // called but not StartExtractFeatures). 199 std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_; 200 201 // Set of pending queries (i.e., where history->Query...() was called but 202 // the history callback hasn't been invoked yet). 203 PendingQueriesMap pending_queries_; 204 205 DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor); 206 }; 207 208 } // namespace safe_browsing 209 #endif // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_ 210