1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // PhishingDOMFeatureExtractor handles computing DOM-based features for the 6 // client-side phishing detection model. These include the presence of various 7 // types of elements, ratios of external and secure links, and tokens for 8 // external domains linked to. 9 10 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ 11 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ 12 13 #include <string> 14 15 #include "base/basictypes.h" 16 #include "base/callback.h" 17 #include "base/memory/scoped_ptr.h" 18 #include "base/memory/weak_ptr.h" 19 #include "third_party/WebKit/public/web/WebDocument.h" 20 21 class GURL; 22 23 namespace blink { 24 class WebElement; 25 } 26 27 namespace content { 28 class RenderView; 29 } 30 31 namespace safe_browsing { 32 class FeatureExtractorClock; 33 class FeatureMap; 34 35 class PhishingDOMFeatureExtractor { 36 public: 37 // Callback to be run when feature extraction finishes. The callback 38 // argument is true if extraction was successful, false otherwise. 39 typedef base::Callback<void(bool)> DoneCallback; 40 41 // Creates a PhishingDOMFeatureExtractor for the specified RenderView. 42 // The PhishingDOMFeatureExtrator should be destroyed prior to destroying 43 // the RenderView. |clock| is used for timing feature extractor operations, 44 // and may be mocked for testing. The caller maintains ownership of the 45 // clock. 46 PhishingDOMFeatureExtractor(content::RenderView* render_view, 47 FeatureExtractorClock* clock); 48 ~PhishingDOMFeatureExtractor(); 49 50 // Begins extracting features into the given FeatureMap for the page 51 // currently loaded in this object's RenderView. To avoid blocking the 52 // render thread for too long, the feature extractor may run in several 53 // chunks of work, posting a task to the current MessageLoop to continue 54 // processing. Once feature extraction is complete, |done_callback| 55 // is run on the current thread. PhishingDOMFeatureExtractor takes 56 // ownership of the callback. 57 void ExtractFeatures(FeatureMap* features, const DoneCallback& done_callback); 58 59 // Cancels any pending feature extraction. The DoneCallback will not be run. 60 // Must be called if there is a feature extraction in progress when the page 61 // is unloaded or the PhishingDOMFeatureExtractor is destroyed. 62 void CancelPendingExtraction(); 63 64 private: 65 struct FrameData; 66 struct PageFeatureState; 67 68 // The maximum amount of wall time that we will spend on a single extraction 69 // iteration before pausing to let other MessageLoop tasks run. 70 static const int kMaxTimePerChunkMs; 71 72 // The number of elements that we will process before checking to see whether 73 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be 74 // slow, we don't do this on every element processed. 75 static const int kClockCheckGranularity; 76 77 // The maximum total amount of time that the feature extractor will run 78 // before giving up on the current page. 79 static const int kMaxTotalTimeMs; 80 81 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs 82 // until a predefined maximum amount of time has elapsed, then posts a task 83 // to the current MessageLoop to continue extraction. When extraction 84 // finishes, calls RunCallback(). 85 void ExtractFeaturesWithTimeout(); 86 87 // Handlers for the various HTML elements that we compute features for. 88 // Since some of the features (such as ratios) cannot be computed until 89 // feature extraction is finished, these handlers do not add to the feature 90 // map directly. Instead, they update the values in the PageFeatureState. 91 void HandleLink(const blink::WebElement& element); 92 void HandleForm(const blink::WebElement& element); 93 void HandleImage(const blink::WebElement& element); 94 void HandleInput(const blink::WebElement& element); 95 void HandleScript(const blink::WebElement& element); 96 97 // Helper to verify that there is no pending feature extraction. Dies in 98 // debug builds if the state is not as expected. This is a no-op in release 99 // builds. 100 void CheckNoPendingExtraction(); 101 102 // Runs |done_callback_| and then clears all internal state. 103 void RunCallback(bool success); 104 105 // Clears all internal feature extraction state. 106 void Clear(); 107 108 // Called after advancing |cur_document_| to update the state in 109 // |cur_frame_data_|. 110 void ResetFrameData(); 111 112 // Returns the next document in frame-traversal order from cur_document_. 113 // If there are no more documents, returns a null WebDocument. 114 blink::WebDocument GetNextDocument(); 115 116 // Given a URL, checks whether the domain is different from the domain of 117 // the current frame's URL. If so, stores the domain in |domain| and returns 118 // true, otherwise returns false. 119 bool IsExternalDomain(const GURL& url, std::string* domain) const; 120 121 // Called once all frames have been processed to compute features from the 122 // PageFeatureState and add them to |features_|. See features.h for a 123 // description of which features are computed. 124 void InsertFeatures(); 125 126 // Non-owned pointer to the view that we will extract features from. 127 content::RenderView* render_view_; 128 129 // Non-owned pointer to our clock. 130 FeatureExtractorClock* clock_; 131 132 // The output parameters from the most recent call to ExtractFeatures(). 133 FeatureMap* features_; // The caller keeps ownership of this. 134 DoneCallback done_callback_; 135 136 // The current (sub-)document that we are processing. May be a null document 137 // (isNull()) if we are not currently extracting features. 138 blink::WebDocument cur_document_; 139 140 // Stores extra state for |cur_document_| that will be persisted until we 141 // advance to the next frame. 142 scoped_ptr<FrameData> cur_frame_data_; 143 144 // Stores the intermediate data used to create features. This data is 145 // accumulated across all frames in the RenderView. 146 scoped_ptr<PageFeatureState> page_feature_state_; 147 148 // Used in scheduling ExtractFeaturesWithTimeout tasks. 149 // These pointers are invalidated if extraction is cancelled. 150 base::WeakPtrFactory<PhishingDOMFeatureExtractor> weak_factory_; 151 152 DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor); 153 }; 154 155 } // namespace safe_browsing 156 157 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ 158