1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // PhishingTermFeatureExtractor handles computing term features from the text 6 // of a web page for the client-side phishing detection model. To do this, it 7 // takes a list of terms that appear in the model, and scans through the page 8 // text looking for them. Any terms that appear will cause a corresponding 9 // features::kPageTerm feature to be added to the FeatureMap. 10 // 11 // To make it harder for a phisher to enumerate all of the relevant terms in 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. 13 // 14 // There is one PhishingTermFeatureExtractor per RenderView. 15 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 18 19 #include <set> 20 #include <string> 21 22 #include "base/basictypes.h" 23 #include "base/callback.h" 24 #include "base/containers/hash_tables.h" 25 #include "base/memory/scoped_ptr.h" 26 #include "base/memory/weak_ptr.h" 27 #include "base/strings/string16.h" 28 #include "base/strings/string_piece.h" 29 30 namespace safe_browsing { 31 class FeatureExtractorClock; 32 class FeatureMap; 33 34 class PhishingTermFeatureExtractor { 35 public: 36 // Callback to be run when feature extraction finishes. The callback 37 // argument is true if extraction was successful, false otherwise. 38 typedef base::Callback<void(bool)> DoneCallback; 39 40 // Creates a PhishingTermFeatureExtractor which will extract features for 41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These 42 // terms may be multi-word n-grams, with at most |max_words_per_term| words. 43 // 44 // |page_word_hashes| contains the murmur3 hashes for all of the individual 45 // words that make up the terms. Both sets of strings are UTF-8 encoded and 46 // lowercased prior to hashing. The caller owns both sets of strings, and 47 // must ensure that they are valid until the PhishingTermFeatureExtractor is 48 // destroyed. 49 // 50 // In addition to extracting page terms, we will also extract text shingling 51 // sketch, which consists of hashes of N-gram-words (referred to as shingles) 52 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines 53 // the maximum number of unique shingle hashes we extracted per page. 54 // 55 // |clock| is used for timing feature extractor operations, and may be mocked 56 // for testing. The caller keeps ownership of the clock. 57 PhishingTermFeatureExtractor( 58 const base::hash_set<std::string>* page_term_hashes, 59 const base::hash_set<uint32>* page_word_hashes, 60 size_t max_words_per_term, 61 uint32 murmurhash3_seed, 62 size_t max_shingles_per_page, 63 size_t shingle_size, 64 FeatureExtractorClock* clock); 65 ~PhishingTermFeatureExtractor(); 66 67 // Begins extracting features from |page_text| into the given FeatureMap. 68 // |page_text| should contain the plain text of a web page, including any 69 // subframes, as returned by RenderView::CaptureText(). 70 // 71 // To avoid blocking the render thread for too long, the feature extractor 72 // may run in several chunks of work, posting a task to the current 73 // MessageLoop to continue processing. Once feature extraction is complete, 74 // |done_callback| is run on the current thread. 75 // PhishingTermFeatureExtractor takes ownership of the callback. 76 // 77 // |page_text|, |features|, and |shingle_hashes| are owned by the caller, 78 // and must not be destroyed until either |done_callback| is run or 79 // CancelPendingExtraction() is called. 80 void ExtractFeatures(const base::string16* page_text, 81 FeatureMap* features, 82 std::set<uint32>* shingle_hashes, 83 const DoneCallback& done_callback); 84 85 // Cancels any pending feature extraction. The DoneCallback will not be run. 86 // Must be called if there is a feature extraction in progress when the page 87 // is unloaded or the PhishingTermFeatureExtractor is destroyed. 88 void CancelPendingExtraction(); 89 90 private: 91 struct ExtractionState; 92 93 // The maximum amount of wall time that we will spend on a single extraction 94 // iteration before pausing to let other MessageLoop tasks run. 95 static const int kMaxTimePerChunkMs; 96 97 // The number of words that we will process before checking to see whether 98 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be 99 // slow, we don't do this on every word processed. 100 static const int kClockCheckGranularity; 101 102 // The maximum total amount of time that the feature extractor will run 103 // before giving up on the current page. 104 static const int kMaxTotalTimeMs; 105 106 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs 107 // until a predefined maximum amount of time has elapsed, then posts a task 108 // to the current MessageLoop to continue extraction. When extraction 109 // finishes, calls RunCallback(). 110 void ExtractFeaturesWithTimeout(); 111 112 // Handles a single word in the page text. 113 void HandleWord(const base::StringPiece16& word); 114 115 // Helper to verify that there is no pending feature extraction. Dies in 116 // debug builds if the state is not as expected. This is a no-op in release 117 // builds. 118 void CheckNoPendingExtraction(); 119 120 // Runs |done_callback_| and then clears all internal state. 121 void RunCallback(bool success); 122 123 // Clears all internal feature extraction state. 124 void Clear(); 125 126 // All of the term hashes that we are looking for in the page. 127 const base::hash_set<std::string>* page_term_hashes_; 128 129 // Murmur3 hashes of all the individual words in page_term_hashes_. If 130 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ 131 // would contain (hashed) "one" and "two". We do this so that we can have a 132 // quick out in the common case that the current word we are processing 133 // doesn't contain any part of one of our terms. 134 const base::hash_set<uint32>* page_word_hashes_; 135 136 // The maximum number of words in an n-gram. 137 const size_t max_words_per_term_; 138 139 // The seed for murmurhash3. 140 const uint32 murmurhash3_seed_; 141 142 // The maximum number of unique shingle hashes we extract in a page. 143 const size_t max_shingles_per_page_; 144 145 // The number of words in a shingle. 146 const size_t shingle_size_; 147 148 // Non-owned pointer to our clock. 149 FeatureExtractorClock* clock_; 150 151 // The output parameters from the most recent call to ExtractFeatures(). 152 const base::string16* page_text_; // The caller keeps ownership of this. 153 FeatureMap* features_; // The caller keeps ownership of this. 154 std::set<uint32>* shingle_hashes_; 155 DoneCallback done_callback_; 156 157 // Stores the current state of term extraction from |page_text_|. 158 scoped_ptr<ExtractionState> state_; 159 160 // Used in scheduling ExtractFeaturesWithTimeout tasks. 161 // These pointers are invalidated if extraction is cancelled. 162 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; 163 164 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); 165 }; 166 167 } // namespace safe_browsing 168 169 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 170