// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

#include <list>
#include <map>

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/i18n/case_conversion.h"
#include "base/logging.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "crypto/sha2.h"
#include "third_party/icu/source/common/unicode/ubrk.h"
#include "ui/base/l10n/l10n_util.h"

namespace safe_browsing {

// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Experimentation shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;

// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

// The maximum size of the negative word cache.
const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;

// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ - 1 previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
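  //
  // For example, after buffering the words "login", "to", and "your",
  // previous_words is "login to your " and previous_word_sizes is
  // {6, 3, 5}: 6 for "login ", 3 for "to ", and 5 for "your ".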
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by
  // our iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};

PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      weak_factory_(this) {
  Clear();
}

PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}

void PhishingTermFeatureExtractor::ExtractFeatures(
    const string16* page_text,
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_ = done_callback;

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}

void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}

void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word.
      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
                                     next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk. If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::MessageLoop::current()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  RunCallback(true);
}

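// HandleWord checks one word and every term (n-gram) that ends with it.
// Cheap checks come first: a bounded negative cache of words known not to
// appear in any term, then a MurmurHash3 lookup against the per-word hash
// set. Only words that pass both filters pay for SHA-256 hashing of the
// candidate terms.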
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // Quickest out if we have seen this word before and know that it's not
  // part of any term. This avoids the lowercasing and UTF conversion, both
  // of which are relatively expensive.
  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
    // We know we're no longer in a possible n-gram, so clear the previous
    // word state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    // Insert into the negative cache so that we don't try this again.
    negative_word_cache_.Put(word, true);
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word's length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
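  //
  // Continuing the example above: when "account" arrives with
  // previous_words == "login to your " and previous_word_sizes == {6, 3, 5},
  // the loop below hashes "login to your account", erases "login " and
  // hashes "to your account", then erases "to " and hashes "your account".
  // The final erase leaves just "account", whose hash was added above.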
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words, so that previous_words never holds
  // more than max_words_per_term_ - 1 words; together with the current word,
  // a term spans at most max_words_per_term_ words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}

void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
  DCHECK(done_callback_.is_null());
  DCHECK(!state_.get());
  if (!done_callback_.is_null() || state_.get()) {
    LOG(ERROR) << "Extraction in progress, missing call to "
               << "CancelPendingExtraction";
  }
}

void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance. These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  Clear();
}

void PhishingTermFeatureExtractor::Clear() {
  page_text_ = NULL;
  features_ = NULL;
  done_callback_.Reset();
  state_.reset(NULL);
  negative_word_cache_.Clear();
}

}  // namespace safe_browsing