1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 6 7 #include <list> 8 #include <map> 9 10 #include "base/bind.h" 11 #include "base/compiler_specific.h" 12 #include "base/i18n/break_iterator.h" 13 #include "base/i18n/case_conversion.h" 14 #include "base/logging.h" 15 #include "base/memory/scoped_ptr.h" 16 #include "base/message_loop/message_loop.h" 17 #include "base/metrics/histogram.h" 18 #include "base/strings/utf_string_conversions.h" 19 #include "base/time/time.h" 20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 21 #include "chrome/renderer/safe_browsing/features.h" 22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" 23 #include "crypto/sha2.h" 24 25 namespace safe_browsing { 26 27 // This time should be short enough that it doesn't noticeably disrupt the 28 // user's interaction with the page. 29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10; 30 31 // Experimenting shows that we get a reasonable gain in performance by 32 // increasing this up to around 10, but there's not much benefit in 33 // increasing it past that. 34 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; 35 36 // This should be longer than we expect feature extraction to take on any 37 // actual phishing page. 38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; 39 40 // All of the state pertaining to the current feature extraction. 41 struct PhishingTermFeatureExtractor::ExtractionState { 42 // Stores up to max_words_per_term_ previous words separated by spaces. 43 std::string previous_words; 44 45 // Stores the current shingle after a new word is processed and added in. 46 std::string current_shingle; 47 48 // Stores the sizes of the words in current_shingle. Note: the size includes 49 // the space after each word. In other words, the sum of all sizes in this 50 // list is equal to the length of current_shingle. 51 std::list<size_t> shingle_word_sizes; 52 53 // Stores the sizes of the words in previous_words. Note: the size includes 54 // the space after each word. In other words, the sum of all sizes in this 55 // list is equal to the length of previous_words. 56 std::list<size_t> previous_word_sizes; 57 58 // An iterator for word breaking. 59 scoped_ptr<base::i18n::BreakIterator> iterator; 60 61 // The time at which we started feature extraction for the current page. 62 base::TimeTicks start_time; 63 64 // The number of iterations we've done for the current extraction. 65 int num_iterations; 66 67 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) 68 : start_time(start_time_ticks), 69 num_iterations(0) { 70 71 scoped_ptr<base::i18n::BreakIterator> i( 72 new base::i18n::BreakIterator( 73 text, base::i18n::BreakIterator::BREAK_WORD)); 74 75 if (i->Init()) { 76 iterator = i.Pass(); 77 } else { 78 DLOG(ERROR) << "failed to open iterator"; 79 } 80 } 81 }; 82 83 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( 84 const base::hash_set<std::string>* page_term_hashes, 85 const base::hash_set<uint32>* page_word_hashes, 86 size_t max_words_per_term, 87 uint32 murmurhash3_seed, 88 size_t max_shingles_per_page, 89 size_t shingle_size, 90 FeatureExtractorClock* clock) 91 : page_term_hashes_(page_term_hashes), 92 page_word_hashes_(page_word_hashes), 93 max_words_per_term_(max_words_per_term), 94 murmurhash3_seed_(murmurhash3_seed), 95 max_shingles_per_page_(max_shingles_per_page), 96 shingle_size_(shingle_size), 97 clock_(clock), 98 weak_factory_(this) { 99 Clear(); 100 } 101 102 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { 103 // The RenderView should have called CancelPendingExtraction() before 104 // we are destroyed. 105 CheckNoPendingExtraction(); 106 } 107 108 void PhishingTermFeatureExtractor::ExtractFeatures( 109 const base::string16* page_text, 110 FeatureMap* features, 111 std::set<uint32>* shingle_hashes, 112 const DoneCallback& done_callback) { 113 // The RenderView should have called CancelPendingExtraction() before 114 // starting a new extraction, so DCHECK this. 115 CheckNoPendingExtraction(); 116 // However, in an opt build, we will go ahead and clean up the pending 117 // extraction so that we can start in a known state. 118 CancelPendingExtraction(); 119 120 page_text_ = page_text; 121 features_ = features; 122 shingle_hashes_ = shingle_hashes, 123 done_callback_ = done_callback; 124 125 state_.reset(new ExtractionState(*page_text_, clock_->Now())); 126 base::MessageLoop::current()->PostTask( 127 FROM_HERE, 128 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, 129 weak_factory_.GetWeakPtr())); 130 } 131 132 void PhishingTermFeatureExtractor::CancelPendingExtraction() { 133 // Cancel any pending callbacks, and clear our state. 134 weak_factory_.InvalidateWeakPtrs(); 135 Clear(); 136 } 137 138 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { 139 DCHECK(state_.get()); 140 ++state_->num_iterations; 141 base::TimeTicks current_chunk_start_time = clock_->Now(); 142 143 if (!state_->iterator.get()) { 144 // We failed to initialize the break iterator, so stop now. 145 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); 146 RunCallback(false); 147 return; 148 } 149 150 int num_words = 0; 151 while (state_->iterator->Advance()) { 152 if (state_->iterator->IsWord()) { 153 const size_t start = state_->iterator->prev(); 154 const size_t length = state_->iterator->pos() - start; 155 HandleWord(base::StringPiece16(page_text_->data() + start, length)); 156 ++num_words; 157 } 158 159 if (num_words >= kClockCheckGranularity) { 160 num_words = 0; 161 base::TimeTicks now = clock_->Now(); 162 if (now - state_->start_time >= 163 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 164 DLOG(ERROR) << "Feature extraction took too long, giving up"; 165 // We expect this to happen infrequently, so record when it does. 166 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); 167 RunCallback(false); 168 return; 169 } 170 base::TimeDelta chunk_elapsed = now - current_chunk_start_time; 171 if (chunk_elapsed >= 172 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { 173 // The time limit for the current chunk is up, so post a task to 174 // continue extraction. 175 // 176 // Record how much time we actually spent on the chunk. If this is 177 // much higher than kMaxTimePerChunkMs, we may need to adjust the 178 // clock granularity. 179 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime", 180 chunk_elapsed); 181 base::MessageLoop::current()->PostTask( 182 FROM_HERE, 183 base::Bind( 184 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, 185 weak_factory_.GetWeakPtr())); 186 return; 187 } 188 // Otherwise, continue. 189 } 190 } 191 RunCallback(true); 192 } 193 194 void PhishingTermFeatureExtractor::HandleWord( 195 const base::StringPiece16& word) { 196 // First, extract shingle hashes. 197 const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); 198 state_->current_shingle.append(word_lower + " "); 199 state_->shingle_word_sizes.push_back(word_lower.size() + 1); 200 if (state_->shingle_word_sizes.size() == shingle_size_) { 201 shingle_hashes_->insert( 202 MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); 203 state_->current_shingle.erase(0, state_->shingle_word_sizes.front()); 204 state_->shingle_word_sizes.pop_front(); 205 } 206 // Check if the size of shingle hashes is over the limit. 207 if (shingle_hashes_->size() > max_shingles_per_page_) { 208 // Pop the largest one. 209 std::set<uint32>::iterator it = shingle_hashes_->end(); 210 shingle_hashes_->erase(--it); 211 } 212 213 // Next, extract page terms. 214 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); 215 216 // Quick out if the word is not part of any term, which is the common case. 217 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { 218 // Word doesn't exist in our terms so we can clear the n-gram state. 219 state_->previous_words.clear(); 220 state_->previous_word_sizes.clear(); 221 return; 222 } 223 224 // Find all of the n-grams that we need to check and compute their SHA-256 225 // hashes. 226 std::map<std::string /* hash */, std::string /* plaintext */> 227 hashes_to_check; 228 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower; 229 230 // Combine the new word with the previous words to find additional n-grams. 231 // Note that we don't yet add the new word length to previous_word_sizes, 232 // since we don't want to compute the hash for the word by itself again. 233 // 234 state_->previous_words.append(word_lower); 235 std::string current_term = state_->previous_words; 236 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); 237 it != state_->previous_word_sizes.end(); ++it) { 238 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; 239 current_term.erase(0, *it); 240 } 241 242 // Add features for any hashes that match page_term_hashes_. 243 for (std::map<std::string, std::string>::iterator it = 244 hashes_to_check.begin(); 245 it != hashes_to_check.end(); ++it) { 246 if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { 247 features_->AddBooleanFeature(features::kPageTerm + it->second); 248 } 249 } 250 251 // Now that we have handled the current word, we have to add a space at the 252 // end of it, and add the new word's size (including the space) to 253 // previous_word_sizes. Note: it's possible that the document language 254 // doesn't use ASCII spaces to separate words. That's fine though, we just 255 // need to be consistent with how the model is generated. 256 state_->previous_words.append(" "); 257 state_->previous_word_sizes.push_back(word_lower.size() + 1); 258 259 // Cap the number of previous words. 260 if (state_->previous_word_sizes.size() >= max_words_per_term_) { 261 state_->previous_words.erase(0, state_->previous_word_sizes.front()); 262 state_->previous_word_sizes.pop_front(); 263 } 264 } 265 266 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { 267 DCHECK(done_callback_.is_null()); 268 DCHECK(!state_.get()); 269 if (!done_callback_.is_null() || state_.get()) { 270 LOG(ERROR) << "Extraction in progress, missing call to " 271 << "CancelPendingExtraction"; 272 } 273 } 274 275 void PhishingTermFeatureExtractor::RunCallback(bool success) { 276 // Record some timing stats that we can use to evaluate feature extraction 277 // performance. These include both successful and failed extractions. 278 DCHECK(state_.get()); 279 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations", 280 state_->num_iterations); 281 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime", 282 clock_->Now() - state_->start_time); 283 284 DCHECK(!done_callback_.is_null()); 285 done_callback_.Run(success); 286 Clear(); 287 } 288 289 void PhishingTermFeatureExtractor::Clear() { 290 page_text_ = NULL; 291 features_ = NULL; 292 shingle_hashes_ = NULL; 293 done_callback_.Reset(); 294 state_.reset(NULL); 295 } 296 297 } // namespace safe_browsing 298