// chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
      6 
      7 #include <list>
      8 #include <map>
      9 
     10 #include "base/bind.h"
     11 #include "base/compiler_specific.h"
     12 #include "base/i18n/case_conversion.h"
     13 #include "base/logging.h"
     14 #include "base/message_loop/message_loop.h"
     15 #include "base/metrics/histogram.h"
     16 #include "base/strings/utf_string_conversions.h"
     17 #include "base/time/time.h"
     18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
     19 #include "chrome/renderer/safe_browsing/features.h"
     20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
     21 #include "crypto/sha2.h"
     22 #include "third_party/icu/source/common/unicode/ubrk.h"
     23 #include "ui/base/l10n/l10n_util.h"
     24 
     25 namespace safe_browsing {
     26 
// The longest we will spend in a single extraction task (in milliseconds)
// before posting a continuation.  This time should be short enough that it
// doesn't noticeably disrupt the user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// How many words to process between clock checks.  Experimenting shows that
// we get a reasonable gain in performance by increasing this up to around
// 10, but there's not much benefit in increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// The overall deadline (in milliseconds) for one extraction, summed across
// all chunks.  This should be longer than we expect feature extraction to
// take on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

// The maximum size of the negative word cache.
const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
     42 
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words.  Note: the size includes
  // the space after each word.  In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An ICU iterator for word breaking, or NULL if ubrk_open() failed.
  // Callers must check for NULL before using it.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by our
  // iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // Opens a word-break iterator over |text|.  |text| must outlive this
  // object, since the iterator keeps a pointer into it.
  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      // Leave |iterator| NULL; ExtractFeaturesWithTimeout() checks for this
      // and fails the extraction.
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
     92 
// All pointer arguments are unowned; NOTE(review): they appear to be
// required to outlive this extractor — confirm with the owning RenderView.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      weak_factory_(this) {
  // Start out in the known idle state (no pending extraction).
  Clear();
}
    108 
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.  CheckNoPendingExtraction() DCHECKs this in debug
  // builds and logs an error in release builds.
  CheckNoPendingExtraction();
}
    114 
// Begins asynchronous term feature extraction over |page_text|, writing
// matches into |features| and invoking |done_callback| with a success flag
// when finished.  |page_text| and |features| are owned by the caller and
// must stay valid until the callback runs or the extraction is canceled.
void PhishingTermFeatureExtractor::ExtractFeatures(
    const string16* page_text,
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_ = done_callback;

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  // Extraction runs in time-sliced chunks on the current message loop.  The
  // weak pointer makes the posted task a no-op if we are destroyed or the
  // extraction is canceled before it runs.
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}
    136 
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.  Invalidating the
  // weak pointers turns any already-posted ExtractFeaturesWithTimeout()
  // task into a no-op.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
    142 
// Runs one time-sliced chunk of extraction: walks the word-break iterator
// forward, feeding each word to HandleWord(), and checks the clock every
// kClockCheckGranularity words.  If the per-chunk budget (kMaxTimePerChunkMs)
// is exceeded, posts itself again to continue later; if the overall budget
// (kMaxTotalTimeMs) is exceeded, fails the extraction via RunCallback(false).
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    // First chunk for this page: position the iterator at the start.
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    // UBRK_WORD_NONE means the break we just crossed was punctuation or
    // whitespace rather than an actual word.
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word.
      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
                                     next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::MessageLoop::current()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Iterator exhausted: the whole page has been processed.
  RunCallback(true);
}
    210 
// Processes a single word from the page text: hashes it (and the n-grams it
// forms with up to max_words_per_term_ - 1 preceding words) and adds a
// kPageTerm feature for every hash found in page_term_hashes_.  Maintains
// the rolling previous-word state in |state_| and the negative word cache.
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // Quickest out if we have seen this word before and know that it's not
  // part of any term. This avoids the lowercasing and UTF conversion, both of
  // which are relatively expensive.
  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
    // We know we're no longer in a possible n-gram, so clear the previous word
    // state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    // Insert into negative cache so that we don't try this again.
    negative_word_cache_.Put(word, true);
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  // Each loop iteration drops the oldest remaining previous word, so we hash
  // every suffix of previous_words followed by the current word.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes.  Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words.  That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.  The >= check keeps at most
  // max_words_per_term_ - 1 words of history, so that the history plus the
  // next incoming word never exceeds max_words_per_term_ words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
    278 
    279 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
    280   DCHECK(done_callback_.is_null());
    281   DCHECK(!state_.get());
    282   if (!done_callback_.is_null() || state_.get()) {
    283     LOG(ERROR) << "Extraction in progress, missing call to "
    284                << "CancelPendingExtraction";
    285   }
    286 }
    287 
    288 void PhishingTermFeatureExtractor::RunCallback(bool success) {
    289   // Record some timing stats that we can use to evaluate feature extraction
    290   // performance.  These include both successful and failed extractions.
    291   DCHECK(state_.get());
    292   UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
    293                        state_->num_iterations);
    294   UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
    295                       clock_->Now() - state_->start_time);
    296 
    297   DCHECK(!done_callback_.is_null());
    298   done_callback_.Run(success);
    299   Clear();
    300 }
    301 
    302 void PhishingTermFeatureExtractor::Clear() {
    303   page_text_ = NULL;
    304   features_ = NULL;
    305   done_callback_.Reset();
    306   state_.reset(NULL);
    307   negative_word_cache_.Clear();
    308 }
    309 
    310 }  // namespace safe_browsing
    311