Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
      6 
      7 #include <list>
      8 #include <map>
      9 
     10 #include "base/bind.h"
     11 #include "base/compiler_specific.h"
     12 #include "base/i18n/break_iterator.h"
     13 #include "base/i18n/case_conversion.h"
     14 #include "base/logging.h"
     15 #include "base/memory/scoped_ptr.h"
     16 #include "base/message_loop/message_loop.h"
     17 #include "base/metrics/histogram.h"
     18 #include "base/strings/utf_string_conversions.h"
     19 #include "base/time/time.h"
     20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
     21 #include "chrome/renderer/safe_browsing/features.h"
     22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
     23 #include "crypto/sha2.h"
     24 
     25 namespace safe_browsing {
     26 
     27 // This time should be short enough that it doesn't noticeably disrupt the
     28 // user's interaction with the page.
     29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;
     30 
     31 // Experimenting shows that we get a reasonable gain in performance by
     32 // increasing this up to around 10, but there's not much benefit in
     33 // increasing it past that.
     34 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;
     35 
     36 // This should be longer than we expect feature extraction to take on any
     37 // actual phishing page.
     38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
     39 
     40 // All of the state pertaining to the current feature extraction.
     41 struct PhishingTermFeatureExtractor::ExtractionState {
     42   // Stores up to max_words_per_term_ previous words separated by spaces.
     43   std::string previous_words;
     44 
     45   // Stores the current shingle after a new word is processed and added in.
     46   std::string current_shingle;
     47 
     48   // Stores the sizes of the words in current_shingle. Note: the size includes
     49   // the space after each word. In other words, the sum of all sizes in this
     50   // list is equal to the length of current_shingle.
     51   std::list<size_t> shingle_word_sizes;
     52 
     53   // Stores the sizes of the words in previous_words.  Note: the size includes
     54   // the space after each word.  In other words, the sum of all sizes in this
     55   // list is equal to the length of previous_words.
     56   std::list<size_t> previous_word_sizes;
     57 
     58   // An iterator for word breaking.
     59   scoped_ptr<base::i18n::BreakIterator> iterator;
     60 
     61   // The time at which we started feature extraction for the current page.
     62   base::TimeTicks start_time;
     63 
     64   // The number of iterations we've done for the current extraction.
     65   int num_iterations;
     66 
     67   ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
     68       : start_time(start_time_ticks),
     69         num_iterations(0) {
     70 
     71     scoped_ptr<base::i18n::BreakIterator> i(
     72         new base::i18n::BreakIterator(
     73             text, base::i18n::BreakIterator::BREAK_WORD));
     74 
     75     if (i->Init()) {
     76       iterator = i.Pass();
     77     } else {
     78       DLOG(ERROR) << "failed to open iterator";
     79     }
     80   }
     81 };
     82 
     83 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
     84     const base::hash_set<std::string>* page_term_hashes,
     85     const base::hash_set<uint32>* page_word_hashes,
     86     size_t max_words_per_term,
     87     uint32 murmurhash3_seed,
     88     size_t max_shingles_per_page,
     89     size_t shingle_size,
     90     FeatureExtractorClock* clock)
     91     : page_term_hashes_(page_term_hashes),
     92       page_word_hashes_(page_word_hashes),
     93       max_words_per_term_(max_words_per_term),
     94       murmurhash3_seed_(murmurhash3_seed),
     95       max_shingles_per_page_(max_shingles_per_page),
     96       shingle_size_(shingle_size),
     97       clock_(clock),
     98       weak_factory_(this) {
     99   Clear();
    100 }
    101 
    102 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
    103   // The RenderView should have called CancelPendingExtraction() before
    104   // we are destroyed.
    105   CheckNoPendingExtraction();
    106 }
    107 
    108 void PhishingTermFeatureExtractor::ExtractFeatures(
    109     const base::string16* page_text,
    110     FeatureMap* features,
    111     std::set<uint32>* shingle_hashes,
    112     const DoneCallback& done_callback) {
    113   // The RenderView should have called CancelPendingExtraction() before
    114   // starting a new extraction, so DCHECK this.
    115   CheckNoPendingExtraction();
    116   // However, in an opt build, we will go ahead and clean up the pending
    117   // extraction so that we can start in a known state.
    118   CancelPendingExtraction();
    119 
    120   page_text_ = page_text;
    121   features_ = features;
    122   shingle_hashes_ = shingle_hashes,
    123   done_callback_ = done_callback;
    124 
    125   state_.reset(new ExtractionState(*page_text_, clock_->Now()));
    126   base::MessageLoop::current()->PostTask(
    127       FROM_HERE,
    128       base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
    129                  weak_factory_.GetWeakPtr()));
    130 }
    131 
    132 void PhishingTermFeatureExtractor::CancelPendingExtraction() {
    133   // Cancel any pending callbacks, and clear our state.
    134   weak_factory_.InvalidateWeakPtrs();
    135   Clear();
    136 }
    137 
    138 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
    139   DCHECK(state_.get());
    140   ++state_->num_iterations;
    141   base::TimeTicks current_chunk_start_time = clock_->Now();
    142 
    143   if (!state_->iterator.get()) {
    144     // We failed to initialize the break iterator, so stop now.
    145     UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    146     RunCallback(false);
    147     return;
    148   }
    149 
    150   int num_words = 0;
    151   while (state_->iterator->Advance()) {
    152     if (state_->iterator->IsWord()) {
    153       const size_t start = state_->iterator->prev();
    154       const size_t length = state_->iterator->pos() - start;
    155       HandleWord(base::StringPiece16(page_text_->data() + start, length));
    156       ++num_words;
    157     }
    158 
    159     if (num_words >= kClockCheckGranularity) {
    160       num_words = 0;
    161       base::TimeTicks now = clock_->Now();
    162       if (now - state_->start_time >=
    163           base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
    164         DLOG(ERROR) << "Feature extraction took too long, giving up";
    165         // We expect this to happen infrequently, so record when it does.
    166         UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
    167         RunCallback(false);
    168         return;
    169       }
    170       base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
    171       if (chunk_elapsed >=
    172           base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
    173         // The time limit for the current chunk is up, so post a task to
    174         // continue extraction.
    175         //
    176         // Record how much time we actually spent on the chunk.  If this is
    177         // much higher than kMaxTimePerChunkMs, we may need to adjust the
    178         // clock granularity.
    179         UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
    180                             chunk_elapsed);
    181         base::MessageLoop::current()->PostTask(
    182             FROM_HERE,
    183             base::Bind(
    184                 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
    185                 weak_factory_.GetWeakPtr()));
    186         return;
    187       }
    188       // Otherwise, continue.
    189     }
    190   }
    191   RunCallback(true);
    192 }
    193 
    194 void PhishingTermFeatureExtractor::HandleWord(
    195     const base::StringPiece16& word) {
    196   // First, extract shingle hashes.
    197   const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
    198   state_->current_shingle.append(word_lower + " ");
    199   state_->shingle_word_sizes.push_back(word_lower.size() + 1);
    200   if (state_->shingle_word_sizes.size() == shingle_size_) {
    201     shingle_hashes_->insert(
    202         MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    203     state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
    204     state_->shingle_word_sizes.pop_front();
    205   }
    206   // Check if the size of shingle hashes is over the limit.
    207   if (shingle_hashes_->size() > max_shingles_per_page_) {
    208     // Pop the largest one.
    209     std::set<uint32>::iterator it = shingle_hashes_->end();
    210     shingle_hashes_->erase(--it);
    211   }
    212 
    213   // Next, extract page terms.
    214   uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
    215 
    216   // Quick out if the word is not part of any term, which is the common case.
    217   if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    218     // Word doesn't exist in our terms so we can clear the n-gram state.
    219     state_->previous_words.clear();
    220     state_->previous_word_sizes.clear();
    221     return;
    222   }
    223 
    224   // Find all of the n-grams that we need to check and compute their SHA-256
    225   // hashes.
    226   std::map<std::string /* hash */, std::string /* plaintext */>
    227       hashes_to_check;
    228   hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;
    229 
    230   // Combine the new word with the previous words to find additional n-grams.
    231   // Note that we don't yet add the new word length to previous_word_sizes,
    232   // since we don't want to compute the hash for the word by itself again.
    233   //
    234   state_->previous_words.append(word_lower);
    235   std::string current_term = state_->previous_words;
    236   for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
    237        it != state_->previous_word_sizes.end(); ++it) {
    238     hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    239     current_term.erase(0, *it);
    240   }
    241 
    242   // Add features for any hashes that match page_term_hashes_.
    243   for (std::map<std::string, std::string>::iterator it =
    244            hashes_to_check.begin();
    245        it != hashes_to_check.end(); ++it) {
    246     if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
    247       features_->AddBooleanFeature(features::kPageTerm + it->second);
    248     }
    249   }
    250 
    251   // Now that we have handled the current word, we have to add a space at the
    252   // end of it, and add the new word's size (including the space) to
    253   // previous_word_sizes.  Note: it's possible that the document language
    254   // doesn't use ASCII spaces to separate words.  That's fine though, we just
    255   // need to be consistent with how the model is generated.
    256   state_->previous_words.append(" ");
    257   state_->previous_word_sizes.push_back(word_lower.size() + 1);
    258 
    259   // Cap the number of previous words.
    260   if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    261     state_->previous_words.erase(0, state_->previous_word_sizes.front());
    262     state_->previous_word_sizes.pop_front();
    263   }
    264 }
    265 
    266 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
    267   DCHECK(done_callback_.is_null());
    268   DCHECK(!state_.get());
    269   if (!done_callback_.is_null() || state_.get()) {
    270     LOG(ERROR) << "Extraction in progress, missing call to "
    271                << "CancelPendingExtraction";
    272   }
    273 }
    274 
    275 void PhishingTermFeatureExtractor::RunCallback(bool success) {
    276   // Record some timing stats that we can use to evaluate feature extraction
    277   // performance.  These include both successful and failed extractions.
    278   DCHECK(state_.get());
    279   UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
    280                        state_->num_iterations);
    281   UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
    282                       clock_->Now() - state_->start_time);
    283 
    284   DCHECK(!done_callback_.is_null());
    285   done_callback_.Run(success);
    286   Clear();
    287 }
    288 
    289 void PhishingTermFeatureExtractor::Clear() {
    290   page_text_ = NULL;
    291   features_ = NULL;
    292   shingle_hashes_ = NULL;
    293   done_callback_.Reset();
    294   state_.reset(NULL);
    295 }
    296 
    297 }  // namespace safe_browsing
    298