Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // PhishingTermFeatureExtractor handles computing term features from the text
      6 // of a web page for the client-side phishing detection model.  To do this, it
      7 // takes a list of terms that appear in the model, and scans through the page
      8 // text looking for them.  Any terms that appear will cause a corresponding
      9 // features::kPageTerm feature to be added to the FeatureMap.
     10 //
     11 // To make it harder for a phisher to enumerate all of the relevant terms in
     12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
     13 //
     14 // There is one PhishingTermFeatureExtractor per RenderView.
     15 
     16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
     17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
     18 
     19 #include <set>
     20 #include <string>
     21 
     22 #include "base/basictypes.h"
     23 #include "base/callback.h"
     24 #include "base/containers/hash_tables.h"
     25 #include "base/memory/scoped_ptr.h"
     26 #include "base/memory/weak_ptr.h"
     27 #include "base/strings/string16.h"
     28 #include "base/strings/string_piece.h"
     29 
     30 namespace safe_browsing {
     31 class FeatureExtractorClock;
     32 class FeatureMap;
     33 
     34 class PhishingTermFeatureExtractor {
     35  public:
     36   // Callback to be run when feature extraction finishes.  The callback
     37   // argument is true if extraction was successful, false otherwise.
     38   typedef base::Callback<void(bool)> DoneCallback;
     39 
     40   // Creates a PhishingTermFeatureExtractor which will extract features for
     41   // all of the terms whose SHA-256 hashes are in |page_term_hashes|.  These
     42   // terms may be multi-word n-grams, with at most |max_words_per_term| words.
     43   //
     44   // |page_word_hashes| contains the murmur3 hashes for all of the individual
     45   // words that make up the terms.  Both the terms and the words are UTF-8
     46   // encoded and lowercased prior to hashing.  The caller owns both hash sets,
     47   // and must ensure that they are valid until the
     48   // PhishingTermFeatureExtractor is destroyed.
     49   //
     50   // In addition to extracting page terms, we also extract a text shingling
     51   // sketch, which consists of hashes of N-gram-words (referred to as shingles)
     52   // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines
     53   // the maximum number of unique shingle hashes we extract per page.
     54   //
     55   // |clock| is used for timing feature extractor operations, and may be mocked
     56   // for testing.  The caller keeps ownership of the clock.
     57   PhishingTermFeatureExtractor(
     58       const base::hash_set<std::string>* page_term_hashes,
     59       const base::hash_set<uint32>* page_word_hashes,
     60       size_t max_words_per_term,
     61       uint32 murmurhash3_seed,
     62       size_t max_shingles_per_page,
     63       size_t shingle_size,
     64       FeatureExtractorClock* clock);
     65   ~PhishingTermFeatureExtractor();
     66 
     67   // Begins extracting features from |page_text| into |features| and
     68   // |shingle_hashes|.  |page_text| should contain the plain text of a web page,
     69   // including any subframes, as returned by RenderView::CaptureText().
     70   //
     71   // To avoid blocking the render thread for too long, the feature extractor
     72   // may run in several chunks of work, posting a task to the current
     73   // MessageLoop to continue processing.  Once feature extraction is complete,
     74   // |done_callback| is run on the current thread.
     75   // PhishingTermFeatureExtractor takes ownership of the callback.
     76   //
     77   // |page_text|, |features|, and |shingle_hashes| are owned by the caller,
     78   // and must not be destroyed until either |done_callback| is run or
     79   // CancelPendingExtraction() is called.
     80   void ExtractFeatures(const base::string16* page_text,
     81                        FeatureMap* features,
     82                        std::set<uint32>* shingle_hashes,
     83                        const DoneCallback& done_callback);
     84 
     85   // Cancels any pending feature extraction.  The DoneCallback will not be run.
     86   // Must be called if there is a feature extraction in progress when the page
     87   // is unloaded or the PhishingTermFeatureExtractor is destroyed.
     88   void CancelPendingExtraction();
     89 
     90  private:
     91   struct ExtractionState;
     92 
     93   // The maximum amount of wall time that we will spend on a single extraction
     94   // iteration before pausing to let other MessageLoop tasks run.
     95   static const int kMaxTimePerChunkMs;
     96 
     97   // The number of words that we will process before checking to see whether
     98   // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
     99   // slow, we don't do this on every word processed.
    100   static const int kClockCheckGranularity;
    101 
    102   // The maximum total amount of time that the feature extractor will run
    103   // before giving up on the current page.
    104   static const int kMaxTotalTimeMs;
    105 
    106   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
    107   // until a predefined maximum amount of time has elapsed, then posts a task
    108   // to the current MessageLoop to continue extraction.  When extraction
    109   // finishes, calls RunCallback().
    110   void ExtractFeaturesWithTimeout();
    111 
    112   // Handles a single word in the page text.
    113   void HandleWord(const base::StringPiece16& word);
    114 
    115   // Helper to verify that there is no pending feature extraction.  Dies in
    116   // debug builds if the state is not as expected.  This is a no-op in release
    117   // builds.
    118   void CheckNoPendingExtraction();
    119 
    120   // Runs |done_callback_| and then clears all internal state.
    121   void RunCallback(bool success);
    122 
    123   // Clears all internal feature extraction state.
    124   void Clear();
    125 
    126   // All of the term hashes that we are looking for in the page.
    127   const base::hash_set<std::string>* page_term_hashes_;
    128 
    129   // Murmur3 hashes of all the individual words in page_term_hashes_.  If
    130   // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
    131   // would contain (hashed) "one" and "two".  We do this so that we can have a
    132   // quick out in the common case that the current word we are processing
    133   // doesn't contain any part of one of our terms.
    134   const base::hash_set<uint32>* page_word_hashes_;
    135 
    136   // The maximum number of words in an n-gram.
    137   const size_t max_words_per_term_;
    138 
    139   // The seed for murmurhash3.
    140   const uint32 murmurhash3_seed_;
    141 
    142   // The maximum number of unique shingle hashes we extract in a page.
    143   const size_t max_shingles_per_page_;
    144 
    145   // The number of words in a shingle.
    146   const size_t shingle_size_;
    147 
    148   // Non-owned pointer to our clock.
    149   FeatureExtractorClock* clock_;
    150 
    151   // The parameters passed to the most recent call to ExtractFeatures().
    152   const base::string16* page_text_;  // The caller keeps ownership of this.
    153   FeatureMap* features_;  // The caller keeps ownership of this.
    154   std::set<uint32>* shingle_hashes_;  // The caller keeps ownership of this.
    155   DoneCallback done_callback_;  // Run by RunCallback().
    156 
    157   // Stores the current state of term extraction from |page_text_|.
    158   scoped_ptr<ExtractionState> state_;
    159 
    160   // Used in scheduling ExtractFeaturesWithTimeout tasks.  The weak pointers it
    161   // hands out are invalidated if extraction is cancelled.
    162   base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;
    163 
    164   DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
    165 };
    166 
    167 }  // namespace safe_browsing
    168 
    169 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
    170