Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // PhishingDOMFeatureExtractor handles computing DOM-based features for the
      6 // client-side phishing detection model.  These include the presence of various
      7 // types of elements, ratios of external and secure links, and tokens for
      8 // external domains linked to.
      9 
     10 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
     11 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
     12 
     13 #include <string>
     14 
     15 #include "base/basictypes.h"
     16 #include "base/callback.h"
     17 #include "base/memory/scoped_ptr.h"
     18 #include "base/memory/weak_ptr.h"
     19 #include "third_party/WebKit/public/web/WebDocument.h"
     20 
     21 class GURL;
     22 
     23 namespace WebKit {
     24 class WebElement;
     25 }
     26 
     27 namespace content {
     28 class RenderView;
     29 }
     30 
     31 namespace safe_browsing {
     32 class FeatureExtractorClock;
     33 class FeatureMap;
     34 
     35 class PhishingDOMFeatureExtractor {
     36  public:
     37   // Callback to be run when feature extraction finishes.  The callback
     38   // argument is true if extraction was successful, false otherwise.
     39   typedef base::Callback<void(bool)> DoneCallback;
     40 
     41   // Creates a PhishingDOMFeatureExtractor for the specified RenderView.
     42   // The PhishingDOMFeatureExtractor should be destroyed prior to destroying
     43   // the RenderView.  |clock| is used for timing feature extractor operations,
     44   // and may be mocked for testing.  The caller maintains ownership of the
     45   // clock.
     46   PhishingDOMFeatureExtractor(content::RenderView* render_view,
     47                               FeatureExtractorClock* clock);
     48   ~PhishingDOMFeatureExtractor();
     49 
     50   // Begins extracting features into the given FeatureMap for the page
     51   // currently loaded in this object's RenderView.  To avoid blocking the
     52   // render thread for too long, the feature extractor may run in several
     53   // chunks of work, posting a task to the current MessageLoop to continue
     54   // processing.  Once feature extraction is complete, |done_callback|
     55   // is run on the current thread.  PhishingDOMFeatureExtractor takes
     56   // ownership of the callback; the caller retains ownership of |features|.
     57   void ExtractFeatures(FeatureMap* features, const DoneCallback& done_callback);
     58 
     59   // Cancels any pending feature extraction.  The DoneCallback will not be run.
     60   // Must be called if there is a feature extraction in progress when the page
     61   // is unloaded or the PhishingDOMFeatureExtractor is destroyed.
     62   void CancelPendingExtraction();
     63 
     64  private:
     65   struct FrameData;         // Per-document state; see |cur_frame_data_|.
     66   struct PageFeatureState;  // Cross-frame state; see |page_feature_state_|.
     67 
     68   // The maximum amount of wall time that we will spend on a single extraction
     69   // iteration before pausing to let other MessageLoop tasks run.
     70   static const int kMaxTimePerChunkMs;
     71 
     72   // The number of elements that we will process before checking to see whether
     73   // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
     74   // slow, we don't do this on every element processed.
     75   static const int kClockCheckGranularity;
     76 
     77   // The maximum total amount of time that the feature extractor will run
     78   // before giving up on the current page.
     79   static const int kMaxTotalTimeMs;
     80 
     81   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
     82   // until a predefined maximum amount of time has elapsed, then posts a task
     83   // to the current MessageLoop to continue extraction.  When extraction
     84   // finishes, calls RunCallback().
     85   void ExtractFeaturesWithTimeout();
     86 
     87   // Handlers for the various HTML elements that we compute features for.
     88   // Since some of the features (such as ratios) cannot be computed until
     89   // feature extraction is finished, these handlers do not add to the feature
     90   // map directly.  Instead, they update the values in the PageFeatureState.
     91   void HandleLink(const WebKit::WebElement& element);
     92   void HandleForm(const WebKit::WebElement& element);
     93   void HandleImage(const WebKit::WebElement& element);
     94   void HandleInput(const WebKit::WebElement& element);
     95   void HandleScript(const WebKit::WebElement& element);
     96 
     97   // Helper to verify that there is no pending feature extraction.  Dies in
     98   // debug builds if the state is not as expected.  This is a no-op in release
     99   // builds.
    100   void CheckNoPendingExtraction();
    101 
    102   // Runs |done_callback_| and then clears all internal state.
    103   void RunCallback(bool success);
    104 
    105   // Clears all internal feature extraction state.
    106   void Clear();
    107 
    108   // Called after advancing |cur_document_| to update the state in
    109   // |cur_frame_data_|.
    110   void ResetFrameData();
    111 
    112   // Returns the next document in frame-traversal order from cur_document_.
    113   // If there are no more documents, returns a null WebDocument.
    114   WebKit::WebDocument GetNextDocument();
    115 
    116   // Given a URL, checks whether the domain is different from the domain of
    117   // the current frame's URL.  If so, stores the domain in |domain| and returns
    118   // true, otherwise returns false.
    119   bool IsExternalDomain(const GURL& url, std::string* domain) const;
    120 
    121   // Called once all frames have been processed to compute features from the
    122   // PageFeatureState and add them to |features_|.  See features.h for a
    123   // description of which features are computed.
    124   void InsertFeatures();
    125 
    126   // Non-owned pointer to the view that we will extract features from.
    127   content::RenderView* render_view_;
    128 
    129   // Non-owned pointer to our clock.
    130   FeatureExtractorClock* clock_;
    131 
    132   // The output parameters from the most recent call to ExtractFeatures().
    133   FeatureMap* features_;  // The caller keeps ownership of this.
    134   DoneCallback done_callback_;  // Run by RunCallback().
    135 
    136   // The current (sub-)document that we are processing.  May be a null document
    137   // (isNull()) if we are not currently extracting features.
    138   WebKit::WebDocument cur_document_;
    139 
    140   // Stores extra state for |cur_document_| that will be persisted until we
    141   // advance to the next frame.
    142   scoped_ptr<FrameData> cur_frame_data_;
    143 
    144   // Stores the intermediate data used to create features.  This data is
    145   // accumulated across all frames in the RenderView.
    146   scoped_ptr<PageFeatureState> page_feature_state_;
    147 
    148   // Used in scheduling ExtractFeaturesWithTimeout tasks.
    149   // These pointers are invalidated if extraction is cancelled.
    150   base::WeakPtrFactory<PhishingDOMFeatureExtractor> weak_factory_;
    151 
    152   DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
    153 };
    154 
    155 }  // namespace safe_browsing
    156 
    157 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
    158