Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // This class handles the process of extracting all of the features from a
      6 // page and computing a phishiness score.  The basic steps are:
      7 //  - Run each feature extractor over the page, building up a FeatureMap of
      8 //    feature -> value.
      9 //  - SHA-256 hash all of the feature names in the map so that they match the
     10 //    supplied model.
     11 //  - Hand the hashed map off to a Scorer, which computes the probability that
     12 //    the page is phishy.
     13 //  - Run the supplied callback with the resulting verdict.
     14 //
     15 // For more details, see phishing_*_feature_extractor.h, scorer.h, and
     16 // client_model.proto.
     17 
     18 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
     19 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
     20 
     21 #include <set>
     22 
     23 #include "base/basictypes.h"
     24 #include "base/callback.h"
     25 #include "base/memory/scoped_ptr.h"
     26 #include "base/memory/weak_ptr.h"
     27 #include "base/strings/string16.h"
     28 
     29 namespace content {
     30 class RenderView;
     31 }
     32 
     33 namespace safe_browsing {
     34 class ClientPhishingRequest;
     35 class FeatureExtractorClock;
     36 class FeatureMap;
     37 class PhishingDOMFeatureExtractor;
     38 class PhishingTermFeatureExtractor;
     39 class PhishingUrlFeatureExtractor;
     40 class Scorer;
     41 
     42 class PhishingClassifier {
     43  public:
     44   // Callback to be run when phishing classification finishes.  The argument
     45   // is a ClientPhishingRequest which contains the verdict computed by the
     46   // classifier as well as the extracted features.  If verdict.is_phishing()
     47   // is true, the page is considered phishy by the client-side model,
     48   // and the browser should ping back to get a final verdict.
     49   // verdict.client_score() is set to kInvalidScore if classification failed.
     50   typedef base::Callback<void(const ClientPhishingRequest& /* verdict */)>
     51       DoneCallback;
     52 
     53   static const float kInvalidScore;  // Score reported on failure; see above.
     54 
     55   // Creates a new PhishingClassifier object that will operate on
     56   // |render_view|.  |clock| is used to time feature extractor operations, and
     57   // the PhishingClassifier takes ownership of |clock|.  Note that the
     58   // classifier will not be 'ready' until set_phishing_scorer() is called.
     59   PhishingClassifier(content::RenderView* render_view,
     60                      FeatureExtractorClock* clock);
     61   virtual ~PhishingClassifier();
     62 
     63   // Sets a scorer for the classifier to use in computing the phishiness score.
     64   // |scorer| must live at least as long as the PhishingClassifier.  The caller
     65   // is expected to cancel any pending classification before setting a phishing
     66   // scorer.
     67   void set_phishing_scorer(const Scorer* scorer);
     68 
     69   // Returns true if the classifier is ready to classify pages, i.e. it
     70   // has had a scorer set via set_phishing_scorer().
     71   bool is_ready() const;
     72 
     73   // Called by the RenderView when a page has finished loading.  This begins
     74   // the feature extraction and scoring process.  |page_text| should contain
     75   // the plain text of a web page, including any subframes, as returned by
     76   // RenderView::CaptureText().  |page_text| is owned by the caller, and must
     77   // not be destroyed until either |callback| is run or
     78   // CancelPendingClassification() is called.
     79   //
     80   // To avoid blocking the render thread for too long, phishing classification
     81   // may run in several chunks of work, posting a task to the current
     82   // MessageLoop to continue processing.  Once the scoring process is complete,
     83   // |callback| is run on the current thread.  PhishingClassifier takes
     84   // ownership of the callback.
     85   //
     86   // It is an error to call BeginClassification if the classifier is not yet
     87   // ready.
     88   virtual void BeginClassification(const base::string16* page_text,
     89                                    const DoneCallback& callback);
     90 
     91   // Called by the RenderView (on the render thread) when a page is unloading
     92   // or the RenderView is being destroyed.  This cancels any extraction that
     93   // is in progress.  It is an error to call CancelPendingClassification if
     94   // the classifier is not yet ready.
     95   virtual void CancelPendingClassification();
     96 
     97  private:
     98   // Any score equal to or above this value is considered phishy.
     99   static const float kPhishyThreshold;
    100 
    101   // Begins the feature extraction process, by extracting URL features and
    102   // beginning DOM feature extraction.
    103   void BeginFeatureExtraction();
    104 
    105   // Callback to be run when DOM feature extraction is complete.
    106   // If it was successful, begins term feature extraction, otherwise
    107   // runs the DoneCallback with a non-phishy verdict.
    108   void DOMExtractionFinished(bool success);
    109 
    110   // Callback to be run when term feature extraction is complete.
    111   // If it was successful, computes a score and runs the DoneCallback.
    112   // If extraction was unsuccessful, runs the DoneCallback with a
    113   // non-phishy verdict.
    114   void TermExtractionFinished(bool success);
    115 
    116   // Helper to verify that there is no pending phishing classification.  Dies
    117   // in debug builds if the state is not as expected.  This is a no-op in
    118   // release builds.
    119   void CheckNoPendingClassification();
    120 
    121   // Helper method to run the DoneCallback and clear the state.
    122   void RunCallback(const ClientPhishingRequest& verdict);
    123 
    124   // Helper to run the DoneCallback when feature extraction has failed.
    125   // This always signals a non-phishy verdict for the page, with kInvalidScore.
    126   void RunFailureCallback();
    127 
    128   // Clears the current state of the PhishingClassifier.
    129   void Clear();
    130 
    131   content::RenderView* render_view_;  // owns us
    132   const Scorer* scorer_;  // owned by the caller
    133   scoped_ptr<FeatureExtractorClock> clock_;
    134   scoped_ptr<PhishingUrlFeatureExtractor> url_extractor_;
    135   scoped_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
    136   scoped_ptr<PhishingTermFeatureExtractor> term_extractor_;
    137 
    138   // State for any in-progress extraction.
    139   scoped_ptr<FeatureMap> features_;
    140   scoped_ptr<std::set<uint32> > shingle_hashes_;
    141   const base::string16* page_text_;  // owned by the caller
    142   DoneCallback done_callback_;
    143 
    144   // Used in scheduling BeginFeatureExtraction tasks.
    145   // Its weak pointers are invalidated if classification is cancelled.
    146   base::WeakPtrFactory<PhishingClassifier> weak_factory_;
    147 
    148   DISALLOW_COPY_AND_ASSIGN(PhishingClassifier);
    149 };
    150 
    151 }  // namespace safe_browsing
    152 
    153 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
    154