Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // This class loads a client-side model and lets you compute a phishing score
      6 // for a set of previously extracted features.  The phishing score corresponds
      7 // to the probability that the features are indicative of a phishing site.
      8 //
// For more details on how the score is actually computed for a given model
// and a given set of features, read the comments in the client_model.proto
// file.
     11 //
     12 // See features.h for a list of features that are currently used.
     13 
     14 #ifndef CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
     15 #define CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
     16 
     17 #include <string>
     18 
     19 #include "base/basictypes.h"
     20 #include "base/containers/hash_tables.h"
     21 #include "base/strings/string_piece.h"
     22 #include "chrome/common/safe_browsing/client_model.pb.h"
     23 
     24 namespace safe_browsing {
     25 class FeatureMap;
     26 
// Holds a client-side model and scores feature maps against it.
//
// The destructor and ComputeScore() are virtual to simplify mocking of this
// class; the remaining accessors are non-virtual.
class Scorer {
 public:
  virtual ~Scorer();

  // Factory method which creates a new Scorer object by parsing the given
  // model.  If parsing fails this method returns NULL.
  // NOTE(review): the result is a raw pointer; presumably the caller takes
  // ownership of it -- confirm against the implementation and call sites.
  static Scorer* Create(const base::StringPiece& model_str);

  // Computes the probability that the given features are indicative of
  // phishing.  Returns a score value that falls in the range [0.0,1.0]
  // (range is inclusive on both ends).
  virtual double ComputeScore(const FeatureMap& features) const;

  // Returns the version number of the loaded client model.
  int model_version() const;

  // -- Accessors used by the page feature extractor ---------------------------

  // Returns a set of hashed page terms that appear in the model in binary
  // format.
  const base::hash_set<std::string>& page_terms() const;

  // Returns a set of hashed page words that appear in the model in binary
  // format.
  const base::hash_set<uint32>& page_words() const;

  // Returns the maximum number of words per term for the loaded model.
  size_t max_words_per_term() const;

  // Returns the murmurhash3 seed for the loaded model.
  uint32 murmurhash3_seed() const;

  // Returns the maximum number of unique shingle hashes per page.
  size_t max_shingles_per_page() const;

  // Returns the number of words in a shingle.
  size_t shingle_size() const;

 protected:
  // Most clients should use the factory method.  This constructor is protected
  // (rather than private) to allow for mock implementations that derive from
  // Scorer.
  Scorer();

 private:
  // Gives the PhishingScorerTest class access to the private members below.
  friend class PhishingScorerTest;

  // Computes the score for a given rule and feature map.  The score is computed
  // by multiplying the rule weight with the product of feature weights for the
  // given rule.  The feature weights are stored in the feature map.  If a
  // particular feature does not exist in the feature map we set its weight to
  // zero.
  double ComputeRuleScore(const ClientSideModel::Rule& rule,
                          const FeatureMap& features) const;

  // NOTE(review): presumably the parsed model filled in by Create(), and the
  // sets handed out by page_terms() / page_words() respectively -- confirm
  // against the implementation file.
  ClientSideModel model_;
  base::hash_set<std::string> page_terms_;
  base::hash_set<uint32> page_words_;

  // Scorer objects are neither copyable nor assignable.
  DISALLOW_COPY_AND_ASSIGN(Scorer);
};
     88 }  // namespace safe_browsing
     89 
     90 #endif  // CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
     91