1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // This class loads a client-side model and lets you compute a phishing score 6 // for a set of previously extracted features. The phishing score corresponds 7 // to the probability that the features are indicative of a phishing site. 8 // 9 // For more details on how the score is actually computed for a given model 10 // and a given set of features read the comments in client_model.proto file. 11 // 12 // See features.h for a list of features that are currently used. 13 14 #ifndef CHROME_RENDERER_SAFE_BROWSING_SCORER_H_ 15 #define CHROME_RENDERER_SAFE_BROWSING_SCORER_H_ 16 17 #include <string> 18 19 #include "base/basictypes.h" 20 #include "base/containers/hash_tables.h" 21 #include "base/strings/string_piece.h" 22 #include "chrome/common/safe_browsing/client_model.pb.h" 23 24 namespace safe_browsing { 25 class FeatureMap; 26 27 // Scorer methods are virtual to simplify mocking of this class. 28 class Scorer { 29 public: 30 virtual ~Scorer(); 31 32 // Factory method which creates a new Scorer object by parsing the given 33 // model. If parsing fails this method returns NULL. 34 static Scorer* Create(const base::StringPiece& model_str); 35 36 // This method computes the probability that the given features are indicative 37 // of phishing. It returns a score value that falls in the range [0.0,1.0] 38 // (range is inclusive on both ends). 39 virtual double ComputeScore(const FeatureMap& features) const; 40 41 // Returns the version number of the loaded client model. 42 int model_version() const; 43 44 // -- Accessors used by the page feature extractor --------------------------- 45 46 // Returns a set of hashed page terms that appear in the model in binary 47 // format. 48 const base::hash_set<std::string>& page_terms() const; 49 50 // Returns a set of hashed page words that appear in the model in binary 51 // format. 52 const base::hash_set<uint32>& page_words() const; 53 54 // Return the maximum number of words per term for the loaded model. 55 size_t max_words_per_term() const; 56 57 // Returns the murmurhash3 seed for the loaded model. 58 uint32 murmurhash3_seed() const; 59 60 protected: 61 // Most clients should use the factory method. This constructor is public 62 // to allow for mock implementations. 63 Scorer(); 64 65 private: 66 friend class PhishingScorerTest; 67 68 // Computes the score for a given rule and feature map. The score is computed 69 // by multiplying the rule weight with the product of feature weights for the 70 // given rule. The feature weights are stored in the feature map. If a 71 // particular feature does not exist in the feature map we set its weight to 72 // zero. 73 double ComputeRuleScore(const ClientSideModel::Rule& rule, 74 const FeatureMap& features) const; 75 76 ClientSideModel model_; 77 base::hash_set<std::string> page_terms_; 78 base::hash_set<uint32> page_words_; 79 80 DISALLOW_COPY_AND_ASSIGN(Scorer); 81 }; 82 } // namepsace safe_browsing 83 84 #endif // CHROME_RENDERER_SAFE_BROWSING_SCORER_H_ 85