1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/renderer/safe_browsing/scorer.h" 6 7 #include <math.h> 8 9 #include "base/logging.h" 10 #include "base/memory/scoped_ptr.h" 11 #include "base/metrics/histogram.h" 12 #include "base/strings/string_piece.h" 13 #include "chrome/common/safe_browsing/client_model.pb.h" 14 #include "chrome/renderer/safe_browsing/features.h" 15 16 namespace { 17 // Enum used to keep stats about the status of the Scorer creation. 18 enum ScorerCreationStatus { 19 SCORER_SUCCESS, 20 SCORER_FAIL_MODEL_OPEN_FAIL, // Not used anymore 21 SCORER_FAIL_MODEL_FILE_EMPTY, // Not used anymore 22 SCORER_FAIL_MODEL_FILE_TOO_LARGE, // Not used anymore 23 SCORER_FAIL_MODEL_PARSE_ERROR, 24 SCORER_FAIL_MODEL_MISSING_FIELDS, 25 SCORER_STATUS_MAX // Always add new values before this one. 26 }; 27 28 void RecordScorerCreationStatus(ScorerCreationStatus status) { 29 UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.ScorerCreationStatus", 30 status, 31 SCORER_STATUS_MAX); 32 } 33 } // namespace 34 35 namespace safe_browsing { 36 37 // Helper function which converts log odds to a probability in the range 38 // [0.0,1.0]. 39 static double LogOdds2Prob(double log_odds) { 40 // 709 = floor(1023*ln(2)). 2**1023 is the largest finite double. 41 // Small log odds aren't a problem. as the odds will be 0. It's only 42 // when we get +infinity for the odds, that odds/(odds+1) would be NaN. 43 if (log_odds >= 709) { 44 return 1.0; 45 } 46 double odds = exp(log_odds); 47 return odds/(odds+1.0); 48 } 49 50 Scorer::Scorer() {} 51 Scorer::~Scorer() {} 52 53 /* static */ 54 Scorer* Scorer::Create(const base::StringPiece& model_str) { 55 scoped_ptr<Scorer> scorer(new Scorer()); 56 ClientSideModel& model = scorer->model_; 57 if (!model.ParseFromArray(model_str.data(), model_str.size())) { 58 DLOG(ERROR) << "Unable to parse phishing model. This Scorer object is " 59 << "invalid."; 60 RecordScorerCreationStatus(SCORER_FAIL_MODEL_PARSE_ERROR); 61 return NULL; 62 } else if (!model.IsInitialized()) { 63 DLOG(ERROR) << "Unable to parse phishing model. The model is missing " 64 << "some required fields. Maybe the .proto file changed?"; 65 RecordScorerCreationStatus(SCORER_FAIL_MODEL_MISSING_FIELDS); 66 return NULL; 67 } 68 RecordScorerCreationStatus(SCORER_SUCCESS); 69 for (int i = 0; i < model.page_term_size(); ++i) { 70 scorer->page_terms_.insert(model.hashes(model.page_term(i))); 71 } 72 for (int i = 0; i < model.page_word_size(); ++i) { 73 scorer->page_words_.insert(model.page_word(i)); 74 } 75 return scorer.release(); 76 } 77 78 double Scorer::ComputeScore(const FeatureMap& features) const { 79 double logodds = 0.0; 80 for (int i = 0; i < model_.rule_size(); ++i) { 81 logodds += ComputeRuleScore(model_.rule(i), features); 82 } 83 return LogOdds2Prob(logodds); 84 } 85 86 int Scorer::model_version() const { 87 return model_.version(); 88 } 89 90 const base::hash_set<std::string>& Scorer::page_terms() const { 91 return page_terms_; 92 } 93 94 const base::hash_set<uint32>& Scorer::page_words() const { 95 return page_words_; 96 } 97 98 size_t Scorer::max_words_per_term() const { 99 return model_.max_words_per_term(); 100 } 101 102 uint32 Scorer::murmurhash3_seed() const { 103 return model_.murmur_hash_seed(); 104 } 105 106 double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule, 107 const FeatureMap& features) const { 108 const base::hash_map<std::string, double>& feature_map = features.features(); 109 double rule_score = 1.0; 110 for (int i = 0; i < rule.feature_size(); ++i) { 111 base::hash_map<std::string, double>::const_iterator it = feature_map.find( 112 model_.hashes(rule.feature(i))); 113 if (it == feature_map.end() || it->second == 0.0) { 114 // If the feature of the rule does not exist in the given feature map the 115 // feature weight is considered to be zero. If the feature weight is zero 116 // we leave early since we know that the rule score will be zero. 117 return 0.0; 118 } 119 rule_score *= it->second; 120 } 121 return rule_score * rule.weight(); 122 } 123 } // namespace safe_browsing 124