Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/scorer.h"
      6 
      7 #include <math.h>
      8 
      9 #include "base/logging.h"
     10 #include "base/memory/scoped_ptr.h"
     11 #include "base/metrics/histogram.h"
     12 #include "base/strings/string_piece.h"
     13 #include "chrome/common/safe_browsing/client_model.pb.h"
     14 #include "chrome/renderer/safe_browsing/features.h"
     15 
     16 namespace {
     17 // Enum used to keep stats about the status of the Scorer creation.
     18 enum ScorerCreationStatus {
     19   SCORER_SUCCESS,
     20   SCORER_FAIL_MODEL_OPEN_FAIL,  // Not used anymore
     21   SCORER_FAIL_MODEL_FILE_EMPTY,  // Not used anymore
     22   SCORER_FAIL_MODEL_FILE_TOO_LARGE,  // Not used anymore
     23   SCORER_FAIL_MODEL_PARSE_ERROR,
     24   SCORER_FAIL_MODEL_MISSING_FIELDS,
     25   SCORER_STATUS_MAX  // Always add new values before this one.
     26 };
     27 
     28 void RecordScorerCreationStatus(ScorerCreationStatus status) {
     29   UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.ScorerCreationStatus",
     30                             status,
     31                             SCORER_STATUS_MAX);
     32 }
     33 }  // namespace
     34 
     35 namespace safe_browsing {
     36 
     37 // Helper function which converts log odds to a probability in the range
     38 // [0.0,1.0].
     39 static double LogOdds2Prob(double log_odds) {
     40   // 709 = floor(1023*ln(2)).  2**1023 is the largest finite double.
     41   // Small log odds aren't a problem.  as the odds will be 0.  It's only
     42   // when we get +infinity for the odds, that odds/(odds+1) would be NaN.
     43   if (log_odds >= 709) {
     44     return 1.0;
     45   }
     46   double odds = exp(log_odds);
     47   return odds/(odds+1.0);
     48 }
     49 
     50 Scorer::Scorer() {}
     51 Scorer::~Scorer() {}
     52 
     53 /* static */
     54 Scorer* Scorer::Create(const base::StringPiece& model_str) {
     55   scoped_ptr<Scorer> scorer(new Scorer());
     56   ClientSideModel& model = scorer->model_;
     57   if (!model.ParseFromArray(model_str.data(), model_str.size())) {
     58     DLOG(ERROR) << "Unable to parse phishing model.  This Scorer object is "
     59                 << "invalid.";
     60     RecordScorerCreationStatus(SCORER_FAIL_MODEL_PARSE_ERROR);
     61     return NULL;
     62   } else if (!model.IsInitialized()) {
     63     DLOG(ERROR) << "Unable to parse phishing model.  The model is missing "
     64                 << "some required fields.  Maybe the .proto file changed?";
     65     RecordScorerCreationStatus(SCORER_FAIL_MODEL_MISSING_FIELDS);
     66     return NULL;
     67   }
     68   RecordScorerCreationStatus(SCORER_SUCCESS);
     69   for (int i = 0; i < model.page_term_size(); ++i) {
     70     scorer->page_terms_.insert(model.hashes(model.page_term(i)));
     71   }
     72   for (int i = 0; i < model.page_word_size(); ++i) {
     73     scorer->page_words_.insert(model.page_word(i));
     74   }
     75   return scorer.release();
     76 }
     77 
     78 double Scorer::ComputeScore(const FeatureMap& features) const {
     79   double logodds = 0.0;
     80   for (int i = 0; i < model_.rule_size(); ++i) {
     81     logodds += ComputeRuleScore(model_.rule(i), features);
     82   }
     83   return LogOdds2Prob(logodds);
     84 }
     85 
     86 int Scorer::model_version() const {
     87   return model_.version();
     88 }
     89 
     90 const base::hash_set<std::string>& Scorer::page_terms() const {
     91   return page_terms_;
     92 }
     93 
     94 const base::hash_set<uint32>& Scorer::page_words() const {
     95   return page_words_;
     96 }
     97 
     98 size_t Scorer::max_words_per_term() const {
     99   return model_.max_words_per_term();
    100 }
    101 
    102 uint32 Scorer::murmurhash3_seed() const {
    103   return model_.murmur_hash_seed();
    104 }
    105 
    106 size_t Scorer::max_shingles_per_page() const {
    107   return model_.max_shingles_per_page();
    108 }
    109 
    110 size_t Scorer::shingle_size() const {
    111   return model_.shingle_size();
    112 }
    113 
    114 double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
    115                                 const FeatureMap& features) const {
    116   const base::hash_map<std::string, double>& feature_map = features.features();
    117   double rule_score = 1.0;
    118   for (int i = 0; i < rule.feature_size(); ++i) {
    119     base::hash_map<std::string, double>::const_iterator it = feature_map.find(
    120         model_.hashes(rule.feature(i)));
    121     if (it == feature_map.end() || it->second == 0.0) {
    122       // If the feature of the rule does not exist in the given feature map the
    123       // feature weight is considered to be zero.  If the feature weight is zero
    124       // we leave early since we know that the rule score will be zero.
    125       return 0.0;
    126     }
    127     rule_score *= it->second;
    128   }
    129   return rule_score * rule.weight();
    130 }
    131 }  // namespace safe_browsing
    132