Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // This proto represents a machine learning model which is used to compute
      6 // the probability that a particular page visited by Chrome is phishing.
      7 //
      8 // Note: since the machine learning model is trained on the server-side and then
      9 // downloaded onto the client it is important that this proto file stays in
     10 // sync with the server-side copy.  Otherwise, the client may not be able to
     11 // parse the server generated model anymore.  If you want to change this
     12 // protocol definition or you have questions regarding its format please contact
     13 // chrome-anti-phishing@googlegroups.com.
     14 
     15 syntax = "proto2";
     16 
     17 option optimize_for = LITE_RUNTIME;
     18 
     19 package safe_browsing;
     20 
     21 // This protocol buffer represents a machine learning model that is used in
     22 // client-side phishing detection (in Chrome).  The client extracts a set
     23 // of features from every website the user visits.  Extracted features map
     24 // feature names to floating point values (e.g., PageSecureLinksFreq -> 0.9).
     25 //
     26 // To compute the phishing score (i.e., the probability that the website is
     27 // phishing) a scorer will simply compute the sum of all rule scores for a
     28 // given set of extracted features.  The score of a particular rule corresponds
     29 // to the product of all feature values that are part of the rule times the
     30 // rule weight.  If a feature has no value (i.e., is not part of the extracted
     31 // features) its value will be set to zero.  The overall score is computed
     32 // by summing up all the rule scores.  This overall score is a logodds and can
     33 // be converted to a probability like this:
     34 // p = exp(logodds) / (exp(logodds) + 1).
     35 //
     36 // To make it harder for phishers to reverse engineer our machine learning model
     37 // all the features in the model are hashed with a sha256 hash function.  The
     38 // feature extractors also hash the extracted features before scoring happens.
     39 message ClientSideModel {
     40   // In order to save some space we store all the hashed strings in a
     41   // single repeated field and then the rules as well as page terms
     42   // and page words refer to an index in that repeated field.  All
     43   // hashes are sha256 hashes stored in binary format.
     44   repeated bytes hashes = 1;
     45 
     46   message Rule {
     47     // List of indexes into hashes above which are basically hashed
     48     // features that form the current rule.
     49     repeated int32 feature = 1;
     50 
     51     // The weight for this particular rule.
     52     required float weight = 2;
     53   }
     54 
     55   // List of rules which make up the model
     56   repeated Rule rule = 2;
     57 
     58   // List of indexes that point to the hashed page terms that appear in
     59   // the model.  The hashes are computed over page terms that are encoded
     60   // as lowercase UTF-8 strings.
     61   repeated int32 page_term = 3;
     62 
     63   // List of hashed page words.  The page words correspond to all words that
     64   // appear in page terms.  If the term "one two" is in the list of page terms
     65   // then "one" and "two" will be in the list of page words.  For page words
     66   // we don't use SHA256 because it is too expensive.  We use MurmurHash3
     67   // instead.  See: http://code.google.com/p/smhasher.
     68   repeated fixed32 page_word = 4;
     69 
     70   // Page terms in page_term contain at most this many page words.
     71   required int32 max_words_per_term = 5;
     72 
     73   // Model version number.  Every model that we train should have a different
     74   // version number and it should always be larger than the previous model
     75   // version.
     76   optional int32 version = 6;
     77 
     78   // List of known bad IP subnets.
     79   message IPSubnet {
     80     // The subnet prefix is a valid 16-byte IPv6 address (in network order) that
     81     // is hashed using sha256.
     82     required bytes prefix = 1;
     83 
     84     // Network prefix size in bits.  Default is an exact-host match.
     85     optional int32 size = 2 [default = 128];
     86   };
     87   repeated IPSubnet bad_subnet = 7;
     88 
     89   // Murmur hash seed that was used to hash the page words.
     90   optional fixed32 murmur_hash_seed = 8;
     91 
     92   // Maximum number of unique shingle hashes per page.
     93   optional int32 max_shingles_per_page = 9 [default = 200];
     94 
     95   // The number of words in a shingle.
     96   optional int32 shingle_size = 10 [default = 4];
     97 }
     98