Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
      5 // Common types and constants for extracting and evaluating features in the
      6 // client-side phishing detection model.  A feature is simply a string and an
      7 // associated floating-point value between 0 and 1.  The phishing
      8 // classification model contains rules which give an appropriate weight to each
      9 // feature or combination of features.  These values can then be summed to
     10 // compute a final phishiness score.
     11 //
     12 // Some features are boolean features.  If these features are set, they always
     13 // have a value of 0.0 or 1.0.  In practice, the features are only set if the
     14 // value is true (1.0).
     15 //
     16 // We also use token features.  These features have a unique name that is
     17 // constructed from the URL or page contents that we are classifying, for
     18 // example, "UrlDomain=chromium".  These features are also always set to 1.0
     19 // if they are present.
     20 //
     21 // The intermediate storage of the features for a URL is a FeatureMap, which is
     22 // just a thin wrapper around a map of feature name to value.  The entire set
     23 // of features for a URL is extracted before we do any scoring.
     24 
     25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
     26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
     27 
     28 #include <string>
     29 #include "base/basictypes.h"
     30 #include "base/containers/hash_tables.h"
     31 
     32 namespace safe_browsing {
     33 
     34 // Holds the features extracted for a URL as a map from feature name to
     35 // value, and enforces limits such as a maximum number of map entries.
     36 class FeatureMap {
     37  public:
     38   FeatureMap();
     39   ~FeatureMap();
     40 
     41   // Adds the boolean feature |name| to this FeatureMap with a value of 1.0.
     42   // Returns true on success, or false if the feature map exceeds
     43   // kMaxFeatureMapSize.
     44   bool AddBooleanFeature(const std::string& name);
     45 
     46   // Adds the real-valued feature |name| to this FeatureMap with |value|.
     47   // |value| must be in the range [0.0, 1.0].  Returns true on success, or
     48   // false if the feature map exceeds kMaxFeatureMapSize or |value| is
     49   // outside of the allowed range.
     50   bool AddRealFeature(const std::string& name, double value);
     51 
     52   // Read-only access to the current features (feature name -> value).
     53   const base::hash_map<std::string, double>& features() const {
     54     return features_;  // A reference to the member map, not a copy.
     55   }
     56 
     57   // Clears the set of features in the map.
     58   void Clear();
     59 
     60   // Upper bound on the number of features that will be extracted.  We should
     61   // never hit this cap; it is a sanity check that keeps the FeatureMap from
     62   // growing too large.  Only declared here; the value is defined elsewhere.
     63   static const size_t kMaxFeatureMapSize;
     64 
     65  private:
     66   base::hash_map<std::string, double> features_;  // Feature name -> value.
     67   // Instances are not copyable or assignable.
     68   DISALLOW_COPY_AND_ASSIGN(FeatureMap);
     69 };
     70 
     71 namespace features {
     72 // Constants for the various feature names that we use.  Only declared here;
     73 // the string values are defined elsewhere.
     74 // IMPORTANT: when adding new features, you must update kAllowedFeatures in
     75 // chrome/browser/safe_browsing/client_side_detection_service.cc if the feature
     76 // should be sent in sanitized pingbacks.
     77 
     78 ////////////////////////////////////////////////////
     79 // URL host features
     80 ////////////////////////////////////////////////////
     81 
     82 // Boolean feature, set if the URL's hostname is an IP address.
     83 extern const char kUrlHostIsIpAddress[];
     84 // Token feature containing the portion of the hostname controlled by a
     85 // registrar, for example "com" or "co.uk".
     86 extern const char kUrlTldToken[];
     87 // Token feature containing the first host component below the registrar.
     88 // For example, in "www.google.com", the domain would be "google".
     89 extern const char kUrlDomainToken[];
     90 // Token feature containing each host component below the domain.
     91 // For example, in "www.host.example.com", both "www" and "host" would be
     92 // "other host tokens".
     93 extern const char kUrlOtherHostToken[];
     94 
     95 ////////////////////////////////////////////////////
     96 // Aggregate features for URL host tokens
     97 ////////////////////////////////////////////////////
     98 
     99 // Boolean feature, set if the number of "other" host tokens for a URL is
    100 // greater than one.  Longer hostnames, regardless of the specific tokens,
    101 // can be a signal that the URL is phishy.
    102 extern const char kUrlNumOtherHostTokensGTOne[];
    103 // Boolean feature, set if a URL has more than three "other" host tokens.
    104 extern const char kUrlNumOtherHostTokensGTThree[];
    105 
    106 ////////////////////////////////////////////////////
    107 // URL path token features
    108 ////////////////////////////////////////////////////
    109 
    110 // Token feature containing each alphanumeric string in the path that is at
    111 // least 3 characters long.  For example, "/abc/d/efg" would have 2 path
    112 // token features, "abc" and "efg".  Query parameters are not included.
    113 extern const char kUrlPathToken[];
    114 
    115 ////////////////////////////////////////////////////
    116 // DOM HTML form features
    117 ////////////////////////////////////////////////////
    118 
    119 // Boolean feature, set if the page has any <form> elements.
    120 extern const char kPageHasForms[];
    121 // Real-valued feature: the fraction of form elements whose |action|
    122 // attribute points to a URL on a different domain from the document URL.
    123 extern const char kPageActionOtherDomainFreq[];
    124 
    125 // Boolean feature, set if the page has any <input type="text"> elements
    126 // (includes inputs with missing or unknown types).
    127 extern const char kPageHasTextInputs[];
    128 // Boolean feature, set if the page has any <input type="password"> elements.
    129 extern const char kPageHasPswdInputs[];
    130 // Boolean feature, set if the page has any <input type="radio"> elements.
    131 extern const char kPageHasRadioInputs[];
    132 // Boolean feature, set if the page has any <input type="checkbox"> elements.
    133 extern const char kPageHasCheckInputs[];
    134 
    135 ////////////////////////////////////////////////////
    136 // DOM HTML link features
    137 ////////////////////////////////////////////////////
    138 
    139 // Real-valued feature: the fraction of links in the page which point to a
    140 // domain other than the domain of the document.  See "URL host features"
    141 // above for a discussion of how the domain is computed.
    142 extern const char kPageExternalLinksFreq[];
    143 // Token feature containing each external domain that is linked to.
    144 extern const char kPageLinkDomain[];
    145 // Real-valued feature: the fraction of links in the page that use https.
    146 extern const char kPageSecureLinksFreq[];
    147 
    148 ////////////////////////////////////////////////////
    149 // DOM HTML script features
    150 ////////////////////////////////////////////////////
    151 
    152 // Boolean feature, set if the page has more than 1 <script> element.
    153 extern const char kPageNumScriptTagsGTOne[];
    154 // Boolean feature, set if the page has more than 6 <script> elements.
    155 extern const char kPageNumScriptTagsGTSix[];
    156 
    157 ////////////////////////////////////////////////////
    158 // Other DOM HTML features
    159 ////////////////////////////////////////////////////
    160 
    161 // Real-valued: fraction of images whose src points to an external domain.
    162 extern const char kPageImgOtherDomainFreq[];
    163 
    164 ////////////////////////////////////////////////////
    165 // Page term features
    166 ////////////////////////////////////////////////////
    167 
    168 // Token feature for a term (whitespace-delimited) on a page.  Terms can be
    169 // single words or multi-word n-grams.  Rather than adding this feature for
    170 // every possible token on a page, only the terms that are mentioned in the
    171 // classification model are added.
    172 extern const char kPageTerm[];
    173 
    174 }  // namespace features
    175 }  // namespace safe_browsing
    176 
    177 #endif  // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
    178