1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // Common types and constants for extracting and evaluating features in the 6 // client-side phishing detection model. A feature is simply a string and an 7 // associated floating-point value between 0 and 1. The phishing 8 // classification model contains rules which give an appropriate weight to each 9 // feature or combination of features. These values can then be summed to 10 // compute a final phishiness score. 11 // 12 // Some features are boolean features. If these features are set, they always 13 // have a value of 0.0 or 1.0. In practice, the features are only set if the 14 // value is true (1.0). 15 // 16 // We also use token features. These features have a unique name that is 17 // constructed from the URL or page contents that we are classifying, for 18 // example, "UrlDomain=chromium". These features are also always set to 1.0 19 // if they are present. 20 // 21 // The intermediate storage of the features for a URL is a FeatureMap, which is 22 // just a thin wrapper around a map of feature name to value. The entire set 23 // of features for a URL is extracted before we do any scoring. 24 25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 27 28 #include <string> 29 #include "base/basictypes.h" 30 #include "base/containers/hash_tables.h" 31 32 namespace safe_browsing { 33 34 // Container for a map of features to values, which enforces behavior 35 // such as a maximum number of features in the map. 36 class FeatureMap { 37 public: 38 FeatureMap(); 39 ~FeatureMap(); 40 41 // Adds a boolean feature to a FeatureMap with a value of 1.0. 42 // Returns true on success, or false if the feature map exceeds 43 // kMaxFeatureMapSize. 44 bool AddBooleanFeature(const std::string& name); 45 46 // Adds a real-valued feature to a FeatureMap with the given value. 47 // Values must always be in the range [0.0, 1.0]. Returns true on 48 // success, or false if the feature map exceeds kMaxFeatureMapSize 49 // or the value is outside of the allowed range. 50 bool AddRealFeature(const std::string& name, double value); 51 52 // Provides read-only access to the current set of features. 53 const base::hash_map<std::string, double>& features() const { 54 return features_; 55 } 56 57 // Clears the set of features in the map. 58 void Clear(); 59 60 // This is an upper bound on the number of features that will be extracted. 61 // We should never hit this cap; it is intended as a sanity check to prevent 62 // the FeatureMap from growing too large. 63 static const size_t kMaxFeatureMapSize; 64 65 private: 66 base::hash_map<std::string, double> features_; 67 68 DISALLOW_COPY_AND_ASSIGN(FeatureMap); 69 }; 70 71 namespace features { 72 // Constants for the various feature names that we use. 73 // 74 // IMPORTANT: when adding new features, you must update kAllowedFeatures in 75 // chrome/browser/safe_browsing/client_side_detection_service.cc if the feature 76 // should be sent in sanitized pingbacks. 77 78 //////////////////////////////////////////////////// 79 // URL host features 80 //////////////////////////////////////////////////// 81 82 // Set if the URL's hostname is an IP address. 83 extern const char kUrlHostIsIpAddress[]; 84 // Token feature containing the portion of the hostname controlled by a 85 // registrar, for example "com" or "co.uk". 86 extern const char kUrlTldToken[]; 87 // Token feature containing the first host component below the registrar. 88 // For example, in "www.google.com", the domain would be "google". 89 extern const char kUrlDomainToken[]; 90 // Token feature containing each host component below the domain. 91 // For example, in "www.host.example.com", both "www" and "host" would be 92 // "other host tokens". 93 extern const char kUrlOtherHostToken[]; 94 95 //////////////////////////////////////////////////// 96 // Aggregate features for URL host tokens 97 //////////////////////////////////////////////////// 98 99 // Set if the number of "other" host tokens for a URL is greater than one. 100 // Longer hostnames, regardless of the specific tokens, can be a signal that 101 // the URL is phishy. 102 extern const char kUrlNumOtherHostTokensGTOne[]; 103 // Set if the number of "other" host tokens for a URL is greater than three. 104 extern const char kUrlNumOtherHostTokensGTThree[]; 105 106 //////////////////////////////////////////////////// 107 // URL path token features 108 //////////////////////////////////////////////////// 109 110 // Token feature containing each alphanumeric string in the path that is at 111 // least 3 characters long. For example, "/abc/d/efg" would have 2 path 112 // token features, "abc" and "efg". Query parameters are not included. 113 extern const char kUrlPathToken[]; 114 115 //////////////////////////////////////////////////// 116 // DOM HTML form features 117 //////////////////////////////////////////////////// 118 119 // Set if the page has any <form> elements. 120 extern const char kPageHasForms[]; 121 // The fraction of form elements whose |action| attribute points to a 122 // URL on a different domain from the document URL. 123 extern const char kPageActionOtherDomainFreq[]; 124 125 // Set if the page has any <input type="text"> elements 126 // (includes inputs with missing or unknown types). 127 extern const char kPageHasTextInputs[]; 128 // Set if the page has any <input type="password"> elements. 129 extern const char kPageHasPswdInputs[]; 130 // Set if the page has any <input type="radio"> elements. 131 extern const char kPageHasRadioInputs[]; 132 // Set if the page has any <input type="checkbox"> elements. 133 extern const char kPageHasCheckInputs[]; 134 135 //////////////////////////////////////////////////// 136 // DOM HTML link features 137 //////////////////////////////////////////////////// 138 139 // The fraction of links in the page which point to a domain other than the 140 // domain of the document. See "URL host features" above for a discussion 141 // of how the doamin is computed. 142 extern const char kPageExternalLinksFreq[]; 143 // Token feature containing each external domain that is linked to. 144 extern const char kPageLinkDomain[]; 145 // Fraction of links in the page that use https. 146 extern const char kPageSecureLinksFreq[]; 147 148 //////////////////////////////////////////////////// 149 // DOM HTML script features 150 //////////////////////////////////////////////////// 151 152 // Set if the number of <script> elements in the page is greater than 1. 153 extern const char kPageNumScriptTagsGTOne[]; 154 // Set if the number of <script> elements in the page is greater than 6. 155 extern const char kPageNumScriptTagsGTSix[]; 156 157 //////////////////////////////////////////////////// 158 // Other DOM HTML features 159 //////////////////////////////////////////////////// 160 161 // The fraction of images whose src attribute points to an external domain. 162 extern const char kPageImgOtherDomainFreq[]; 163 164 //////////////////////////////////////////////////// 165 // Page term features 166 //////////////////////////////////////////////////// 167 168 // Token feature for a term (whitespace-delimited) on a page. Terms can be 169 // single words or multi-word n-grams. Rather than adding this feature for 170 // every possible token on a page, only the terms that are mentioned in the 171 // classification model are added. 172 extern const char kPageTerm[]; 173 174 } // namespace features 175 } // namepsace safe_browsing 176 177 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ 178