1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" 6 7 #include <algorithm> 8 #include <string> 9 #include <vector> 10 11 #include "base/logging.h" 12 #include "base/metrics/histogram.h" 13 #include "base/perftimer.h" 14 #include "base/strings/string_split.h" 15 #include "base/strings/string_util.h" 16 #include "chrome/renderer/safe_browsing/features.h" 17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" 18 #include "url/gurl.h" 19 20 namespace safe_browsing { 21 22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {} 23 24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {} 25 26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url, 27 FeatureMap* features) { 28 PerfTimer timer; 29 if (url.HostIsIPAddress()) { 30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) 31 return false; 32 } else { 33 std::string host; 34 TrimString(url.host(), ".", &host); // Remove any leading/trailing dots. 35 36 // TODO(bryner): Ensure that the url encoding is consistent with 37 // the features in the model. 38 39 // Disallow unknown registries so that we don't classify 40 // partial hostnames (e.g. "www.subdomain"). 41 size_t registry_length = 42 net::registry_controlled_domains::GetRegistryLength( 43 host, 44 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, 45 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 46 47 if (registry_length == 0 || registry_length == std::string::npos) { 48 DVLOG(1) << "Could not find TLD for host: " << host; 49 return false; 50 } 51 DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but " 52 "host is only a TLD: " << host; 53 size_t tld_start = host.size() - registry_length; 54 if (!features->AddBooleanFeature(features::kUrlTldToken + 55 host.substr(tld_start))) 56 return false; 57 58 // Pull off the TLD and the preceeding dot. 59 host.erase(tld_start - 1); 60 std::vector<std::string> host_tokens; 61 base::SplitStringDontTrim(host, '.', &host_tokens); 62 // Get rid of any empty components. 63 std::vector<std::string>::iterator new_end = 64 std::remove(host_tokens.begin(), host_tokens.end(), ""); 65 host_tokens.erase(new_end, host_tokens.end()); 66 if (host_tokens.empty()) { 67 DVLOG(1) << "Could not find domain for host: " << host; 68 return false; 69 } 70 if (!features->AddBooleanFeature(features::kUrlDomainToken + 71 host_tokens.back())) 72 return false; 73 host_tokens.pop_back(); 74 75 // Now we're just left with the "other" host tokens. 76 for (std::vector<std::string>::iterator it = host_tokens.begin(); 77 it != host_tokens.end(); ++it) { 78 if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it)) 79 return false; 80 } 81 82 if (host_tokens.size() > 1) { 83 if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne)) 84 return false; 85 if (host_tokens.size() > 3) { 86 if (!features->AddBooleanFeature( 87 features::kUrlNumOtherHostTokensGTThree)) 88 return false; 89 } 90 } 91 } 92 93 std::vector<std::string> long_tokens; 94 SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens); 95 for (std::vector<std::string>::iterator it = long_tokens.begin(); 96 it != long_tokens.end(); ++it) { 97 if (!features->AddBooleanFeature(features::kUrlPathToken + *it)) 98 return false; 99 } 100 101 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed()); 102 return true; 103 } 104 105 // static 106 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens( 107 const std::string& full, 108 std::vector<std::string>* tokens) { 109 // Split on common non-alphanumerics. 110 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly. 111 static const char kTokenSeparators[] = ".,\\/_-|=%:!&"; 112 std::vector<std::string> raw_splits; 113 Tokenize(full, kTokenSeparators, &raw_splits); 114 115 // Copy over only the splits that are 3 or more chars long. 116 // TODO(bryner): Determine a meaningful min size. 117 for (std::vector<std::string>::iterator it = raw_splits.begin(); 118 it != raw_splits.end(); ++it) { 119 if (it->length() >= kMinPathComponentLength) 120 tokens->push_back(*it); 121 } 122 } 123 124 } // namespace safe_browsing 125