Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
      6 
      7 #include <algorithm>
      8 #include <string>
      9 #include <vector>
     10 
     11 #include "base/logging.h"
     12 #include "base/metrics/histogram.h"
     13 #include "base/perftimer.h"
     14 #include "base/strings/string_split.h"
     15 #include "base/strings/string_util.h"
     16 #include "chrome/renderer/safe_browsing/features.h"
     17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
     18 #include "url/gurl.h"
     19 
     20 namespace safe_browsing {
     21 
     22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
     23 
     24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
     25 
     26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,
     27                                                   FeatureMap* features) {
     28   PerfTimer timer;
     29   if (url.HostIsIPAddress()) {
     30     if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress))
     31       return false;
     32   } else {
     33     std::string host;
     34     TrimString(url.host(), ".", &host);  // Remove any leading/trailing dots.
     35 
     36     // TODO(bryner): Ensure that the url encoding is consistent with
     37     // the features in the model.
     38 
     39     // Disallow unknown registries so that we don't classify
     40     // partial hostnames (e.g. "www.subdomain").
     41     size_t registry_length =
     42         net::registry_controlled_domains::GetRegistryLength(
     43             host,
     44             net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
     45             net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
     46 
     47     if (registry_length == 0 || registry_length == std::string::npos) {
     48       DVLOG(1) << "Could not find TLD for host: " << host;
     49       return false;
     50     }
     51     DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but "
     52         "host is only a TLD: " << host;
     53     size_t tld_start = host.size() - registry_length;
     54     if (!features->AddBooleanFeature(features::kUrlTldToken +
     55                                      host.substr(tld_start)))
     56       return false;
     57 
     58     // Pull off the TLD and the preceeding dot.
     59     host.erase(tld_start - 1);
     60     std::vector<std::string> host_tokens;
     61     base::SplitStringDontTrim(host, '.', &host_tokens);
     62     // Get rid of any empty components.
     63     std::vector<std::string>::iterator new_end =
     64         std::remove(host_tokens.begin(), host_tokens.end(), "");
     65     host_tokens.erase(new_end, host_tokens.end());
     66     if (host_tokens.empty()) {
     67       DVLOG(1) << "Could not find domain for host: " << host;
     68       return false;
     69     }
     70     if (!features->AddBooleanFeature(features::kUrlDomainToken +
     71                                      host_tokens.back()))
     72       return false;
     73     host_tokens.pop_back();
     74 
     75     // Now we're just left with the "other" host tokens.
     76     for (std::vector<std::string>::iterator it = host_tokens.begin();
     77          it != host_tokens.end(); ++it) {
     78       if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it))
     79         return false;
     80     }
     81 
     82     if (host_tokens.size() > 1) {
     83       if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne))
     84         return false;
     85       if (host_tokens.size() > 3) {
     86         if (!features->AddBooleanFeature(
     87                 features::kUrlNumOtherHostTokensGTThree))
     88           return false;
     89       }
     90     }
     91   }
     92 
     93   std::vector<std::string> long_tokens;
     94   SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);
     95   for (std::vector<std::string>::iterator it = long_tokens.begin();
     96        it != long_tokens.end(); ++it) {
     97     if (!features->AddBooleanFeature(features::kUrlPathToken + *it))
     98       return false;
     99   }
    100 
    101   UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());
    102   return true;
    103 }
    104 
    105 // static
    106 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
    107     const std::string& full,
    108     std::vector<std::string>* tokens) {
    109   // Split on common non-alphanumerics.
    110   // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
    111   static const char kTokenSeparators[] = ".,\\/_-|=%:!&";
    112   std::vector<std::string> raw_splits;
    113   Tokenize(full, kTokenSeparators, &raw_splits);
    114 
    115   // Copy over only the splits that are 3 or more chars long.
    116   // TODO(bryner): Determine a meaningful min size.
    117   for (std::vector<std::string>::iterator it = raw_splits.begin();
    118        it != raw_splits.end(); ++it) {
    119     if (it->length() >= kMinPathComponentLength)
    120       tokens->push_back(*it);
    121   }
    122 }
    123 
    124 }  // namespace safe_browsing
    125