Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
      6 
      7 #include <string>
      8 #include <vector>
      9 #include "chrome/renderer/safe_browsing/features.h"
     10 #include "chrome/renderer/safe_browsing/test_utils.h"
     11 #include "testing/gmock/include/gmock/gmock.h"
     12 #include "testing/gtest/include/gtest/gtest.h"
     13 #include "url/gurl.h"
     14 
     15 using ::testing::ElementsAre;
     16 
     17 namespace safe_browsing {
     18 
     19 class PhishingUrlFeatureExtractorTest : public ::testing::Test {
     20  protected:
     21   PhishingUrlFeatureExtractor extractor_;
     22 
     23   void SplitStringIntoLongAlphanumTokens(const std::string& full,
     24                                          std::vector<std::string>* tokens) {
     25     PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full,
     26                                                                    tokens);
     27   }
     28 };
     29 
     30 TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) {
     31   std::string url = "http://123.0.0.1/mydocuments/a.file.html";
     32   FeatureMap expected_features;
     33   expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress);
     34   expected_features.AddBooleanFeature(features::kUrlPathToken +
     35                                       std::string("mydocuments"));
     36   expected_features.AddBooleanFeature(features::kUrlPathToken +
     37                                       std::string("file"));
     38   expected_features.AddBooleanFeature(features::kUrlPathToken +
     39                                       std::string("html"));
     40 
     41   FeatureMap features;
     42   ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
     43   ExpectFeatureMapsAreEqual(features, expected_features);
     44 
     45   url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear";
     46   expected_features.Clear();
     47   expected_features.AddBooleanFeature(features::kUrlTldToken +
     48                                       std::string("co.uk"));
     49   expected_features.AddBooleanFeature(features::kUrlDomainToken +
     50                                       std::string("cnn"));
     51   expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
     52                                       std::string("www"));
     53   expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne);
     54   expected_features.AddBooleanFeature(features::kUrlPathToken +
     55                                       std::string("sports"));
     56   expected_features.AddBooleanFeature(features::kUrlPathToken +
     57                                       std::string("index"));
     58   expected_features.AddBooleanFeature(features::kUrlPathToken +
     59                                       std::string("html"));
     60 
     61   features.Clear();
     62   ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
     63   ExpectFeatureMapsAreEqual(features, expected_features);
     64 
     65   url = "http://justadomain.com/";
     66   expected_features.Clear();
     67   expected_features.AddBooleanFeature(features::kUrlTldToken +
     68                                       std::string("com"));
     69   expected_features.AddBooleanFeature(features::kUrlDomainToken +
     70                                       std::string("justadomain"));
     71 
     72   features.Clear();
     73   ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
     74   ExpectFeatureMapsAreEqual(features, expected_features);
     75 
     76   url = "http://witharef.com/#abc";
     77   expected_features.Clear();
     78   expected_features.AddBooleanFeature(features::kUrlTldToken +
     79                                       std::string("com"));
     80   expected_features.AddBooleanFeature(features::kUrlDomainToken +
     81                                       std::string("witharef"));
     82 
     83   features.Clear();
     84   ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
     85   ExpectFeatureMapsAreEqual(features, expected_features);
     86 
     87   url = "http://...www..lotsodots....com./";
     88   expected_features.Clear();
     89   expected_features.AddBooleanFeature(features::kUrlTldToken +
     90                                       std::string("com"));
     91   expected_features.AddBooleanFeature(features::kUrlDomainToken +
     92                                       std::string("lotsodots"));
     93   expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
     94                                       std::string("www"));
     95 
     96   features.Clear();
     97   ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
     98   ExpectFeatureMapsAreEqual(features, expected_features);
     99 
    100   url = "http://unrecognized.tld/";
    101   EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
    102 
    103   url = "http://com/123";
    104   EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
    105 
    106   url = "http://.co.uk/";
    107   EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
    108 
    109   url = "file:///nohost.txt";
    110   EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
    111 
    112   url = "not:valid:at:all";
    113   EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
    114 }
    115 
    116 TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) {
    117   std::string full = "This.is/a_pretty\\unusual-!path,indeed";
    118   std::vector<std::string> long_tokens;
    119   SplitStringIntoLongAlphanumTokens(full, &long_tokens);
    120   EXPECT_THAT(long_tokens,
    121               ElementsAre("This", "pretty", "unusual", "path", "indeed"));
    122 
    123   long_tokens.clear();
    124   full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20";
    125   SplitStringIntoLongAlphanumTokens(full, &long_tokens);
    126   EXPECT_THAT(long_tokens, ElementsAre());
    127 }
    128 
    129 }  // namespace safe_browsing
    130