Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
      6 
      7 #include <string>
      8 
      9 #include "base/bind.h"
     10 #include "base/callback.h"
     11 #include "base/containers/hash_tables.h"
     12 #include "base/memory/scoped_ptr.h"
     13 #include "base/message_loop/message_loop.h"
     14 #include "base/strings/string16.h"
     15 #include "base/strings/stringprintf.h"
     16 #include "base/strings/utf_string_conversions.h"
     17 #include "base/time/time.h"
     18 #include "chrome/renderer/safe_browsing/features.h"
     19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
     20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
     21 #include "chrome/renderer/safe_browsing/test_utils.h"
     22 #include "crypto/sha2.h"
     23 #include "testing/gmock/include/gmock/gmock.h"
     24 #include "testing/gtest/include/gtest/gtest.h"
     25 
     26 using base::ASCIIToUTF16;
     27 using ::testing::Return;
     28 
     29 
// Seed shared by the test and the extractor under test: it is passed to the
// PhishingTermFeatureExtractor constructor and to every MurmurHash3String()
// call that builds the word hashes and the expected shingle hashes.
static const uint32 kMurmurHash3Seed = 2777808611U;
     31 
     32 namespace safe_browsing {
     33 
     34 class PhishingTermFeatureExtractorTest : public ::testing::Test {
     35  protected:
     36   virtual void SetUp() {
     37     base::hash_set<std::string> terms;
     38     terms.insert("one");
     39     terms.insert("one one");
     40     terms.insert("two");
     41     terms.insert("multi word test");
     42     terms.insert("capitalization");
     43     terms.insert("space");
     44     terms.insert("separator");
     45     terms.insert("punctuation");
     46     // Chinese (translation of "hello")
     47     terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
     48     // Chinese (translation of "goodbye")
     49     terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
     50 
     51     for (base::hash_set<std::string>::iterator it = terms.begin();
     52          it != terms.end(); ++it) {
     53       term_hashes_.insert(crypto::SHA256HashString(*it));
     54     }
     55 
     56     base::hash_set<std::string> words;
     57     words.insert("one");
     58     words.insert("two");
     59     words.insert("multi");
     60     words.insert("word");
     61     words.insert("test");
     62     words.insert("capitalization");
     63     words.insert("space");
     64     words.insert("separator");
     65     words.insert("punctuation");
     66     words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
     67     words.insert("\xe5\x86\x8d\xe8\xa7\x81");
     68 
     69     for (base::hash_set<std::string>::iterator it = words.begin();
     70          it != words.end(); ++it) {
     71       word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
     72     }
     73 
     74     ResetExtractor(3 /* max shingles per page */);
     75   }
     76 
     77   void ResetExtractor(size_t max_shingles_per_page) {
     78     extractor_.reset(new PhishingTermFeatureExtractor(
     79         &term_hashes_,
     80         &word_hashes_,
     81         3 /* max_words_per_term */,
     82         kMurmurHash3Seed,
     83         max_shingles_per_page,
     84         4 /* shingle_size */,
     85         &clock_));
     86   }
     87 
     88   // Runs the TermFeatureExtractor on |page_text|, waiting for the
     89   // completion callback.  Returns the success boolean from the callback.
     90   bool ExtractFeatures(const base::string16* page_text,
     91                        FeatureMap* features,
     92                        std::set<uint32>* shingle_hashes) {
     93     success_ = false;
     94     extractor_->ExtractFeatures(
     95         page_text,
     96         features,
     97         shingle_hashes,
     98         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
     99                    base::Unretained(this)));
    100     msg_loop_.Run();
    101     return success_;
    102   }
    103 
    104   void PartialExtractFeatures(const base::string16* page_text,
    105                               FeatureMap* features,
    106                               std::set<uint32>* shingle_hashes) {
    107     extractor_->ExtractFeatures(
    108         page_text,
    109         features,
    110         shingle_hashes,
    111         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
    112                    base::Unretained(this)));
    113     msg_loop_.PostTask(
    114         FROM_HERE,
    115         base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
    116                    base::Unretained(this)));
    117     msg_loop_.RunUntilIdle();
    118   }
    119 
    120   // Completion callback for feature extraction.
    121   void ExtractionDone(bool success) {
    122     success_ = success;
    123     msg_loop_.Quit();
    124   }
    125 
    126   void QuitExtraction() {
    127     extractor_->CancelPendingExtraction();
    128     msg_loop_.Quit();
    129   }
    130 
    131   base::MessageLoop msg_loop_;
    132   MockFeatureExtractorClock clock_;
    133   scoped_ptr<PhishingTermFeatureExtractor> extractor_;
    134   base::hash_set<std::string> term_hashes_;
    135   base::hash_set<uint32> word_hashes_;
    136   bool success_;  // holds the success value from ExtractFeatures
    137 };
    138 
    139 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
    140   // This test doesn't exercise the extraction timing.
    141   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
    142 
    143   base::string16 page_text = ASCIIToUTF16("blah");
    144   FeatureMap expected_features;  // initially empty
    145   std::set<uint32> expected_shingle_hashes;
    146 
    147   FeatureMap features;
    148   std::set<uint32> shingle_hashes;
    149   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    150   ExpectFeatureMapsAreEqual(features, expected_features);
    151   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    152 
    153   page_text = ASCIIToUTF16("one one");
    154   expected_features.Clear();
    155   expected_features.AddBooleanFeature(features::kPageTerm +
    156                                       std::string("one"));
    157   expected_features.AddBooleanFeature(features::kPageTerm +
    158                                       std::string("one one"));
    159   expected_shingle_hashes.clear();
    160 
    161   features.Clear();
    162   shingle_hashes.clear();
    163   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    164   ExpectFeatureMapsAreEqual(features, expected_features);
    165   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    166 
    167   page_text = ASCIIToUTF16("bla bla multi word test bla");
    168   expected_features.Clear();
    169   expected_features.AddBooleanFeature(features::kPageTerm +
    170                                       std::string("multi word test"));
    171   expected_shingle_hashes.clear();
    172   expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
    173                                                    kMurmurHash3Seed));
    174   expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
    175                                                    kMurmurHash3Seed));
    176   expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
    177                                                    kMurmurHash3Seed));
    178 
    179   features.Clear();
    180   shingle_hashes.clear();
    181   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    182   ExpectFeatureMapsAreEqual(features, expected_features);
    183   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    184 
    185   // This text has all of the words for one of the terms, but they are
    186   // not in the correct order.
    187   page_text = ASCIIToUTF16("bla bla test word multi bla");
    188   expected_features.Clear();
    189   expected_shingle_hashes.clear();
    190   expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
    191                                                    kMurmurHash3Seed));
    192   expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
    193                                                    kMurmurHash3Seed));
    194   expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
    195                                                    kMurmurHash3Seed));
    196 
    197   features.Clear();
    198   shingle_hashes.clear();
    199   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    200   ExpectFeatureMapsAreEqual(features, expected_features);
    201   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    202 
    203   // Test various separators.
    204   page_text = ASCIIToUTF16("Capitalization plus non-space\n"
    205                            "separator... punctuation!");
    206   expected_features.Clear();
    207   expected_features.AddBooleanFeature(features::kPageTerm +
    208                                       std::string("capitalization"));
    209   expected_features.AddBooleanFeature(features::kPageTerm +
    210                                       std::string("space"));
    211   expected_features.AddBooleanFeature(features::kPageTerm +
    212                                       std::string("separator"));
    213   expected_features.AddBooleanFeature(features::kPageTerm +
    214                                       std::string("punctuation"));
    215   expected_shingle_hashes.clear();
    216   expected_shingle_hashes.insert(
    217       MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
    218   expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
    219                                                    kMurmurHash3Seed));
    220   expected_shingle_hashes.insert(
    221       MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
    222 
    223   features.Clear();
    224   shingle_hashes.clear();
    225   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    226   ExpectFeatureMapsAreEqual(features, expected_features);
    227   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    228 
    229   // Test a page with too many words and we should only 3 minimum hashes.
    230   page_text = ASCIIToUTF16("This page has way too many words.");
    231   expected_features.Clear();
    232   expected_shingle_hashes.clear();
    233   expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
    234                                                    kMurmurHash3Seed));
    235   expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
    236                                                    kMurmurHash3Seed));
    237   expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
    238                                                    kMurmurHash3Seed));
    239   expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
    240                                                    kMurmurHash3Seed));
    241   std::set<uint32>::iterator it = expected_shingle_hashes.end();
    242   expected_shingle_hashes.erase(--it);
    243 
    244   features.Clear();
    245   shingle_hashes.clear();
    246   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    247   ExpectFeatureMapsAreEqual(features, expected_features);
    248   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    249 
    250   // Test with empty page text.
    251   page_text = base::string16();
    252   expected_features.Clear();
    253   expected_shingle_hashes.clear();
    254   features.Clear();
    255   shingle_hashes.clear();
    256   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    257   ExpectFeatureMapsAreEqual(features, expected_features);
    258   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    259 
    260 #if !defined(OS_ANDROID)
    261   // The test code is disabled due to http://crbug.com/392234
    262   // The client-side detection feature is not enabled on Android yet.
    263   // If we decided to enable the feature, we need to fix the bug first.
    264 
    265   // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
    266   // that we can correctly separate terms in languages that don't use spaces.
    267   page_text =
    268       base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
    269                         "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
    270   expected_features.Clear();
    271   expected_features.AddBooleanFeature(
    272       features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
    273   expected_features.AddBooleanFeature(
    274       features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
    275   expected_shingle_hashes.clear();
    276   expected_shingle_hashes.insert(MurmurHash3String(
    277       "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
    278       "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
    279 
    280   features.Clear();
    281   shingle_hashes.clear();
    282   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    283   ExpectFeatureMapsAreEqual(features, expected_features);
    284   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    285 #endif
    286 }
    287 
    288 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
    289   // For this test, we'll cause the feature extraction to run multiple
    290   // iterations by incrementing the clock.
    291   ResetExtractor(200 /* max shingles per page */);
    292 
    293   // This page has a total of 30 words.  For the features to be computed
    294   // correctly, the extractor has to process the entire string of text.
    295   base::string16 page_text(ASCIIToUTF16("one "));
    296   for (int i = 0; i < 28; ++i) {
    297     page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
    298   }
    299   page_text.append(ASCIIToUTF16("two"));
    300 
    301   // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
    302   // Note that this assumes kClockCheckGranularity = 5 and
    303   // kMaxTimePerChunkMs = 10.
    304   base::TimeTicks now = base::TimeTicks::Now();
    305   EXPECT_CALL(clock_, Now())
    306       // Time check at the start of extraction.
    307       .WillOnce(Return(now))
    308       // Time check at the start of the first chunk of work.
    309       .WillOnce(Return(now))
    310       // Time check after the first 5 words.
    311       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
    312       // Time check after the next 5 words.
    313       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
    314       // Time check after the next 5 words.
    315       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
    316       // Time check after the next 5 words.  This is over the chunk
    317       // time limit, so a continuation task will be posted.
    318       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
    319       // Time check at the start of the second chunk of work.
    320       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
    321       // Time check after the next 5 words.
    322       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
    323       // Time check after the next 5 words.
    324       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
    325       // A final check for the histograms.
    326       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
    327 
    328   FeatureMap expected_features;
    329   expected_features.AddBooleanFeature(features::kPageTerm +
    330                                       std::string("one"));
    331   expected_features.AddBooleanFeature(features::kPageTerm +
    332                                       std::string("two"));
    333   std::set<uint32> expected_shingle_hashes;
    334   expected_shingle_hashes.insert(
    335       MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
    336   expected_shingle_hashes.insert(
    337       MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
    338   expected_shingle_hashes.insert(
    339       MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
    340   expected_shingle_hashes.insert(
    341       MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
    342   expected_shingle_hashes.insert(
    343       MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
    344   expected_shingle_hashes.insert(
    345       MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
    346   expected_shingle_hashes.insert(
    347       MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
    348   expected_shingle_hashes.insert(
    349       MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
    350   expected_shingle_hashes.insert(
    351       MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
    352   expected_shingle_hashes.insert(
    353       MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
    354   expected_shingle_hashes.insert(
    355       MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
    356   expected_shingle_hashes.insert(
    357       MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
    358   expected_shingle_hashes.insert(
    359       MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
    360   expected_shingle_hashes.insert(
    361       MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
    362   expected_shingle_hashes.insert(
    363       MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
    364   expected_shingle_hashes.insert(
    365       MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
    366   expected_shingle_hashes.insert(
    367       MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
    368   expected_shingle_hashes.insert(
    369       MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
    370   expected_shingle_hashes.insert(
    371       MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
    372   expected_shingle_hashes.insert(
    373       MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
    374   expected_shingle_hashes.insert(
    375       MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
    376   expected_shingle_hashes.insert(
    377       MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
    378   expected_shingle_hashes.insert(
    379       MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
    380   expected_shingle_hashes.insert(
    381       MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
    382   expected_shingle_hashes.insert(
    383       MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
    384   expected_shingle_hashes.insert(
    385       MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
    386   expected_shingle_hashes.insert(
    387       MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
    388 
    389   FeatureMap features;
    390   std::set<uint32> shingle_hashes;
    391   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    392   ExpectFeatureMapsAreEqual(features, expected_features);
    393   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
    394   // Make sure none of the mock expectations carry over to the next test.
    395   ::testing::Mock::VerifyAndClearExpectations(&clock_);
    396 
    397   // Now repeat the test with the same text, but advance the clock faster so
    398   // that the extraction time exceeds the maximum total time for the feature
    399   // extractor.  Extraction should fail.  Note that this assumes
    400   // kMaxTotalTimeMs = 500.
    401   EXPECT_CALL(clock_, Now())
    402       // Time check at the start of extraction.
    403       .WillOnce(Return(now))
    404       // Time check at the start of the first chunk of work.
    405       .WillOnce(Return(now))
    406       // Time check after the first 5 words,
    407       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
    408       // Time check at the start of the second chunk of work.
    409       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
    410       // Time check after the next 5 words.  This is over the limit.
    411       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
    412       // A final time check for the histograms.
    413       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
    414 
    415   features.Clear();
    416   shingle_hashes.clear();
    417   EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
    418 }
    419 
    420 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
    421   scoped_ptr<base::string16> page_text(
    422       new base::string16(ASCIIToUTF16("one ")));
    423   for (int i = 0; i < 28; ++i) {
    424     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
    425   }
    426 
    427   base::TimeTicks now = base::TimeTicks::Now();
    428   EXPECT_CALL(clock_, Now())
    429       // Time check at the start of extraction.
    430       .WillOnce(Return(now))
    431       // Time check at the start of the first chunk of work.
    432       .WillOnce(Return(now))
    433       // Time check after the first 5 words.
    434       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
    435       // Time check after the next 5 words. This should be greater than
    436       // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
    437       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
    438 
    439   FeatureMap features;
    440   std::set<uint32> shingle_hashes;
    441   // Extract first 10 words then stop.
    442   PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
    443 
    444   page_text.reset(new base::string16());
    445   for (int i = 30; i < 58; ++i) {
    446     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
    447   }
    448   page_text->append(ASCIIToUTF16("multi word test "));
    449   features.Clear();
    450   shingle_hashes.clear();
    451 
    452   // This part doesn't exercise the extraction timing.
    453   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
    454 
    455   // Now extract normally and make sure nothing breaks.
    456   EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
    457 
    458   FeatureMap expected_features;
    459   expected_features.AddBooleanFeature(features::kPageTerm +
    460                                       std::string("multi word test"));
    461   ExpectFeatureMapsAreEqual(features, expected_features);
    462 }
    463 
    464 }  // namespace safe_browsing
    465