Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
      6 
      7 #include <string>
      8 
      9 #include "base/bind.h"
     10 #include "base/callback.h"
     11 #include "base/containers/hash_tables.h"
     12 #include "base/memory/scoped_ptr.h"
     13 #include "base/message_loop/message_loop.h"
     14 #include "base/strings/string16.h"
     15 #include "base/strings/stringprintf.h"
     16 #include "base/strings/utf_string_conversions.h"
     17 #include "base/time/time.h"
     18 #include "chrome/renderer/safe_browsing/features.h"
     19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
     20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
     21 #include "chrome/renderer/safe_browsing/test_utils.h"
     22 #include "crypto/sha2.h"
     23 #include "testing/gmock/include/gmock/gmock.h"
     24 #include "testing/gtest/include/gtest/gtest.h"
     25 
     26 using ::testing::Return;
     27 
     28 namespace safe_browsing {
     29 
     30 class PhishingTermFeatureExtractorTest : public ::testing::Test {
     31  protected:
     32   virtual void SetUp() {
     33     base::hash_set<std::string> terms;
     34     terms.insert("one");
     35     terms.insert("one one");
     36     terms.insert("two");
     37     terms.insert("multi word test");
     38     terms.insert("capitalization");
     39     terms.insert("space");
     40     terms.insert("separator");
     41     terms.insert("punctuation");
     42     // Chinese (translation of "hello")
     43     terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
     44     // Chinese (translation of "goodbye")
     45     terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
     46 
     47     for (base::hash_set<std::string>::iterator it = terms.begin();
     48          it != terms.end(); ++it) {
     49       term_hashes_.insert(crypto::SHA256HashString(*it));
     50     }
     51 
     52     base::hash_set<std::string> words;
     53     words.insert("one");
     54     words.insert("two");
     55     words.insert("multi");
     56     words.insert("word");
     57     words.insert("test");
     58     words.insert("capitalization");
     59     words.insert("space");
     60     words.insert("separator");
     61     words.insert("punctuation");
     62     words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
     63     words.insert("\xe5\x86\x8d\xe8\xa7\x81");
     64 
     65     static const uint32 kMurmurHash3Seed = 2777808611U;
     66     for (base::hash_set<std::string>::iterator it = words.begin();
     67          it != words.end(); ++it) {
     68       word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
     69     }
     70 
     71     extractor_.reset(new PhishingTermFeatureExtractor(
     72         &term_hashes_,
     73         &word_hashes_,
     74         3 /* max_words_per_term */,
     75         kMurmurHash3Seed,
     76         &clock_));
     77   }
     78 
     79   // Runs the TermFeatureExtractor on |page_text|, waiting for the
     80   // completion callback.  Returns the success boolean from the callback.
     81   bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
     82     success_ = false;
     83     extractor_->ExtractFeatures(
     84         page_text,
     85         features,
     86         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
     87                    base::Unretained(this)));
     88     msg_loop_.Run();
     89     return success_;
     90   }
     91 
     92   void PartialExtractFeatures(const string16* page_text, FeatureMap* features) {
     93     extractor_->ExtractFeatures(
     94         page_text,
     95         features,
     96         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
     97                    base::Unretained(this)));
     98     msg_loop_.PostTask(
     99         FROM_HERE,
    100         base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
    101                    base::Unretained(this)));
    102     msg_loop_.RunUntilIdle();
    103   }
    104 
    105   // Completion callback for feature extraction.
    106   void ExtractionDone(bool success) {
    107     success_ = success;
    108     msg_loop_.Quit();
    109   }
    110 
    111   void QuitExtraction() {
    112     extractor_->CancelPendingExtraction();
    113     msg_loop_.Quit();
    114   }
    115 
    116   base::MessageLoop msg_loop_;
    117   MockFeatureExtractorClock clock_;
    118   scoped_ptr<PhishingTermFeatureExtractor> extractor_;
    119   base::hash_set<std::string> term_hashes_;
    120   base::hash_set<uint32> word_hashes_;
    121   bool success_;  // holds the success value from ExtractFeatures
    122 };
    123 
    124 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
    125   // This test doesn't exercise the extraction timing.
    126   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
    127 
    128   string16 page_text = ASCIIToUTF16("blah");
    129   FeatureMap expected_features;  // initially empty
    130 
    131   FeatureMap features;
    132   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    133   ExpectFeatureMapsAreEqual(features, expected_features);
    134 
    135   page_text = ASCIIToUTF16("one one");
    136   expected_features.Clear();
    137   expected_features.AddBooleanFeature(features::kPageTerm +
    138                                       std::string("one"));
    139   expected_features.AddBooleanFeature(features::kPageTerm +
    140                                       std::string("one one"));
    141 
    142   features.Clear();
    143   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    144   ExpectFeatureMapsAreEqual(features, expected_features);
    145 
    146   page_text = ASCIIToUTF16("bla bla multi word test bla");
    147   expected_features.Clear();
    148   expected_features.AddBooleanFeature(features::kPageTerm +
    149                                       std::string("multi word test"));
    150 
    151   features.Clear();
    152   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    153   ExpectFeatureMapsAreEqual(features, expected_features);
    154 
    155   // This text has all of the words for one of the terms, but they are
    156   // not in the correct order.
    157   page_text = ASCIIToUTF16("bla bla test word multi bla");
    158   expected_features.Clear();
    159 
    160   features.Clear();
    161   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    162   ExpectFeatureMapsAreEqual(features, expected_features);
    163 
    164   page_text = ASCIIToUTF16("Capitalization plus non-space\n"
    165                            "separator... punctuation!");
    166   expected_features.Clear();
    167   expected_features.AddBooleanFeature(features::kPageTerm +
    168                                       std::string("capitalization"));
    169   expected_features.AddBooleanFeature(features::kPageTerm +
    170                                       std::string("space"));
    171   expected_features.AddBooleanFeature(features::kPageTerm +
    172                                       std::string("separator"));
    173   expected_features.AddBooleanFeature(features::kPageTerm +
    174                                       std::string("punctuation"));
    175 
    176   features.Clear();
    177   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    178   ExpectFeatureMapsAreEqual(features, expected_features);
    179 
    180   // Test with empty page text.
    181   page_text = string16();
    182   expected_features.Clear();
    183   features.Clear();
    184   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    185   ExpectFeatureMapsAreEqual(features, expected_features);
    186 
    187   // Chinese translation of the phrase "hello goodbye". This tests that
    188   // we can correctly separate terms in languages that don't use spaces.
    189   page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
    190   expected_features.Clear();
    191   expected_features.AddBooleanFeature(
    192       features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
    193   expected_features.AddBooleanFeature(
    194       features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
    195 
    196   features.Clear();
    197   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    198   ExpectFeatureMapsAreEqual(features, expected_features);
    199 }
    200 
    201 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
    202   // For this test, we'll cause the feature extraction to run multiple
    203   // iterations by incrementing the clock.
    204 
    205   // This page has a total of 30 words.  For the features to be computed
    206   // correctly, the extractor has to process the entire string of text.
    207   string16 page_text(ASCIIToUTF16("one "));
    208   for (int i = 0; i < 28; ++i) {
    209     page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
    210   }
    211   page_text.append(ASCIIToUTF16("two"));
    212 
    213   // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
    214   // Note that this assumes kClockCheckGranularity = 5 and
    215   // kMaxTimePerChunkMs = 10.
    216   base::TimeTicks now = base::TimeTicks::Now();
    217   EXPECT_CALL(clock_, Now())
    218       // Time check at the start of extraction.
    219       .WillOnce(Return(now))
    220       // Time check at the start of the first chunk of work.
    221       .WillOnce(Return(now))
    222       // Time check after the first 5 words.
    223       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
    224       // Time check after the next 5 words.
    225       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
    226       // Time check after the next 5 words.
    227       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
    228       // Time check after the next 5 words.  This is over the chunk
    229       // time limit, so a continuation task will be posted.
    230       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
    231       // Time check at the start of the second chunk of work.
    232       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
    233       // Time check after the next 5 words.
    234       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
    235       // Time check after the next 5 words.
    236       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
    237       // A final check for the histograms.
    238       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
    239 
    240   FeatureMap expected_features;
    241   expected_features.AddBooleanFeature(features::kPageTerm +
    242                                       std::string("one"));
    243   expected_features.AddBooleanFeature(features::kPageTerm +
    244                                       std::string("two"));
    245 
    246   FeatureMap features;
    247   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
    248   ExpectFeatureMapsAreEqual(features, expected_features);
    249   // Make sure none of the mock expectations carry over to the next test.
    250   ::testing::Mock::VerifyAndClearExpectations(&clock_);
    251 
    252   // Now repeat the test with the same text, but advance the clock faster so
    253   // that the extraction time exceeds the maximum total time for the feature
    254   // extractor.  Extraction should fail.  Note that this assumes
    255   // kMaxTotalTimeMs = 500.
    256   EXPECT_CALL(clock_, Now())
    257       // Time check at the start of extraction.
    258       .WillOnce(Return(now))
    259       // Time check at the start of the first chunk of work.
    260       .WillOnce(Return(now))
    261       // Time check after the first 5 words,
    262       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
    263       // Time check at the start of the second chunk of work.
    264       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
    265       // Time check after the next 5 words.  This is over the limit.
    266       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
    267       // A final time check for the histograms.
    268       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
    269 
    270   features.Clear();
    271   EXPECT_FALSE(ExtractFeatures(&page_text, &features));
    272 }
    273 
    274 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
    275   scoped_ptr<string16> page_text(new string16(ASCIIToUTF16("one ")));
    276   for (int i = 0; i < 28; ++i) {
    277     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
    278   }
    279 
    280   base::TimeTicks now = base::TimeTicks::Now();
    281   EXPECT_CALL(clock_, Now())
    282       // Time check at the start of extraction.
    283       .WillOnce(Return(now))
    284       // Time check at the start of the first chunk of work.
    285       .WillOnce(Return(now))
    286       // Time check after the first 5 words.
    287       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
    288       // Time check after the next 5 words. This should be greater than
    289       // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
    290       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
    291 
    292   FeatureMap features;
    293   // Extract first 10 words then stop.
    294   PartialExtractFeatures(page_text.get(), &features);
    295 
    296   page_text.reset(new string16());
    297   for (int i = 30; i < 58; ++i) {
    298     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
    299   }
    300   page_text->append(ASCIIToUTF16("multi word test "));
    301   features.Clear();
    302 
    303   // This part doesn't exercise the extraction timing.
    304   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
    305 
    306   // Now extract normally and make sure nothing breaks.
    307   EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
    308 
    309   FeatureMap expected_features;
    310   expected_features.AddBooleanFeature(features::kPageTerm +
    311                                       std::string("multi word test"));
    312   ExpectFeatureMapsAreEqual(features, expected_features);
    313 }
    314 
    315 }  // namespace safe_browsing
    316