1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 6 7 #include <string> 8 9 #include "base/bind.h" 10 #include "base/callback.h" 11 #include "base/containers/hash_tables.h" 12 #include "base/memory/scoped_ptr.h" 13 #include "base/message_loop/message_loop.h" 14 #include "base/strings/string16.h" 15 #include "base/strings/stringprintf.h" 16 #include "base/strings/utf_string_conversions.h" 17 #include "base/time/time.h" 18 #include "chrome/renderer/safe_browsing/features.h" 19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h" 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" 21 #include "chrome/renderer/safe_browsing/test_utils.h" 22 #include "crypto/sha2.h" 23 #include "testing/gmock/include/gmock/gmock.h" 24 #include "testing/gtest/include/gtest/gtest.h" 25 26 using ::testing::Return; 27 28 namespace safe_browsing { 29 30 class PhishingTermFeatureExtractorTest : public ::testing::Test { 31 protected: 32 virtual void SetUp() { 33 base::hash_set<std::string> terms; 34 terms.insert("one"); 35 terms.insert("one one"); 36 terms.insert("two"); 37 terms.insert("multi word test"); 38 terms.insert("capitalization"); 39 terms.insert("space"); 40 terms.insert("separator"); 41 terms.insert("punctuation"); 42 // Chinese (translation of "hello") 43 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); 44 // Chinese (translation of "goodbye") 45 terms.insert("\xe5\x86\x8d\xe8\xa7\x81"); 46 47 for (base::hash_set<std::string>::iterator it = terms.begin(); 48 it != terms.end(); ++it) { 49 term_hashes_.insert(crypto::SHA256HashString(*it)); 50 } 51 52 base::hash_set<std::string> words; 53 words.insert("one"); 54 words.insert("two"); 55 words.insert("multi"); 56 words.insert("word"); 57 words.insert("test"); 58 words.insert("capitalization"); 59 words.insert("space"); 60 words.insert("separator"); 61 words.insert("punctuation"); 62 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); 63 words.insert("\xe5\x86\x8d\xe8\xa7\x81"); 64 65 static const uint32 kMurmurHash3Seed = 2777808611U; 66 for (base::hash_set<std::string>::iterator it = words.begin(); 67 it != words.end(); ++it) { 68 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed)); 69 } 70 71 extractor_.reset(new PhishingTermFeatureExtractor( 72 &term_hashes_, 73 &word_hashes_, 74 3 /* max_words_per_term */, 75 kMurmurHash3Seed, 76 &clock_)); 77 } 78 79 // Runs the TermFeatureExtractor on |page_text|, waiting for the 80 // completion callback. Returns the success boolean from the callback. 81 bool ExtractFeatures(const string16* page_text, FeatureMap* features) { 82 success_ = false; 83 extractor_->ExtractFeatures( 84 page_text, 85 features, 86 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, 87 base::Unretained(this))); 88 msg_loop_.Run(); 89 return success_; 90 } 91 92 void PartialExtractFeatures(const string16* page_text, FeatureMap* features) { 93 extractor_->ExtractFeatures( 94 page_text, 95 features, 96 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone, 97 base::Unretained(this))); 98 msg_loop_.PostTask( 99 FROM_HERE, 100 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction, 101 base::Unretained(this))); 102 msg_loop_.RunUntilIdle(); 103 } 104 105 // Completion callback for feature extraction. 106 void ExtractionDone(bool success) { 107 success_ = success; 108 msg_loop_.Quit(); 109 } 110 111 void QuitExtraction() { 112 extractor_->CancelPendingExtraction(); 113 msg_loop_.Quit(); 114 } 115 116 base::MessageLoop msg_loop_; 117 MockFeatureExtractorClock clock_; 118 scoped_ptr<PhishingTermFeatureExtractor> extractor_; 119 base::hash_set<std::string> term_hashes_; 120 base::hash_set<uint32> word_hashes_; 121 bool success_; // holds the success value from ExtractFeatures 122 }; 123 124 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { 125 // This test doesn't exercise the extraction timing. 126 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 127 128 string16 page_text = ASCIIToUTF16("blah"); 129 FeatureMap expected_features; // initially empty 130 131 FeatureMap features; 132 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 133 ExpectFeatureMapsAreEqual(features, expected_features); 134 135 page_text = ASCIIToUTF16("one one"); 136 expected_features.Clear(); 137 expected_features.AddBooleanFeature(features::kPageTerm + 138 std::string("one")); 139 expected_features.AddBooleanFeature(features::kPageTerm + 140 std::string("one one")); 141 142 features.Clear(); 143 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 144 ExpectFeatureMapsAreEqual(features, expected_features); 145 146 page_text = ASCIIToUTF16("bla bla multi word test bla"); 147 expected_features.Clear(); 148 expected_features.AddBooleanFeature(features::kPageTerm + 149 std::string("multi word test")); 150 151 features.Clear(); 152 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 153 ExpectFeatureMapsAreEqual(features, expected_features); 154 155 // This text has all of the words for one of the terms, but they are 156 // not in the correct order. 157 page_text = ASCIIToUTF16("bla bla test word multi bla"); 158 expected_features.Clear(); 159 160 features.Clear(); 161 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 162 ExpectFeatureMapsAreEqual(features, expected_features); 163 164 page_text = ASCIIToUTF16("Capitalization plus non-space\n" 165 "separator... punctuation!"); 166 expected_features.Clear(); 167 expected_features.AddBooleanFeature(features::kPageTerm + 168 std::string("capitalization")); 169 expected_features.AddBooleanFeature(features::kPageTerm + 170 std::string("space")); 171 expected_features.AddBooleanFeature(features::kPageTerm + 172 std::string("separator")); 173 expected_features.AddBooleanFeature(features::kPageTerm + 174 std::string("punctuation")); 175 176 features.Clear(); 177 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 178 ExpectFeatureMapsAreEqual(features, expected_features); 179 180 // Test with empty page text. 181 page_text = string16(); 182 expected_features.Clear(); 183 features.Clear(); 184 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 185 ExpectFeatureMapsAreEqual(features, expected_features); 186 187 // Chinese translation of the phrase "hello goodbye". This tests that 188 // we can correctly separate terms in languages that don't use spaces. 189 page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); 190 expected_features.Clear(); 191 expected_features.AddBooleanFeature( 192 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); 193 expected_features.AddBooleanFeature( 194 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); 195 196 features.Clear(); 197 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 198 ExpectFeatureMapsAreEqual(features, expected_features); 199 } 200 201 TEST_F(PhishingTermFeatureExtractorTest, Continuation) { 202 // For this test, we'll cause the feature extraction to run multiple 203 // iterations by incrementing the clock. 204 205 // This page has a total of 30 words. For the features to be computed 206 // correctly, the extractor has to process the entire string of text. 207 string16 page_text(ASCIIToUTF16("one ")); 208 for (int i = 0; i < 28; ++i) { 209 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 210 } 211 page_text.append(ASCIIToUTF16("two")); 212 213 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks. 214 // Note that this assumes kClockCheckGranularity = 5 and 215 // kMaxTimePerChunkMs = 10. 216 base::TimeTicks now = base::TimeTicks::Now(); 217 EXPECT_CALL(clock_, Now()) 218 // Time check at the start of extraction. 219 .WillOnce(Return(now)) 220 // Time check at the start of the first chunk of work. 221 .WillOnce(Return(now)) 222 // Time check after the first 5 words. 223 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3))) 224 // Time check after the next 5 words. 225 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6))) 226 // Time check after the next 5 words. 227 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9))) 228 // Time check after the next 5 words. This is over the chunk 229 // time limit, so a continuation task will be posted. 230 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12))) 231 // Time check at the start of the second chunk of work. 232 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22))) 233 // Time check after the next 5 words. 234 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25))) 235 // Time check after the next 5 words. 236 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28))) 237 // A final check for the histograms. 238 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))); 239 240 FeatureMap expected_features; 241 expected_features.AddBooleanFeature(features::kPageTerm + 242 std::string("one")); 243 expected_features.AddBooleanFeature(features::kPageTerm + 244 std::string("two")); 245 246 FeatureMap features; 247 ASSERT_TRUE(ExtractFeatures(&page_text, &features)); 248 ExpectFeatureMapsAreEqual(features, expected_features); 249 // Make sure none of the mock expectations carry over to the next test. 250 ::testing::Mock::VerifyAndClearExpectations(&clock_); 251 252 // Now repeat the test with the same text, but advance the clock faster so 253 // that the extraction time exceeds the maximum total time for the feature 254 // extractor. Extraction should fail. Note that this assumes 255 // kMaxTotalTimeMs = 500. 256 EXPECT_CALL(clock_, Now()) 257 // Time check at the start of extraction. 258 .WillOnce(Return(now)) 259 // Time check at the start of the first chunk of work. 260 .WillOnce(Return(now)) 261 // Time check after the first 5 words, 262 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) 263 // Time check at the start of the second chunk of work. 264 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) 265 // Time check after the next 5 words. This is over the limit. 266 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) 267 // A final time check for the histograms. 268 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); 269 270 features.Clear(); 271 EXPECT_FALSE(ExtractFeatures(&page_text, &features)); 272 } 273 274 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) { 275 scoped_ptr<string16> page_text(new string16(ASCIIToUTF16("one "))); 276 for (int i = 0; i < 28; ++i) { 277 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 278 } 279 280 base::TimeTicks now = base::TimeTicks::Now(); 281 EXPECT_CALL(clock_, Now()) 282 // Time check at the start of extraction. 283 .WillOnce(Return(now)) 284 // Time check at the start of the first chunk of work. 285 .WillOnce(Return(now)) 286 // Time check after the first 5 words. 287 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7))) 288 // Time check after the next 5 words. This should be greater than 289 // kMaxTimePerChunkMs so that we stop and schedule extraction for later. 290 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14))); 291 292 FeatureMap features; 293 // Extract first 10 words then stop. 294 PartialExtractFeatures(page_text.get(), &features); 295 296 page_text.reset(new string16()); 297 for (int i = 30; i < 58; ++i) { 298 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i))); 299 } 300 page_text->append(ASCIIToUTF16("multi word test ")); 301 features.Clear(); 302 303 // This part doesn't exercise the extraction timing. 304 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); 305 306 // Now extract normally and make sure nothing breaks. 307 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features)); 308 309 FeatureMap expected_features; 310 expected_features.AddBooleanFeature(features::kPageTerm + 311 std::string("multi word test")); 312 ExpectFeatureMapsAreEqual(features, expected_features); 313 } 314 315 } // namespace safe_browsing 316