Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
      6 
      7 #include <string>
      8 
      9 #include "base/bind.h"
     10 #include "base/command_line.h"
     11 #include "base/memory/scoped_ptr.h"
     12 #include "base/strings/string16.h"
     13 #include "base/strings/utf_string_conversions.h"
     14 #include "chrome/common/chrome_switches.h"
     15 #include "chrome/common/safe_browsing/client_model.pb.h"
     16 #include "chrome/common/safe_browsing/csd.pb.h"
     17 #include "chrome/renderer/safe_browsing/features.h"
     18 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
     19 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
     20 #include "chrome/renderer/safe_browsing/scorer.h"
     21 #include "chrome/test/base/in_process_browser_test.h"
     22 #include "chrome/test/base/ui_test_utils.h"
     23 #include "content/public/renderer/render_view.h"
     24 #include "crypto/sha2.h"
     25 #include "net/dns/mock_host_resolver.h"
     26 #include "net/test/embedded_test_server/embedded_test_server.h"
     27 #include "net/test/embedded_test_server/http_response.h"
     28 #include "testing/gmock/include/gmock/gmock.h"
     29 #include "url/gurl.h"
     30 
     31 using ::testing::AllOf;
     32 using ::testing::Contains;
     33 using ::testing::Not;
     34 using ::testing::Pair;
     35 
     namespace {

     // Routing ID handed to content::RenderView::FromRoutingID() to find the
     // view under test.  Routing ID 1 goes to the first RenderFrame, so the
     // first RenderView ends up with 2.
     const int kRenderViewRoutingId(2);

     }  // namespace
     42 
     43 namespace safe_browsing {
     44 
     45 class PhishingClassifierTest : public InProcessBrowserTest {
     46  protected:
     47   PhishingClassifierTest()
     48       : url_tld_token_net_(features::kUrlTldToken + std::string("net")),
     49         page_link_domain_phishing_(features::kPageLinkDomain +
     50                                    std::string("phishing.com")),
     51         page_term_login_(features::kPageTerm + std::string("login")) {
     52   }
     53 
     54   virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE {
     55     command_line->AppendSwitch(switches::kSingleProcess);
     56 #if defined(OS_WIN)
     57     // Don't want to try to create a GPU process.
     58     command_line->AppendSwitch(switches::kDisableGpu);
     59 #endif
     60   }
     61 
     62   virtual void SetUpOnMainThread() OVERRIDE {
     63     // Construct a model to test with.  We include one feature from each of
     64     // the feature extractors, which allows us to verify that they all ran.
     65     ClientSideModel model;
     66 
     67     model.add_hashes(crypto::SHA256HashString(url_tld_token_net_));
     68     model.add_hashes(crypto::SHA256HashString(page_link_domain_phishing_));
     69     model.add_hashes(crypto::SHA256HashString(page_term_login_));
     70     model.add_hashes(crypto::SHA256HashString("login"));
     71     model.add_hashes(crypto::SHA256HashString(features::kUrlTldToken +
     72                                               std::string("net")));
     73     model.add_hashes(crypto::SHA256HashString(features::kPageLinkDomain +
     74                                               std::string("phishing.com")));
     75     model.add_hashes(crypto::SHA256HashString(features::kPageTerm +
     76                                               std::string("login")));
     77     model.add_hashes(crypto::SHA256HashString("login"));
     78 
     79     // Add a default rule with a non-phishy weight.
     80     ClientSideModel::Rule* rule = model.add_rule();
     81     rule->set_weight(-1.0);
     82 
     83     // To give a phishy score, the total weight needs to be >= 0
     84     // (0.5 when converted to a probability).  This will only happen
     85     // if all of the listed features are present.
     86     rule = model.add_rule();
     87     rule->add_feature(0);
     88     rule->add_feature(1);
     89     rule->add_feature(2);
     90     rule->set_weight(1.0);
     91 
     92     model.add_page_term(3);
     93     model.set_murmur_hash_seed(2777808611U);
     94     model.add_page_word(MurmurHash3String("login", model.murmur_hash_seed()));
     95     model.set_max_words_per_term(1);
     96     model.set_max_shingles_per_page(100);
     97     model.set_shingle_size(3);
     98 
     99     clock_ = new MockFeatureExtractorClock;
    100     scorer_.reset(Scorer::Create(model.SerializeAsString()));
    101     ASSERT_TRUE(scorer_.get());
    102 
    103     classifier_.reset(new PhishingClassifier(
    104         content::RenderView::FromRoutingID(kRenderViewRoutingId),
    105         clock_));
    106   }
    107 
    108   virtual void TearDownOnMainThread() OVERRIDE {
    109     content::RunAllPendingInMessageLoop();
    110   }
    111 
    112   // Helper method to start phishing classification and wait for it to
    113   // complete.  Returns the true if the page is classified as phishy and
    114   // false otherwise.
    115   bool RunPhishingClassifier(const base::string16* page_text,
    116                              float* phishy_score,
    117                              FeatureMap* features) {
    118     ClientPhishingRequest verdict;
    119     // The classifier accesses the RenderView and must run in the RenderThread.
    120     PostTaskToInProcessRendererAndWait(
    121         base::Bind(&PhishingClassifierTest::DoRunPhishingClassifier,
    122                    base::Unretained(this),
    123                    page_text, phishy_score, features, &verdict));
    124     return verdict.is_phishing();
    125   }
    126 
    127   void DoRunPhishingClassifier(const base::string16* page_text,
    128                                float* phishy_score,
    129                                FeatureMap* features,
    130                                ClientPhishingRequest* verdict) {
    131     *phishy_score = PhishingClassifier::kInvalidScore;
    132     features->Clear();
    133 
    134     // Force synchronous behavior for ease of unittesting.
    135     base::RunLoop run_loop;
    136     classifier_->BeginClassification(
    137         page_text,
    138         base::Bind(&PhishingClassifierTest::ClassificationFinished,
    139                    base::Unretained(this), &run_loop, verdict));
    140     content::RunThisRunLoop(&run_loop);
    141 
    142     *phishy_score = verdict->client_score();
    143     for (int i = 0; i < verdict->feature_map_size(); ++i) {
    144       features->AddRealFeature(verdict->feature_map(i).name(),
    145                                verdict->feature_map(i).value());
    146     }
    147   }
    148 
    149   // Completion callback for classification.
    150   void ClassificationFinished(base::RunLoop* run_loop,
    151                               ClientPhishingRequest* verdict_out,
    152                               const ClientPhishingRequest& verdict) {
    153     *verdict_out = verdict;  // Copy the verdict.
    154     run_loop->Quit();
    155   }
    156 
    157   scoped_ptr<net::test_server::EmbeddedTestServer> embedded_test_server_;
    158   net::test_server::EmbeddedTestServer* embedded_test_server() {
    159     // TODO(ajwong): Merge this into BrowserTestBase.
    160     if (!embedded_test_server_) {
    161       embedded_test_server_.reset(new net::test_server::EmbeddedTestServer());
    162       embedded_test_server_->RegisterRequestHandler(
    163           base::Bind(&PhishingClassifierTest::HandleRequest,
    164                      base::Unretained(this)));
    165       CHECK(embedded_test_server_->InitializeAndWaitUntilReady());
    166     }
    167     return embedded_test_server_.get();
    168   }
    169 
    170   void LoadHtml(const std::string& host, const std::string& content) {
    171     GURL::Replacements replace_host;
    172     replace_host.SetHostStr(host);
    173     response_content_ = content;
    174     ui_test_utils::NavigateToURL(
    175         browser(),
    176         embedded_test_server()->base_url().ReplaceComponents(replace_host));
    177   }
    178 
    179   void LoadHtmlPost(const std::string& host, const std::string& content) {
    180     GURL::Replacements replace_host;
    181     replace_host.SetHostStr(host);
    182     response_content_ = content;
    183     ui_test_utils::NavigateToURLWithPost(
    184         browser(),
    185         embedded_test_server()->base_url().ReplaceComponents(replace_host));
    186   }
    187 
    188   scoped_ptr<net::test_server::HttpResponse>
    189       HandleRequest(const net::test_server::HttpRequest& request) {
    190     scoped_ptr<net::test_server::BasicHttpResponse> http_response(
    191         new net::test_server::BasicHttpResponse());
    192     http_response->set_code(net::HTTP_OK);
    193     http_response->set_content_type("text/html");
    194     http_response->set_content(response_content_);
    195     return http_response.PassAs<net::test_server::HttpResponse>();
    196   }
    197 
    198   std::string response_content_;
    199   scoped_ptr<Scorer> scorer_;
    200   scoped_ptr<PhishingClassifier> classifier_;
    201   MockFeatureExtractorClock* clock_;  // Owned by classifier_.
    202 
    203   // Features that are in the model.
    204   const std::string url_tld_token_net_;
    205   const std::string page_link_domain_phishing_;
    206   const std::string page_term_login_;
    207 };
    208 
    209 // This test flakes on Mac with force compositing mode.
    210 // http://crbug.com/316709
    211 #if defined(OS_MACOSX)
    212 #define MAYBE_TestClassification DISABLED_TestClassification
    213 #else
    214 #define MAYBE_TestClassification TestClassification
    215 #endif
    216 IN_PROC_BROWSER_TEST_F(PhishingClassifierTest, MAYBE_TestClassification) {
    217   host_resolver()->AddRule("*", "127.0.0.1");
    218 
    219   // No scorer yet, so the classifier is not ready.
    220   ASSERT_FALSE(classifier_->is_ready());
    221 
    222   // Now set the scorer.
    223   classifier_->set_phishing_scorer(scorer_.get());
    224   ASSERT_TRUE(classifier_->is_ready());
    225 
    226   // This test doesn't exercise the extraction timing.
    227   EXPECT_CALL(*clock_, Now())
    228       .WillRepeatedly(::testing::Return(base::TimeTicks::Now()));
    229 
    230   base::string16 page_text = base::ASCIIToUTF16("login");
    231   float phishy_score;
    232   FeatureMap features;
    233 
    234   LoadHtml("host.net",
    235       "<html><body><a href=\"http://phishing.com/\">login</a></body></html>");
    236   EXPECT_TRUE(RunPhishingClassifier(&page_text, &phishy_score, &features));
    237   // Note: features.features() might contain other features that simply aren't
    238   // in the model.
    239   EXPECT_THAT(features.features(),
    240               AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
    241                     Contains(Pair(page_link_domain_phishing_, 1.0)),
    242                     Contains(Pair(page_term_login_, 1.0))));
    243   EXPECT_FLOAT_EQ(0.5, phishy_score);
    244 
    245   // Change the link domain to something non-phishy.
    246   LoadHtml("host.net",
    247            "<html><body><a href=\"http://safe.com/\">login</a></body></html>");
    248   EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
    249   EXPECT_THAT(features.features(),
    250               AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
    251                     Contains(Pair(page_term_login_, 1.0))));
    252   EXPECT_THAT(features.features(),
    253               Not(Contains(Pair(page_link_domain_phishing_, 1.0))));
    254   EXPECT_GE(phishy_score, 0.0);
    255   EXPECT_LT(phishy_score, 0.5);
    256 
    257   // Extraction should fail for this case since there is no TLD.
    258   LoadHtml("localhost", "<html><body>content</body></html>");
    259   EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
    260   EXPECT_EQ(0U, features.features().size());
    261   EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
    262 
    263   // Extraction should also fail for this case because the URL is not http.
    264   net::SpawnedTestServer https_server(
    265       net::SpawnedTestServer::TYPE_HTTPS,
    266       net::SpawnedTestServer::kLocalhost,
    267       base::FilePath(FILE_PATH_LITERAL("chrome/test/data")));
    268   ASSERT_TRUE(https_server.Start());
    269   std::string host_str("host.net");  // Must outlive replace_host.
    270   GURL::Replacements replace_host;
    271   replace_host.SetHostStr(host_str);
    272   GURL test_url = https_server.GetURL("/files/title1.html");
    273   ui_test_utils::NavigateToURL(browser(),
    274                                test_url.ReplaceComponents(replace_host));
    275   EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
    276   EXPECT_EQ(0U, features.features().size());
    277   EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
    278 
    279   // Extraction should fail for this case because the URL is a POST request.
    280   LoadHtmlPost("host.net", "<html><body>content</body></html>");
    281   EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
    282   EXPECT_EQ(0U, features.features().size());
    283   EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
    284 }
    285 
    286 // Test flakes with LSAN enabled. See http://crbug.com/373155.
    287 #if defined(LEAK_SANITIZER)
    288 #define MAYBE_DisableDetection DISABLED_DisableDetection
    289 #else
    290 #define MAYBE_DisableDetection DisableDetection
    291 #endif
    292 IN_PROC_BROWSER_TEST_F(PhishingClassifierTest, MAYBE_DisableDetection) {
    293   // No scorer yet, so the classifier is not ready.
    294   EXPECT_FALSE(classifier_->is_ready());
    295 
    296   // Now set the scorer.
    297   classifier_->set_phishing_scorer(scorer_.get());
    298   EXPECT_TRUE(classifier_->is_ready());
    299 
    300   // Set a NULL scorer, which turns detection back off.
    301   classifier_->set_phishing_scorer(NULL);
    302   EXPECT_FALSE(classifier_->is_ready());
    303 }
    304 
    305 }  // namespace safe_browsing
    306