Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
      6 
      7 #include <string>
      8 
      9 #include "base/bind.h"
     10 #include "base/callback.h"
     11 #include "base/compiler_specific.h"
     12 #include "base/logging.h"
     13 #include "base/message_loop/message_loop.h"
     14 #include "base/metrics/histogram.h"
     15 #include "base/strings/string_util.h"
     16 #include "chrome/common/safe_browsing/csd.pb.h"
     17 #include "chrome/common/url_constants.h"
     18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
     19 #include "chrome/renderer/safe_browsing/features.h"
     20 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
     21 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
     22 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
     23 #include "chrome/renderer/safe_browsing/scorer.h"
     24 #include "content/public/renderer/render_view.h"
     25 #include "crypto/sha2.h"
     26 #include "third_party/WebKit/public/platform/WebURL.h"
     27 #include "third_party/WebKit/public/platform/WebURLRequest.h"
     28 #include "third_party/WebKit/public/web/WebDataSource.h"
     29 #include "third_party/WebKit/public/web/WebDocument.h"
     30 #include "third_party/WebKit/public/web/WebFrame.h"
     31 #include "third_party/WebKit/public/web/WebView.h"
     32 #include "url/gurl.h"
     33 
     34 namespace safe_browsing {
     35 
     36 const float PhishingClassifier::kInvalidScore = -1.0;
     37 const float PhishingClassifier::kPhishyThreshold = 0.5;
     38 
     39 PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
     40                                        FeatureExtractorClock* clock)
     41     : render_view_(render_view),
     42       scorer_(NULL),
     43       clock_(clock),
     44       weak_factory_(this) {
     45   Clear();
     46 }
     47 
     48 PhishingClassifier::~PhishingClassifier() {
     49   // The RenderView should have called CancelPendingClassification() before
     50   // we are destroyed.
     51   CheckNoPendingClassification();
     52 }
     53 
     54 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
     55   CheckNoPendingClassification();
     56   scorer_ = scorer;
     57   if (scorer_) {
     58     url_extractor_.reset(new PhishingUrlFeatureExtractor);
     59     dom_extractor_.reset(
     60         new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
     61     term_extractor_.reset(new PhishingTermFeatureExtractor(
     62         &scorer_->page_terms(),
     63         &scorer_->page_words(),
     64         scorer_->max_words_per_term(),
     65         scorer_->murmurhash3_seed(),
     66         scorer_->max_shingles_per_page(),
     67         scorer_->shingle_size(),
     68         clock_.get()));
     69   } else {
     70     // We're disabling client-side phishing detection, so tear down all
     71     // of the relevant objects.
     72     url_extractor_.reset();
     73     dom_extractor_.reset();
     74     term_extractor_.reset();
     75   }
     76 }
     77 
     78 bool PhishingClassifier::is_ready() const {
     79   return scorer_ != NULL;
     80 }
     81 
     82 void PhishingClassifier::BeginClassification(
     83     const base::string16* page_text,
     84     const DoneCallback& done_callback) {
     85   DCHECK(is_ready());
     86 
     87   // The RenderView should have called CancelPendingClassification() before
     88   // starting a new classification, so DCHECK this.
     89   CheckNoPendingClassification();
     90   // However, in an opt build, we will go ahead and clean up the pending
     91   // classification so that we can start in a known state.
     92   CancelPendingClassification();
     93 
     94   page_text_ = page_text;
     95   done_callback_ = done_callback;
     96 
     97   // For consistency, we always want to invoke the DoneCallback
     98   // asynchronously, rather than directly from this method.  To ensure that
     99   // this is the case, post a task to begin feature extraction on the next
    100   // iteration of the message loop.
    101   base::MessageLoop::current()->PostTask(
    102       FROM_HERE,
    103       base::Bind(&PhishingClassifier::BeginFeatureExtraction,
    104                  weak_factory_.GetWeakPtr()));
    105 }
    106 
    107 void PhishingClassifier::BeginFeatureExtraction() {
    108   blink::WebView* web_view = render_view_->GetWebView();
    109   if (!web_view) {
    110     RunFailureCallback();
    111     return;
    112   }
    113 
    114   blink::WebFrame* frame = web_view->mainFrame();
    115   if (!frame) {
    116     RunFailureCallback();
    117     return;
    118   }
    119 
    120   // Check whether the URL is one that we should classify.
    121   // Currently, we only classify http: URLs that are GET requests.
    122   GURL url(frame->document().url());
    123   if (!url.SchemeIs(url::kHttpScheme)) {
    124     RunFailureCallback();
    125     return;
    126   }
    127 
    128   blink::WebDataSource* ds = frame->dataSource();
    129   if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) {
    130     RunFailureCallback();
    131     return;
    132   }
    133 
    134   features_.reset(new FeatureMap);
    135   if (!url_extractor_->ExtractFeatures(url, features_.get())) {
    136     RunFailureCallback();
    137     return;
    138   }
    139 
    140   // DOM feature extraction can take awhile, so it runs asynchronously
    141   // in several chunks of work and invokes the callback when finished.
    142   dom_extractor_->ExtractFeatures(
    143       features_.get(),
    144       base::Bind(&PhishingClassifier::DOMExtractionFinished,
    145                  base::Unretained(this)));
    146 }
    147 
    148 void PhishingClassifier::CancelPendingClassification() {
    149   // Note that cancelling the feature extractors is simply a no-op if they
    150   // were not running.
    151   DCHECK(is_ready());
    152   dom_extractor_->CancelPendingExtraction();
    153   term_extractor_->CancelPendingExtraction();
    154   weak_factory_.InvalidateWeakPtrs();
    155   Clear();
    156 }
    157 
    158 void PhishingClassifier::DOMExtractionFinished(bool success) {
    159   shingle_hashes_.reset(new std::set<uint32>);
    160   if (success) {
    161     // Term feature extraction can take awhile, so it runs asynchronously
    162     // in several chunks of work and invokes the callback when finished.
    163     term_extractor_->ExtractFeatures(
    164         page_text_,
    165         features_.get(),
    166         shingle_hashes_.get(),
    167         base::Bind(&PhishingClassifier::TermExtractionFinished,
    168                    base::Unretained(this)));
    169   } else {
    170     RunFailureCallback();
    171   }
    172 }
    173 
    174 void PhishingClassifier::TermExtractionFinished(bool success) {
    175   if (success) {
    176     blink::WebView* web_view = render_view_->GetWebView();
    177     if (!web_view) {
    178       RunFailureCallback();
    179       return;
    180     }
    181     blink::WebFrame* main_frame = web_view->mainFrame();
    182     if (!main_frame) {
    183       RunFailureCallback();
    184       return;
    185     }
    186 
    187     // Hash all of the features so that they match the model, then compute
    188     // the score.
    189     FeatureMap hashed_features;
    190     ClientPhishingRequest verdict;
    191     verdict.set_model_version(scorer_->model_version());
    192     verdict.set_url(main_frame->document().url().spec());
    193     for (base::hash_map<std::string, double>::const_iterator it =
    194              features_->features().begin();
    195          it != features_->features().end(); ++it) {
    196       VLOG(2) << "Feature: " << it->first << " = " << it->second;
    197       bool result = hashed_features.AddRealFeature(
    198           crypto::SHA256HashString(it->first), it->second);
    199       DCHECK(result);
    200       ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
    201       feature->set_name(it->first);
    202       feature->set_value(it->second);
    203     }
    204     for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
    205          it != shingle_hashes_->end(); ++it) {
    206       verdict.add_shingle_hashes(*it);
    207     }
    208     float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
    209     verdict.set_client_score(score);
    210     verdict.set_is_phishing(score >= kPhishyThreshold);
    211     RunCallback(verdict);
    212   } else {
    213     RunFailureCallback();
    214   }
    215 }
    216 
    217 void PhishingClassifier::CheckNoPendingClassification() {
    218   DCHECK(done_callback_.is_null());
    219   DCHECK(!page_text_);
    220   if (!done_callback_.is_null() || page_text_) {
    221     LOG(ERROR) << "Classification in progress, missing call to "
    222                << "CancelPendingClassification";
    223     UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
    224                          1);
    225   }
    226 }
    227 
    228 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
    229   done_callback_.Run(verdict);
    230   Clear();
    231 }
    232 
    233 void PhishingClassifier::RunFailureCallback() {
    234   ClientPhishingRequest verdict;
    235   // In this case we're not guaranteed to have a valid URL.  Just set it
    236   // to the empty string to make sure we have a valid protocol buffer.
    237   verdict.set_url("");
    238   verdict.set_client_score(kInvalidScore);
    239   verdict.set_is_phishing(false);
    240   RunCallback(verdict);
    241 }
    242 
    243 void PhishingClassifier::Clear() {
    244   page_text_ = NULL;
    245   done_callback_.Reset();
    246   features_.reset(NULL);
    247   shingle_hashes_.reset(NULL);
    248 }
    249 
    250 }  // namespace safe_browsing
    251