Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h"
      6 
      7 #include <set>
      8 
      9 #include "base/bind.h"
     10 #include "base/callback.h"
     11 #include "base/lazy_instance.h"
     12 #include "base/logging.h"
     13 #include "base/metrics/histogram.h"
     14 #include "chrome/common/safe_browsing/csd.pb.h"
     15 #include "chrome/common/safe_browsing/safebrowsing_messages.h"
     16 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
     17 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
     18 #include "chrome/renderer/safe_browsing/scorer.h"
     19 #include "content/public/renderer/document_state.h"
     20 #include "content/public/renderer/navigation_state.h"
     21 #include "content/public/renderer/render_thread.h"
     22 #include "content/public/renderer/render_view.h"
     23 #include "third_party/WebKit/public/platform/WebURL.h"
     24 #include "third_party/WebKit/public/web/WebDocument.h"
     25 #include "third_party/WebKit/public/web/WebLocalFrame.h"
     26 #include "third_party/WebKit/public/web/WebView.h"
     27 
     28 using content::DocumentState;
     29 using content::NavigationState;
     30 using content::RenderThread;
     31 
     32 namespace safe_browsing {
     33 
     34 static GURL StripRef(const GURL& url) {
     35   GURL::Replacements replacements;
     36   replacements.ClearRef();
     37   return url.ReplaceComponents(replacements);
     38 }
     39 
     40 typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates;
     41 static base::LazyInstance<PhishingClassifierDelegates>
     42     g_delegates = LAZY_INSTANCE_INITIALIZER;
     43 
     44 static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> >
     45     g_phishing_scorer = LAZY_INSTANCE_INITIALIZER;
     46 
     47 // static
     48 PhishingClassifierFilter* PhishingClassifierFilter::Create() {
     49   // Private constructor and public static Create() method to facilitate
     50   // stubbing out this class for binary-size reduction purposes.
     51   return new PhishingClassifierFilter();
     52 }
     53 
     54 PhishingClassifierFilter::PhishingClassifierFilter()
     55     : RenderProcessObserver() {}
     56 
     57 PhishingClassifierFilter::~PhishingClassifierFilter() {}
     58 
     59 bool PhishingClassifierFilter::OnControlMessageReceived(
     60     const IPC::Message& message) {
     61   bool handled = true;
     62   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message)
     63     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel)
     64     IPC_MESSAGE_UNHANDLED(handled = false)
     65   IPC_END_MESSAGE_MAP()
     66   return handled;
     67 }
     68 
     69 void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) {
     70   safe_browsing::Scorer* scorer = NULL;
     71   // An empty model string means we should disable client-side phishing
     72   // detection.
     73   if (!model.empty()) {
     74     scorer = safe_browsing::Scorer::Create(model);
     75     if (!scorer) {
     76       DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?";
     77       return;
     78     }
     79   }
     80   PhishingClassifierDelegates::iterator i;
     81   for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) {
     82     (*i)->SetPhishingScorer(scorer);
     83   }
     84   g_phishing_scorer.Get().reset(scorer);
     85 }
     86 
     87 // static
     88 PhishingClassifierDelegate* PhishingClassifierDelegate::Create(
     89     content::RenderView* render_view, PhishingClassifier* classifier) {
     90   // Private constructor and public static Create() method to facilitate
     91   // stubbing out this class for binary-size reduction purposes.
     92   return new PhishingClassifierDelegate(render_view, classifier);
     93 }
     94 
     95 PhishingClassifierDelegate::PhishingClassifierDelegate(
     96     content::RenderView* render_view,
     97     PhishingClassifier* classifier)
     98     : content::RenderViewObserver(render_view),
     99       last_main_frame_transition_(ui::PAGE_TRANSITION_LINK),
    100       have_page_text_(false),
    101       is_classifying_(false) {
    102   g_delegates.Get().insert(this);
    103   if (!classifier) {
    104     classifier = new PhishingClassifier(render_view,
    105                                         new FeatureExtractorClock());
    106   }
    107 
    108   classifier_.reset(classifier);
    109 
    110   if (g_phishing_scorer.Get().get())
    111     SetPhishingScorer(g_phishing_scorer.Get().get());
    112 }
    113 
    114 PhishingClassifierDelegate::~PhishingClassifierDelegate() {
    115   CancelPendingClassification(SHUTDOWN);
    116   g_delegates.Get().erase(this);
    117 }
    118 
    119 void PhishingClassifierDelegate::SetPhishingScorer(
    120     const safe_browsing::Scorer* scorer) {
    121   if (!render_view()->GetWebView())
    122     return;  // RenderView is tearing down.
    123   if (is_classifying_) {
    124     // If there is a classification going on right now it means we're
    125     // actually replacing an existing scorer with a new model.  In
    126     // this case we simply cancel the current classification.
    127     // TODO(noelutz): if this happens too frequently we could also
    128     // replace the old scorer with the new one once classification is done
    129     // but this would complicate the code somewhat.
    130     CancelPendingClassification(NEW_PHISHING_SCORER);
    131   }
    132   classifier_->set_phishing_scorer(scorer);
    133   // Start classifying the current page if all conditions are met.
    134   // See MaybeStartClassification() for details.
    135   MaybeStartClassification();
    136 }
    137 
    138 void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) {
    139   last_url_received_from_browser_ = StripRef(url);
    140   // Start classifying the current page if all conditions are met.
    141   // See MaybeStartClassification() for details.
    142   MaybeStartClassification();
    143 }
    144 
    145 void PhishingClassifierDelegate::DidCommitProvisionalLoad(
    146     blink::WebLocalFrame* frame, bool is_new_navigation) {
    147   // A new page is starting to load, so cancel classificaiton.
    148   //
    149   // TODO(bryner): We shouldn't need to cancel classification if the navigation
    150   // is within the same page.  However, if we let classification continue in
    151   // this case, we need to properly deal with the fact that PageCaptured will
    152   // be called again for the in-page navigation.  We need to be sure not to
    153   // swap out the page text while the term feature extractor is still running.
    154   DocumentState* document_state = DocumentState::FromDataSource(
    155       frame->dataSource());
    156   NavigationState* navigation_state = document_state->navigation_state();
    157   CancelPendingClassification(navigation_state->was_within_same_page() ?
    158                               NAVIGATE_WITHIN_PAGE : NAVIGATE_AWAY);
    159   if (frame == render_view()->GetWebView()->mainFrame()) {
    160     last_main_frame_transition_ = navigation_state->transition_type();
    161   }
    162 }
    163 
    164 void PhishingClassifierDelegate::PageCaptured(base::string16* page_text,
    165                                               bool preliminary_capture) {
    166   if (preliminary_capture) {
    167     return;
    168   }
    169   // Make sure there's no classification in progress.  We don't want to swap
    170   // out the page text string from underneath the term feature extractor.
    171   //
    172   // Note: Currently, if the url hasn't changed, we won't restart
    173   // classification in this case.  We may want to adjust this.
    174   CancelPendingClassification(PAGE_RECAPTURED);
    175   last_finished_load_url_ = GetToplevelUrl();
    176   classifier_page_text_.swap(*page_text);
    177   have_page_text_ = true;
    178   MaybeStartClassification();
    179 }
    180 
    181 void PhishingClassifierDelegate::CancelPendingClassification(
    182     CancelClassificationReason reason) {
    183   if (is_classifying_) {
    184     UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason",
    185                               reason,
    186                               CANCEL_CLASSIFICATION_MAX);
    187     is_classifying_ = false;
    188   }
    189   if (classifier_->is_ready()) {
    190     classifier_->CancelPendingClassification();
    191   }
    192   classifier_page_text_.clear();
    193   have_page_text_ = false;
    194 }
    195 
    196 bool PhishingClassifierDelegate::OnMessageReceived(
    197     const IPC::Message& message) {
    198   bool handled = true;
    199   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message)
    200     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection,
    201                         OnStartPhishingDetection)
    202     IPC_MESSAGE_UNHANDLED(handled = false)
    203   IPC_END_MESSAGE_MAP()
    204   return handled;
    205 }
    206 
    207 void PhishingClassifierDelegate::ClassificationDone(
    208     const ClientPhishingRequest& verdict) {
    209   // We no longer need the page text.
    210   classifier_page_text_.clear();
    211   VLOG(2) << "Phishy verdict = " << verdict.is_phishing()
    212           << " score = " << verdict.client_score();
    213   if (verdict.client_score() != PhishingClassifier::kInvalidScore) {
    214     DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url());
    215     RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone(
    216         routing_id(), verdict.SerializeAsString()));
    217   }
    218 }
    219 
    220 GURL PhishingClassifierDelegate::GetToplevelUrl() {
    221   return render_view()->GetWebView()->mainFrame()->document().url();
    222 }
    223 
    224 void PhishingClassifierDelegate::MaybeStartClassification() {
    225   // We can begin phishing classification when the following conditions are
    226   // met:
    227   //  1. A Scorer has been created
    228   //  2. The browser has sent a StartPhishingDetection message for the current
    229   //     toplevel URL.
    230   //  3. The page has finished loading and the page text has been extracted.
    231   //  4. The load is a new navigation (not a session history navigation).
    232   //  5. The toplevel URL has not already been classified.
    233   //
    234   // Note that if we determine that this particular navigation should not be
    235   // classified at all (as opposed to deferring it until we get an IPC or the
    236   // load completes), we discard the page text since it won't be needed.
    237   if (!classifier_->is_ready()) {
    238     VLOG(2) << "Not starting classification, no Scorer created.";
    239     // Keep classifier_page_text_, in case a Scorer is set later.
    240     return;
    241   }
    242 
    243   if (last_main_frame_transition_ & ui::PAGE_TRANSITION_FORWARD_BACK) {
    244     // Skip loads from session history navigation.  However, update the
    245     // last URL sent to the classifier, so that we'll properly detect
    246     // in-page navigations.
    247     VLOG(2) << "Not starting classification for back/forward navigation";
    248     last_url_sent_to_classifier_ = last_finished_load_url_;
    249     classifier_page_text_.clear();  // we won't need this.
    250     have_page_text_ = false;
    251     return;
    252   }
    253 
    254   GURL stripped_last_load_url(StripRef(last_finished_load_url_));
    255   if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
    256     // We've already classified this toplevel URL, so this was likely an
    257     // in-page navigation or a subframe navigation.  The browser should not
    258     // send a StartPhishingDetection IPC in this case.
    259     VLOG(2) << "Toplevel URL is unchanged, not starting classification.";
    260     classifier_page_text_.clear();  // we won't need this.
    261     have_page_text_ = false;
    262     return;
    263   }
    264 
    265   if (!have_page_text_) {
    266     VLOG(2) << "Not starting classification, there is no page text ready.";
    267     return;
    268   }
    269 
    270   if (last_url_received_from_browser_ != stripped_last_load_url) {
    271     // The browser has not yet confirmed that this URL should be classified,
    272     // so defer classification for now.  Note: the ref does not affect
    273     // any of the browser's preclassification checks, so we don't require it
    274     // to match.
    275     VLOG(2) << "Not starting classification, last url from browser is "
    276             << last_url_received_from_browser_ << ", last finished load is "
    277             << last_finished_load_url_;
    278     // Keep classifier_page_text_, in case the browser notifies us later that
    279     // we should classify the URL.
    280     return;
    281   }
    282 
    283   VLOG(2) << "Starting classification for " << last_finished_load_url_;
    284   last_url_sent_to_classifier_ = last_finished_load_url_;
    285   is_classifying_ = true;
    286   classifier_->BeginClassification(
    287       &classifier_page_text_,
    288       base::Bind(&PhishingClassifierDelegate::ClassificationDone,
    289                  base::Unretained(this)));
    290 }
    291 
    292 }  // namespace safe_browsing
    293