1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/renderer/safe_browsing/phishing_classifier.h" 6 7 #include <string> 8 9 #include "base/bind.h" 10 #include "base/callback.h" 11 #include "base/compiler_specific.h" 12 #include "base/logging.h" 13 #include "base/message_loop/message_loop.h" 14 #include "base/metrics/histogram.h" 15 #include "base/strings/string_util.h" 16 #include "chrome/common/safe_browsing/csd.pb.h" 17 #include "chrome/common/url_constants.h" 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 19 #include "chrome/renderer/safe_browsing/features.h" 20 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" 21 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 22 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" 23 #include "chrome/renderer/safe_browsing/scorer.h" 24 #include "content/public/renderer/render_view.h" 25 #include "crypto/sha2.h" 26 #include "third_party/WebKit/public/platform/WebURL.h" 27 #include "third_party/WebKit/public/platform/WebURLRequest.h" 28 #include "third_party/WebKit/public/web/WebDataSource.h" 29 #include "third_party/WebKit/public/web/WebDocument.h" 30 #include "third_party/WebKit/public/web/WebFrame.h" 31 #include "third_party/WebKit/public/web/WebView.h" 32 #include "url/gurl.h" 33 34 namespace safe_browsing { 35 36 const float PhishingClassifier::kInvalidScore = -1.0; 37 const float PhishingClassifier::kPhishyThreshold = 0.5; 38 39 PhishingClassifier::PhishingClassifier(content::RenderView* render_view, 40 FeatureExtractorClock* clock) 41 : render_view_(render_view), 42 scorer_(NULL), 43 clock_(clock), 44 weak_factory_(this) { 45 Clear(); 46 } 47 48 PhishingClassifier::~PhishingClassifier() { 49 // The RenderView should have called CancelPendingClassification() before 50 // we are destroyed. 51 CheckNoPendingClassification(); 52 } 53 54 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) { 55 CheckNoPendingClassification(); 56 scorer_ = scorer; 57 if (scorer_) { 58 url_extractor_.reset(new PhishingUrlFeatureExtractor); 59 dom_extractor_.reset( 60 new PhishingDOMFeatureExtractor(render_view_, clock_.get())); 61 term_extractor_.reset(new PhishingTermFeatureExtractor( 62 &scorer_->page_terms(), 63 &scorer_->page_words(), 64 scorer_->max_words_per_term(), 65 scorer_->murmurhash3_seed(), 66 clock_.get())); 67 } else { 68 // We're disabling client-side phishing detection, so tear down all 69 // of the relevant objects. 70 url_extractor_.reset(); 71 dom_extractor_.reset(); 72 term_extractor_.reset(); 73 } 74 } 75 76 bool PhishingClassifier::is_ready() const { 77 return scorer_ != NULL; 78 } 79 80 void PhishingClassifier::BeginClassification( 81 const base::string16* page_text, 82 const DoneCallback& done_callback) { 83 DCHECK(is_ready()); 84 85 // The RenderView should have called CancelPendingClassification() before 86 // starting a new classification, so DCHECK this. 87 CheckNoPendingClassification(); 88 // However, in an opt build, we will go ahead and clean up the pending 89 // classification so that we can start in a known state. 90 CancelPendingClassification(); 91 92 page_text_ = page_text; 93 done_callback_ = done_callback; 94 95 // For consistency, we always want to invoke the DoneCallback 96 // asynchronously, rather than directly from this method. To ensure that 97 // this is the case, post a task to begin feature extraction on the next 98 // iteration of the message loop. 99 base::MessageLoop::current()->PostTask( 100 FROM_HERE, 101 base::Bind(&PhishingClassifier::BeginFeatureExtraction, 102 weak_factory_.GetWeakPtr())); 103 } 104 105 void PhishingClassifier::BeginFeatureExtraction() { 106 blink::WebView* web_view = render_view_->GetWebView(); 107 if (!web_view) { 108 RunFailureCallback(); 109 return; 110 } 111 112 blink::WebFrame* frame = web_view->mainFrame(); 113 if (!frame) { 114 RunFailureCallback(); 115 return; 116 } 117 118 // Check whether the URL is one that we should classify. 119 // Currently, we only classify http: URLs that are GET requests. 120 GURL url(frame->document().url()); 121 if (!url.SchemeIs(content::kHttpScheme)) { 122 RunFailureCallback(); 123 return; 124 } 125 126 blink::WebDataSource* ds = frame->dataSource(); 127 if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) { 128 RunFailureCallback(); 129 return; 130 } 131 132 features_.reset(new FeatureMap); 133 if (!url_extractor_->ExtractFeatures(url, features_.get())) { 134 RunFailureCallback(); 135 return; 136 } 137 138 // DOM feature extraction can take awhile, so it runs asynchronously 139 // in several chunks of work and invokes the callback when finished. 140 dom_extractor_->ExtractFeatures( 141 features_.get(), 142 base::Bind(&PhishingClassifier::DOMExtractionFinished, 143 base::Unretained(this))); 144 } 145 146 void PhishingClassifier::CancelPendingClassification() { 147 // Note that cancelling the feature extractors is simply a no-op if they 148 // were not running. 149 DCHECK(is_ready()); 150 dom_extractor_->CancelPendingExtraction(); 151 term_extractor_->CancelPendingExtraction(); 152 weak_factory_.InvalidateWeakPtrs(); 153 Clear(); 154 } 155 156 void PhishingClassifier::DOMExtractionFinished(bool success) { 157 if (success) { 158 // Term feature extraction can take awhile, so it runs asynchronously 159 // in several chunks of work and invokes the callback when finished. 160 term_extractor_->ExtractFeatures( 161 page_text_, 162 features_.get(), 163 base::Bind(&PhishingClassifier::TermExtractionFinished, 164 base::Unretained(this))); 165 } else { 166 RunFailureCallback(); 167 } 168 } 169 170 void PhishingClassifier::TermExtractionFinished(bool success) { 171 if (success) { 172 blink::WebView* web_view = render_view_->GetWebView(); 173 if (!web_view) { 174 RunFailureCallback(); 175 return; 176 } 177 blink::WebFrame* main_frame = web_view->mainFrame(); 178 if (!main_frame) { 179 RunFailureCallback(); 180 return; 181 } 182 183 // Hash all of the features so that they match the model, then compute 184 // the score. 185 FeatureMap hashed_features; 186 ClientPhishingRequest verdict; 187 verdict.set_model_version(scorer_->model_version()); 188 verdict.set_url(main_frame->document().url().spec()); 189 for (base::hash_map<std::string, double>::const_iterator it = 190 features_->features().begin(); 191 it != features_->features().end(); ++it) { 192 VLOG(2) << "Feature: " << it->first << " = " << it->second; 193 bool result = hashed_features.AddRealFeature( 194 crypto::SHA256HashString(it->first), it->second); 195 DCHECK(result); 196 ClientPhishingRequest::Feature* feature = verdict.add_feature_map(); 197 feature->set_name(it->first); 198 feature->set_value(it->second); 199 } 200 float score = static_cast<float>(scorer_->ComputeScore(hashed_features)); 201 verdict.set_client_score(score); 202 verdict.set_is_phishing(score >= kPhishyThreshold); 203 RunCallback(verdict); 204 } else { 205 RunFailureCallback(); 206 } 207 } 208 209 void PhishingClassifier::CheckNoPendingClassification() { 210 DCHECK(done_callback_.is_null()); 211 DCHECK(!page_text_); 212 if (!done_callback_.is_null() || page_text_) { 213 LOG(ERROR) << "Classification in progress, missing call to " 214 << "CancelPendingClassification"; 215 UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed", 216 1); 217 } 218 } 219 220 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) { 221 done_callback_.Run(verdict); 222 Clear(); 223 } 224 225 void PhishingClassifier::RunFailureCallback() { 226 ClientPhishingRequest verdict; 227 // In this case we're not guaranteed to have a valid URL. Just set it 228 // to the empty string to make sure we have a valid protocol buffer. 229 verdict.set_url(""); 230 verdict.set_client_score(kInvalidScore); 231 verdict.set_is_phishing(false); 232 RunCallback(verdict); 233 } 234 235 void PhishingClassifier::Clear() { 236 page_text_ = NULL; 237 done_callback_.Reset(); 238 features_.reset(NULL); 239 } 240 241 } // namespace safe_browsing 242