1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/renderer/safe_browsing/phishing_classifier.h" 6 7 #include <string> 8 9 #include "base/bind.h" 10 #include "base/callback.h" 11 #include "base/compiler_specific.h" 12 #include "base/logging.h" 13 #include "base/message_loop/message_loop.h" 14 #include "base/metrics/histogram.h" 15 #include "base/strings/string_util.h" 16 #include "chrome/common/safe_browsing/csd.pb.h" 17 #include "chrome/common/url_constants.h" 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 19 #include "chrome/renderer/safe_browsing/features.h" 20 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" 21 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 22 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" 23 #include "chrome/renderer/safe_browsing/scorer.h" 24 #include "content/public/renderer/render_view.h" 25 #include "crypto/sha2.h" 26 #include "third_party/WebKit/public/platform/WebURL.h" 27 #include "third_party/WebKit/public/platform/WebURLRequest.h" 28 #include "third_party/WebKit/public/web/WebDataSource.h" 29 #include "third_party/WebKit/public/web/WebDocument.h" 30 #include "third_party/WebKit/public/web/WebFrame.h" 31 #include "third_party/WebKit/public/web/WebView.h" 32 #include "url/gurl.h" 33 34 namespace safe_browsing { 35 36 const float PhishingClassifier::kInvalidScore = -1.0; 37 const float PhishingClassifier::kPhishyThreshold = 0.5; 38 39 PhishingClassifier::PhishingClassifier(content::RenderView* render_view, 40 FeatureExtractorClock* clock) 41 : render_view_(render_view), 42 scorer_(NULL), 43 clock_(clock), 44 weak_factory_(this) { 45 Clear(); 46 } 47 48 PhishingClassifier::~PhishingClassifier() { 49 // The RenderView should have called CancelPendingClassification() before 50 // we are destroyed. 51 CheckNoPendingClassification(); 52 } 53 54 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) { 55 CheckNoPendingClassification(); 56 scorer_ = scorer; 57 if (scorer_) { 58 url_extractor_.reset(new PhishingUrlFeatureExtractor); 59 dom_extractor_.reset( 60 new PhishingDOMFeatureExtractor(render_view_, clock_.get())); 61 term_extractor_.reset(new PhishingTermFeatureExtractor( 62 &scorer_->page_terms(), 63 &scorer_->page_words(), 64 scorer_->max_words_per_term(), 65 scorer_->murmurhash3_seed(), 66 scorer_->max_shingles_per_page(), 67 scorer_->shingle_size(), 68 clock_.get())); 69 } else { 70 // We're disabling client-side phishing detection, so tear down all 71 // of the relevant objects. 72 url_extractor_.reset(); 73 dom_extractor_.reset(); 74 term_extractor_.reset(); 75 } 76 } 77 78 bool PhishingClassifier::is_ready() const { 79 return scorer_ != NULL; 80 } 81 82 void PhishingClassifier::BeginClassification( 83 const base::string16* page_text, 84 const DoneCallback& done_callback) { 85 DCHECK(is_ready()); 86 87 // The RenderView should have called CancelPendingClassification() before 88 // starting a new classification, so DCHECK this. 89 CheckNoPendingClassification(); 90 // However, in an opt build, we will go ahead and clean up the pending 91 // classification so that we can start in a known state. 92 CancelPendingClassification(); 93 94 page_text_ = page_text; 95 done_callback_ = done_callback; 96 97 // For consistency, we always want to invoke the DoneCallback 98 // asynchronously, rather than directly from this method. To ensure that 99 // this is the case, post a task to begin feature extraction on the next 100 // iteration of the message loop. 101 base::MessageLoop::current()->PostTask( 102 FROM_HERE, 103 base::Bind(&PhishingClassifier::BeginFeatureExtraction, 104 weak_factory_.GetWeakPtr())); 105 } 106 107 void PhishingClassifier::BeginFeatureExtraction() { 108 blink::WebView* web_view = render_view_->GetWebView(); 109 if (!web_view) { 110 RunFailureCallback(); 111 return; 112 } 113 114 blink::WebFrame* frame = web_view->mainFrame(); 115 if (!frame) { 116 RunFailureCallback(); 117 return; 118 } 119 120 // Check whether the URL is one that we should classify. 121 // Currently, we only classify http: URLs that are GET requests. 122 GURL url(frame->document().url()); 123 if (!url.SchemeIs(url::kHttpScheme)) { 124 RunFailureCallback(); 125 return; 126 } 127 128 blink::WebDataSource* ds = frame->dataSource(); 129 if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) { 130 RunFailureCallback(); 131 return; 132 } 133 134 features_.reset(new FeatureMap); 135 if (!url_extractor_->ExtractFeatures(url, features_.get())) { 136 RunFailureCallback(); 137 return; 138 } 139 140 // DOM feature extraction can take awhile, so it runs asynchronously 141 // in several chunks of work and invokes the callback when finished. 142 dom_extractor_->ExtractFeatures( 143 features_.get(), 144 base::Bind(&PhishingClassifier::DOMExtractionFinished, 145 base::Unretained(this))); 146 } 147 148 void PhishingClassifier::CancelPendingClassification() { 149 // Note that cancelling the feature extractors is simply a no-op if they 150 // were not running. 151 DCHECK(is_ready()); 152 dom_extractor_->CancelPendingExtraction(); 153 term_extractor_->CancelPendingExtraction(); 154 weak_factory_.InvalidateWeakPtrs(); 155 Clear(); 156 } 157 158 void PhishingClassifier::DOMExtractionFinished(bool success) { 159 shingle_hashes_.reset(new std::set<uint32>); 160 if (success) { 161 // Term feature extraction can take awhile, so it runs asynchronously 162 // in several chunks of work and invokes the callback when finished. 163 term_extractor_->ExtractFeatures( 164 page_text_, 165 features_.get(), 166 shingle_hashes_.get(), 167 base::Bind(&PhishingClassifier::TermExtractionFinished, 168 base::Unretained(this))); 169 } else { 170 RunFailureCallback(); 171 } 172 } 173 174 void PhishingClassifier::TermExtractionFinished(bool success) { 175 if (success) { 176 blink::WebView* web_view = render_view_->GetWebView(); 177 if (!web_view) { 178 RunFailureCallback(); 179 return; 180 } 181 blink::WebFrame* main_frame = web_view->mainFrame(); 182 if (!main_frame) { 183 RunFailureCallback(); 184 return; 185 } 186 187 // Hash all of the features so that they match the model, then compute 188 // the score. 189 FeatureMap hashed_features; 190 ClientPhishingRequest verdict; 191 verdict.set_model_version(scorer_->model_version()); 192 verdict.set_url(main_frame->document().url().spec()); 193 for (base::hash_map<std::string, double>::const_iterator it = 194 features_->features().begin(); 195 it != features_->features().end(); ++it) { 196 VLOG(2) << "Feature: " << it->first << " = " << it->second; 197 bool result = hashed_features.AddRealFeature( 198 crypto::SHA256HashString(it->first), it->second); 199 DCHECK(result); 200 ClientPhishingRequest::Feature* feature = verdict.add_feature_map(); 201 feature->set_name(it->first); 202 feature->set_value(it->second); 203 } 204 for (std::set<uint32>::const_iterator it = shingle_hashes_->begin(); 205 it != shingle_hashes_->end(); ++it) { 206 verdict.add_shingle_hashes(*it); 207 } 208 float score = static_cast<float>(scorer_->ComputeScore(hashed_features)); 209 verdict.set_client_score(score); 210 verdict.set_is_phishing(score >= kPhishyThreshold); 211 RunCallback(verdict); 212 } else { 213 RunFailureCallback(); 214 } 215 } 216 217 void PhishingClassifier::CheckNoPendingClassification() { 218 DCHECK(done_callback_.is_null()); 219 DCHECK(!page_text_); 220 if (!done_callback_.is_null() || page_text_) { 221 LOG(ERROR) << "Classification in progress, missing call to " 222 << "CancelPendingClassification"; 223 UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed", 224 1); 225 } 226 } 227 228 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) { 229 done_callback_.Run(verdict); 230 Clear(); 231 } 232 233 void PhishingClassifier::RunFailureCallback() { 234 ClientPhishingRequest verdict; 235 // In this case we're not guaranteed to have a valid URL. Just set it 236 // to the empty string to make sure we have a valid protocol buffer. 237 verdict.set_url(""); 238 verdict.set_client_score(kInvalidScore); 239 verdict.set_is_phishing(false); 240 RunCallback(verdict); 241 } 242 243 void PhishingClassifier::Clear() { 244 page_text_ = NULL; 245 done_callback_.Reset(); 246 features_.reset(NULL); 247 shingle_hashes_.reset(NULL); 248 } 249 250 } // namespace safe_browsing 251