1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h" 6 7 #include <set> 8 9 #include "base/bind.h" 10 #include "base/callback.h" 11 #include "base/lazy_instance.h" 12 #include "base/logging.h" 13 #include "base/metrics/histogram.h" 14 #include "chrome/common/safe_browsing/csd.pb.h" 15 #include "chrome/common/safe_browsing/safebrowsing_messages.h" 16 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 17 #include "chrome/renderer/safe_browsing/phishing_classifier.h" 18 #include "chrome/renderer/safe_browsing/scorer.h" 19 #include "content/public/renderer/document_state.h" 20 #include "content/public/renderer/navigation_state.h" 21 #include "content/public/renderer/render_thread.h" 22 #include "content/public/renderer/render_view.h" 23 #include "third_party/WebKit/public/platform/WebURL.h" 24 #include "third_party/WebKit/public/web/WebDocument.h" 25 #include "third_party/WebKit/public/web/WebLocalFrame.h" 26 #include "third_party/WebKit/public/web/WebView.h" 27 28 using content::DocumentState; 29 using content::NavigationState; 30 using content::RenderThread; 31 32 namespace safe_browsing { 33 34 static GURL StripRef(const GURL& url) { 35 GURL::Replacements replacements; 36 replacements.ClearRef(); 37 return url.ReplaceComponents(replacements); 38 } 39 40 typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates; 41 static base::LazyInstance<PhishingClassifierDelegates> 42 g_delegates = LAZY_INSTANCE_INITIALIZER; 43 44 static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> > 45 g_phishing_scorer = LAZY_INSTANCE_INITIALIZER; 46 47 // static 48 PhishingClassifierFilter* PhishingClassifierFilter::Create() { 49 // Private constructor and public static Create() method to facilitate 50 // stubbing out this class for binary-size reduction purposes. 51 return new PhishingClassifierFilter(); 52 } 53 54 PhishingClassifierFilter::PhishingClassifierFilter() 55 : RenderProcessObserver() {} 56 57 PhishingClassifierFilter::~PhishingClassifierFilter() {} 58 59 bool PhishingClassifierFilter::OnControlMessageReceived( 60 const IPC::Message& message) { 61 bool handled = true; 62 IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message) 63 IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel) 64 IPC_MESSAGE_UNHANDLED(handled = false) 65 IPC_END_MESSAGE_MAP() 66 return handled; 67 } 68 69 void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) { 70 safe_browsing::Scorer* scorer = NULL; 71 // An empty model string means we should disable client-side phishing 72 // detection. 73 if (!model.empty()) { 74 scorer = safe_browsing::Scorer::Create(model); 75 if (!scorer) { 76 DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?"; 77 return; 78 } 79 } 80 PhishingClassifierDelegates::iterator i; 81 for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) { 82 (*i)->SetPhishingScorer(scorer); 83 } 84 g_phishing_scorer.Get().reset(scorer); 85 } 86 87 // static 88 PhishingClassifierDelegate* PhishingClassifierDelegate::Create( 89 content::RenderView* render_view, PhishingClassifier* classifier) { 90 // Private constructor and public static Create() method to facilitate 91 // stubbing out this class for binary-size reduction purposes. 92 return new PhishingClassifierDelegate(render_view, classifier); 93 } 94 95 PhishingClassifierDelegate::PhishingClassifierDelegate( 96 content::RenderView* render_view, 97 PhishingClassifier* classifier) 98 : content::RenderViewObserver(render_view), 99 last_main_frame_transition_(ui::PAGE_TRANSITION_LINK), 100 have_page_text_(false), 101 is_classifying_(false) { 102 g_delegates.Get().insert(this); 103 if (!classifier) { 104 classifier = new PhishingClassifier(render_view, 105 new FeatureExtractorClock()); 106 } 107 108 classifier_.reset(classifier); 109 110 if (g_phishing_scorer.Get().get()) 111 SetPhishingScorer(g_phishing_scorer.Get().get()); 112 } 113 114 PhishingClassifierDelegate::~PhishingClassifierDelegate() { 115 CancelPendingClassification(SHUTDOWN); 116 g_delegates.Get().erase(this); 117 } 118 119 void PhishingClassifierDelegate::SetPhishingScorer( 120 const safe_browsing::Scorer* scorer) { 121 if (!render_view()->GetWebView()) 122 return; // RenderView is tearing down. 123 if (is_classifying_) { 124 // If there is a classification going on right now it means we're 125 // actually replacing an existing scorer with a new model. In 126 // this case we simply cancel the current classification. 127 // TODO(noelutz): if this happens too frequently we could also 128 // replace the old scorer with the new one once classification is done 129 // but this would complicate the code somewhat. 130 CancelPendingClassification(NEW_PHISHING_SCORER); 131 } 132 classifier_->set_phishing_scorer(scorer); 133 // Start classifying the current page if all conditions are met. 134 // See MaybeStartClassification() for details. 135 MaybeStartClassification(); 136 } 137 138 void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) { 139 last_url_received_from_browser_ = StripRef(url); 140 // Start classifying the current page if all conditions are met. 141 // See MaybeStartClassification() for details. 142 MaybeStartClassification(); 143 } 144 145 void PhishingClassifierDelegate::DidCommitProvisionalLoad( 146 blink::WebLocalFrame* frame, bool is_new_navigation) { 147 // A new page is starting to load, so cancel classificaiton. 148 // 149 // TODO(bryner): We shouldn't need to cancel classification if the navigation 150 // is within the same page. However, if we let classification continue in 151 // this case, we need to properly deal with the fact that PageCaptured will 152 // be called again for the in-page navigation. We need to be sure not to 153 // swap out the page text while the term feature extractor is still running. 154 DocumentState* document_state = DocumentState::FromDataSource( 155 frame->dataSource()); 156 NavigationState* navigation_state = document_state->navigation_state(); 157 CancelPendingClassification(navigation_state->was_within_same_page() ? 158 NAVIGATE_WITHIN_PAGE : NAVIGATE_AWAY); 159 if (frame == render_view()->GetWebView()->mainFrame()) { 160 last_main_frame_transition_ = navigation_state->transition_type(); 161 } 162 } 163 164 void PhishingClassifierDelegate::PageCaptured(base::string16* page_text, 165 bool preliminary_capture) { 166 if (preliminary_capture) { 167 return; 168 } 169 // Make sure there's no classification in progress. We don't want to swap 170 // out the page text string from underneath the term feature extractor. 171 // 172 // Note: Currently, if the url hasn't changed, we won't restart 173 // classification in this case. We may want to adjust this. 174 CancelPendingClassification(PAGE_RECAPTURED); 175 last_finished_load_url_ = GetToplevelUrl(); 176 classifier_page_text_.swap(*page_text); 177 have_page_text_ = true; 178 MaybeStartClassification(); 179 } 180 181 void PhishingClassifierDelegate::CancelPendingClassification( 182 CancelClassificationReason reason) { 183 if (is_classifying_) { 184 UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason", 185 reason, 186 CANCEL_CLASSIFICATION_MAX); 187 is_classifying_ = false; 188 } 189 if (classifier_->is_ready()) { 190 classifier_->CancelPendingClassification(); 191 } 192 classifier_page_text_.clear(); 193 have_page_text_ = false; 194 } 195 196 bool PhishingClassifierDelegate::OnMessageReceived( 197 const IPC::Message& message) { 198 bool handled = true; 199 IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message) 200 IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection, 201 OnStartPhishingDetection) 202 IPC_MESSAGE_UNHANDLED(handled = false) 203 IPC_END_MESSAGE_MAP() 204 return handled; 205 } 206 207 void PhishingClassifierDelegate::ClassificationDone( 208 const ClientPhishingRequest& verdict) { 209 // We no longer need the page text. 210 classifier_page_text_.clear(); 211 VLOG(2) << "Phishy verdict = " << verdict.is_phishing() 212 << " score = " << verdict.client_score(); 213 if (verdict.client_score() != PhishingClassifier::kInvalidScore) { 214 DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url()); 215 RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone( 216 routing_id(), verdict.SerializeAsString())); 217 } 218 } 219 220 GURL PhishingClassifierDelegate::GetToplevelUrl() { 221 return render_view()->GetWebView()->mainFrame()->document().url(); 222 } 223 224 void PhishingClassifierDelegate::MaybeStartClassification() { 225 // We can begin phishing classification when the following conditions are 226 // met: 227 // 1. A Scorer has been created 228 // 2. The browser has sent a StartPhishingDetection message for the current 229 // toplevel URL. 230 // 3. The page has finished loading and the page text has been extracted. 231 // 4. The load is a new navigation (not a session history navigation). 232 // 5. The toplevel URL has not already been classified. 233 // 234 // Note that if we determine that this particular navigation should not be 235 // classified at all (as opposed to deferring it until we get an IPC or the 236 // load completes), we discard the page text since it won't be needed. 237 if (!classifier_->is_ready()) { 238 VLOG(2) << "Not starting classification, no Scorer created."; 239 // Keep classifier_page_text_, in case a Scorer is set later. 240 return; 241 } 242 243 if (last_main_frame_transition_ & ui::PAGE_TRANSITION_FORWARD_BACK) { 244 // Skip loads from session history navigation. However, update the 245 // last URL sent to the classifier, so that we'll properly detect 246 // in-page navigations. 247 VLOG(2) << "Not starting classification for back/forward navigation"; 248 last_url_sent_to_classifier_ = last_finished_load_url_; 249 classifier_page_text_.clear(); // we won't need this. 250 have_page_text_ = false; 251 return; 252 } 253 254 GURL stripped_last_load_url(StripRef(last_finished_load_url_)); 255 if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) { 256 // We've already classified this toplevel URL, so this was likely an 257 // in-page navigation or a subframe navigation. The browser should not 258 // send a StartPhishingDetection IPC in this case. 259 VLOG(2) << "Toplevel URL is unchanged, not starting classification."; 260 classifier_page_text_.clear(); // we won't need this. 261 have_page_text_ = false; 262 return; 263 } 264 265 if (!have_page_text_) { 266 VLOG(2) << "Not starting classification, there is no page text ready."; 267 return; 268 } 269 270 if (last_url_received_from_browser_ != stripped_last_load_url) { 271 // The browser has not yet confirmed that this URL should be classified, 272 // so defer classification for now. Note: the ref does not affect 273 // any of the browser's preclassification checks, so we don't require it 274 // to match. 275 VLOG(2) << "Not starting classification, last url from browser is " 276 << last_url_received_from_browser_ << ", last finished load is " 277 << last_finished_load_url_; 278 // Keep classifier_page_text_, in case the browser notifies us later that 279 // we should classify the URL. 280 return; 281 } 282 283 VLOG(2) << "Starting classification for " << last_finished_load_url_; 284 last_url_sent_to_classifier_ = last_finished_load_url_; 285 is_classifying_ = true; 286 classifier_->BeginClassification( 287 &classifier_page_text_, 288 base::Bind(&PhishingClassifierDelegate::ClassificationDone, 289 base::Unretained(this))); 290 } 291 292 } // namespace safe_browsing 293