// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/containers/hash_tables.h"
#include "base/logging.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "content/public/renderer/render_view.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "third_party/WebKit/public/platform/WebString.h"
#include "third_party/WebKit/public/web/WebElement.h"
#include "third_party/WebKit/public/web/WebFrame.h"
#include "third_party/WebKit/public/web/WebNodeCollection.h"
#include "third_party/WebKit/public/web/WebView.h"

namespace safe_browsing {

// Maximum time, in milliseconds, that ExtractFeaturesWithTimeout() spends on
// one chunk of work before it posts a task to continue and returns.
// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;

// Number of elements ExtractFeaturesWithTimeout() processes between
// consecutive reads of the clock.
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;

// Maximum total time, in milliseconds, measured from the start of an
// extraction, after which ExtractFeaturesWithTimeout() gives up and reports
// failure to the done callback.
// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;

// Intermediate state used for computing features. See features.h for
// descriptions of the DOM features that are computed.
42 struct PhishingDOMFeatureExtractor::PageFeatureState { 43 // Link related features 44 int external_links; 45 base::hash_set<std::string> external_domains; 46 int secure_links; 47 int total_links; 48 49 // Form related features 50 int num_forms; 51 int num_text_inputs; 52 int num_pswd_inputs; 53 int num_radio_inputs; 54 int num_check_inputs; 55 int action_other_domain; 56 int total_actions; 57 58 // Image related features 59 int img_other_domain; 60 int total_imgs; 61 62 // How many script tags 63 int num_script_tags; 64 65 // The time at which we started feature extraction for the current page. 66 base::TimeTicks start_time; 67 68 // The number of iterations we've done for the current extraction. 69 int num_iterations; 70 71 explicit PageFeatureState(base::TimeTicks start_time_ticks) 72 : external_links(0), 73 secure_links(0), 74 total_links(0), 75 num_forms(0), 76 num_text_inputs(0), 77 num_pswd_inputs(0), 78 num_radio_inputs(0), 79 num_check_inputs(0), 80 action_other_domain(0), 81 total_actions(0), 82 img_other_domain(0), 83 total_imgs(0), 84 num_script_tags(0), 85 start_time(start_time_ticks), 86 num_iterations(0) {} 87 88 ~PageFeatureState() {} 89 }; 90 91 // Per-frame state 92 struct PhishingDOMFeatureExtractor::FrameData { 93 // This is our reference to document.all, which is an iterator over all 94 // of the elements in the document. It keeps track of our current position. 95 WebKit::WebNodeCollection elements; 96 // The domain of the document URL, stored here so that we don't need to 97 // recompute it every time it's needed. 98 std::string domain; 99 }; 100 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( 102 content::RenderView* render_view, 103 FeatureExtractorClock* clock) 104 : render_view_(render_view), 105 clock_(clock), 106 weak_factory_(this) { 107 Clear(); 108 } 109 110 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { 111 // The RenderView should have called CancelPendingExtraction() before 112 // we are destroyed. 
113 CheckNoPendingExtraction(); 114 } 115 116 void PhishingDOMFeatureExtractor::ExtractFeatures( 117 FeatureMap* features, 118 const DoneCallback& done_callback) { 119 // The RenderView should have called CancelPendingExtraction() before 120 // starting a new extraction, so DCHECK this. 121 CheckNoPendingExtraction(); 122 // However, in an opt build, we will go ahead and clean up the pending 123 // extraction so that we can start in a known state. 124 CancelPendingExtraction(); 125 126 features_ = features; 127 done_callback_ = done_callback; 128 129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); 130 WebKit::WebView* web_view = render_view_->GetWebView(); 131 if (web_view && web_view->mainFrame()) { 132 cur_document_ = web_view->mainFrame()->document(); 133 } 134 135 base::MessageLoop::current()->PostTask( 136 FROM_HERE, 137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 138 weak_factory_.GetWeakPtr())); 139 } 140 141 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { 142 // Cancel any pending callbacks, and clear our state. 143 weak_factory_.InvalidateWeakPtrs(); 144 Clear(); 145 } 146 147 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { 148 DCHECK(page_feature_state_.get()); 149 ++page_feature_state_->num_iterations; 150 base::TimeTicks current_chunk_start_time = clock_->Now(); 151 152 if (cur_document_.isNull()) { 153 // This will only happen if we weren't able to get the document for the 154 // main frame. We'll treat this as an extraction failure. 155 RunCallback(false); 156 return; 157 } 158 159 int num_elements = 0; 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { 161 WebKit::WebNode cur_node; 162 if (cur_frame_data_.get()) { 163 // We're resuming traversal of a frame, so just advance to the next node. 
164 cur_node = cur_frame_data_->elements.nextItem(); 165 // When we resume the traversal, the first call to nextItem() potentially 166 // has to walk through the document again from the beginning, if it was 167 // modified between our chunks of work. Log how long this takes, so we 168 // can tell if it's too slow. 169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", 170 clock_->Now() - current_chunk_start_time); 171 } else { 172 // We just moved to a new frame, so update our frame state 173 // and advance to the first element. 174 ResetFrameData(); 175 cur_node = cur_frame_data_->elements.firstItem(); 176 } 177 178 for (; !cur_node.isNull(); 179 cur_node = cur_frame_data_->elements.nextItem()) { 180 if (!cur_node.isElementNode()) { 181 continue; 182 } 183 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); 184 if (element.hasTagName("a")) { 185 HandleLink(element); 186 } else if (element.hasTagName("form")) { 187 HandleForm(element); 188 } else if (element.hasTagName("img")) { 189 HandleImage(element); 190 } else if (element.hasTagName("input")) { 191 HandleInput(element); 192 } else if (element.hasTagName("script")) { 193 HandleScript(element); 194 } 195 196 if (++num_elements >= kClockCheckGranularity) { 197 num_elements = 0; 198 base::TimeTicks now = clock_->Now(); 199 if (now - page_feature_state_->start_time >= 200 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 201 DLOG(ERROR) << "Feature extraction took too long, giving up"; 202 // We expect this to happen infrequently, so record when it does. 203 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); 204 RunCallback(false); 205 return; 206 } 207 base::TimeDelta chunk_elapsed = now - current_chunk_start_time; 208 if (chunk_elapsed >= 209 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { 210 // The time limit for the current chunk is up, so post a task to 211 // continue extraction. 212 // 213 // Record how much time we actually spent on the chunk. 
If this is 214 // much higher than kMaxTimePerChunkMs, we may need to adjust the 215 // clock granularity. 216 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", 217 chunk_elapsed); 218 base::MessageLoop::current()->PostTask( 219 FROM_HERE, 220 base::Bind( 221 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 222 weak_factory_.GetWeakPtr())); 223 return; 224 } 225 // Otherwise, continue. 226 } 227 } 228 229 // We're done with this frame, recalculate the FrameData when we 230 // advance to the next frame. 231 cur_frame_data_.reset(); 232 } 233 234 InsertFeatures(); 235 RunCallback(true); 236 } 237 238 void PhishingDOMFeatureExtractor::HandleLink( 239 const WebKit::WebElement& element) { 240 // Count the number of times we link to a different host. 241 if (!element.hasAttribute("href")) { 242 DVLOG(1) << "Skipping anchor tag with no href"; 243 return; 244 } 245 246 // Retrieve the link and resolve the link in case it's relative. 247 WebKit::WebURL full_url = element.document().completeURL( 248 element.getAttribute("href")); 249 250 std::string domain; 251 bool is_external = IsExternalDomain(full_url, &domain); 252 if (domain.empty()) { 253 DVLOG(1) << "Could not extract domain from link: " << full_url; 254 return; 255 } 256 257 if (is_external) { 258 ++page_feature_state_->external_links; 259 260 // Record each unique domain that we link to. 261 page_feature_state_->external_domains.insert(domain); 262 } 263 264 // Check how many are https links. 265 if (GURL(full_url).SchemeIs("https")) { 266 ++page_feature_state_->secure_links; 267 } 268 269 ++page_feature_state_->total_links; 270 } 271 272 void PhishingDOMFeatureExtractor::HandleForm( 273 const WebKit::WebElement& element) { 274 // Increment the number of forms on this page. 275 ++page_feature_state_->num_forms; 276 277 // Record whether the action points to a different domain. 
278 if (!element.hasAttribute("action")) { 279 return; 280 } 281 282 WebKit::WebURL full_url = element.document().completeURL( 283 element.getAttribute("action")); 284 285 std::string domain; 286 bool is_external = IsExternalDomain(full_url, &domain); 287 if (domain.empty()) { 288 DVLOG(1) << "Could not extract domain from form action: " << full_url; 289 return; 290 } 291 292 if (is_external) { 293 ++page_feature_state_->action_other_domain; 294 } 295 ++page_feature_state_->total_actions; 296 } 297 298 void PhishingDOMFeatureExtractor::HandleImage( 299 const WebKit::WebElement& element) { 300 if (!element.hasAttribute("src")) { 301 DVLOG(1) << "Skipping img tag with no src"; 302 } 303 304 // Record whether the image points to a different domain. 305 WebKit::WebURL full_url = element.document().completeURL( 306 element.getAttribute("src")); 307 std::string domain; 308 bool is_external = IsExternalDomain(full_url, &domain); 309 if (domain.empty()) { 310 DVLOG(1) << "Could not extract domain from image src: " << full_url; 311 return; 312 } 313 314 if (is_external) { 315 ++page_feature_state_->img_other_domain; 316 } 317 ++page_feature_state_->total_imgs; 318 } 319 320 void PhishingDOMFeatureExtractor::HandleInput( 321 const WebKit::WebElement& element) { 322 // The HTML spec says that if the type is unspecified, it defaults to text. 323 // In addition, any unrecognized type will be treated as a text input. 324 // 325 // Note that we use the attribute value rather than 326 // WebFormControlElement::formControlType() for consistency with the 327 // way the phishing classification model is created. 
328 std::string type = element.getAttribute("type").utf8(); 329 StringToLowerASCII(&type); 330 if (type == "password") { 331 ++page_feature_state_->num_pswd_inputs; 332 } else if (type == "radio") { 333 ++page_feature_state_->num_radio_inputs; 334 } else if (type == "checkbox") { 335 ++page_feature_state_->num_check_inputs; 336 } else if (type != "submit" && type != "reset" && type != "file" && 337 type != "hidden" && type != "image" && type != "button") { 338 // Note that there are a number of new input types in HTML5 that are not 339 // handled above. For now, we will consider these as text inputs since 340 // they could be used to capture user input. 341 ++page_feature_state_->num_text_inputs; 342 } 343 } 344 345 void PhishingDOMFeatureExtractor::HandleScript( 346 const WebKit::WebElement& element) { 347 ++page_feature_state_->num_script_tags; 348 } 349 350 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { 351 DCHECK(done_callback_.is_null()); 352 DCHECK(!cur_frame_data_.get()); 353 DCHECK(cur_document_.isNull()); 354 if (!done_callback_.is_null() || cur_frame_data_.get() || 355 !cur_document_.isNull()) { 356 LOG(ERROR) << "Extraction in progress, missing call to " 357 << "CancelPendingExtraction"; 358 } 359 } 360 361 void PhishingDOMFeatureExtractor::RunCallback(bool success) { 362 // Record some timing stats that we can use to evaluate feature extraction 363 // performance. These include both successful and failed extractions. 
364 DCHECK(page_feature_state_.get()); 365 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", 366 page_feature_state_->num_iterations); 367 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", 368 clock_->Now() - page_feature_state_->start_time); 369 370 DCHECK(!done_callback_.is_null()); 371 done_callback_.Run(success); 372 Clear(); 373 } 374 375 void PhishingDOMFeatureExtractor::Clear() { 376 features_ = NULL; 377 done_callback_.Reset(); 378 cur_frame_data_.reset(NULL); 379 cur_document_.reset(); 380 } 381 382 void PhishingDOMFeatureExtractor::ResetFrameData() { 383 DCHECK(!cur_document_.isNull()); 384 DCHECK(!cur_frame_data_.get()); 385 386 cur_frame_data_.reset(new FrameData()); 387 cur_frame_data_->elements = cur_document_.all(); 388 cur_frame_data_->domain = 389 net::registry_controlled_domains::GetDomainAndRegistry( 390 cur_document_.url(), 391 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 392 } 393 394 WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { 395 DCHECK(!cur_document_.isNull()); 396 WebKit::WebFrame* frame = cur_document_.frame(); 397 // Advance to the next frame that contains a document, with no wrapping. 398 if (frame) { 399 while ((frame = frame->traverseNext(false))) { 400 if (!frame->document().isNull()) { 401 return frame->document(); 402 } 403 } 404 } else { 405 // Keep track of how often frame traversal got "stuck" due to the 406 // current subdocument getting removed from the frame tree. 407 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); 408 } 409 return WebKit::WebDocument(); 410 } 411 412 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, 413 std::string* domain) const { 414 DCHECK(domain); 415 DCHECK(cur_frame_data_.get()); 416 417 if (cur_frame_data_->domain.empty()) { 418 return false; 419 } 420 421 // TODO(bryner): Ensure that the url encoding is consistent with the features 422 // in the model. 
423 if (url.HostIsIPAddress()) { 424 domain->assign(url.host()); 425 } else { 426 domain->assign(net::registry_controlled_domains::GetDomainAndRegistry( 427 url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES)); 428 } 429 430 return !domain->empty() && *domain != cur_frame_data_->domain; 431 } 432 433 void PhishingDOMFeatureExtractor::InsertFeatures() { 434 DCHECK(page_feature_state_.get()); 435 436 if (page_feature_state_->total_links > 0) { 437 // Add a feature for the fraction of times the page links to an external 438 // domain vs. an internal domain. 439 double link_freq = static_cast<double>( 440 page_feature_state_->external_links) / 441 page_feature_state_->total_links; 442 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); 443 444 // Add a feature for each unique domain that we're linking to 445 for (base::hash_set<std::string>::iterator it = 446 page_feature_state_->external_domains.begin(); 447 it != page_feature_state_->external_domains.end(); ++it) { 448 features_->AddBooleanFeature(features::kPageLinkDomain + *it); 449 } 450 451 // Fraction of links that use https. 452 double secure_freq = static_cast<double>( 453 page_feature_state_->secure_links) / page_feature_state_->total_links; 454 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); 455 } 456 457 // Record whether forms appear and whether various form elements appear. 
458 if (page_feature_state_->num_forms > 0) { 459 features_->AddBooleanFeature(features::kPageHasForms); 460 } 461 if (page_feature_state_->num_text_inputs > 0) { 462 features_->AddBooleanFeature(features::kPageHasTextInputs); 463 } 464 if (page_feature_state_->num_pswd_inputs > 0) { 465 features_->AddBooleanFeature(features::kPageHasPswdInputs); 466 } 467 if (page_feature_state_->num_radio_inputs > 0) { 468 features_->AddBooleanFeature(features::kPageHasRadioInputs); 469 } 470 if (page_feature_state_->num_check_inputs > 0) { 471 features_->AddBooleanFeature(features::kPageHasCheckInputs); 472 } 473 474 // Record fraction of form actions that point to a different domain. 475 if (page_feature_state_->total_actions > 0) { 476 double action_freq = static_cast<double>( 477 page_feature_state_->action_other_domain) / 478 page_feature_state_->total_actions; 479 features_->AddRealFeature(features::kPageActionOtherDomainFreq, 480 action_freq); 481 } 482 483 // Record how many image src attributes point to a different domain. 484 if (page_feature_state_->total_imgs > 0) { 485 double img_freq = static_cast<double>( 486 page_feature_state_->img_other_domain) / 487 page_feature_state_->total_imgs; 488 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); 489 } 490 491 // Record number of script tags (discretized for numerical stability.) 492 if (page_feature_state_->num_script_tags > 1) { 493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); 494 if (page_feature_state_->num_script_tags > 6) { 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); 496 } 497 } 498 } 499 500 } // namespace safe_browsing 501