// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/containers/hash_tables.h"
#include "base/logging.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "content/public/renderer/render_view.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "third_party/WebKit/public/platform/WebString.h"
#include "third_party/WebKit/public/web/WebElement.h"
#include "third_party/WebKit/public/web/WebElementCollection.h"
#include "third_party/WebKit/public/web/WebLocalFrame.h"
#include "third_party/WebKit/public/web/WebView.h"

namespace safe_browsing {

// Maximum time, in milliseconds, that a single chunk of extraction work may
// run before the remaining work is re-posted as a new task.  This time should
// be short enough that it doesn't noticeably disrupt the user's interaction
// with the page.
const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;

// Number of DOM elements processed between consecutive clock checks.
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;

// Maximum total time, in milliseconds, allowed for one extraction; once it is
// exceeded the extraction is reported as failed.  This should be longer than
// we expect feature extraction to take on any actual phishing page.
const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;

// Intermediate state used for computing features.  See features.h for
// descriptions of the DOM features that are computed.
42 struct PhishingDOMFeatureExtractor::PageFeatureState { 43 // Link related features 44 int external_links; 45 base::hash_set<std::string> external_domains; 46 int secure_links; 47 int total_links; 48 49 // Form related features 50 int num_forms; 51 int num_text_inputs; 52 int num_pswd_inputs; 53 int num_radio_inputs; 54 int num_check_inputs; 55 int action_other_domain; 56 int total_actions; 57 58 // Image related features 59 int img_other_domain; 60 int total_imgs; 61 62 // How many script tags 63 int num_script_tags; 64 65 // The time at which we started feature extraction for the current page. 66 base::TimeTicks start_time; 67 68 // The number of iterations we've done for the current extraction. 69 int num_iterations; 70 71 explicit PageFeatureState(base::TimeTicks start_time_ticks) 72 : external_links(0), 73 secure_links(0), 74 total_links(0), 75 num_forms(0), 76 num_text_inputs(0), 77 num_pswd_inputs(0), 78 num_radio_inputs(0), 79 num_check_inputs(0), 80 action_other_domain(0), 81 total_actions(0), 82 img_other_domain(0), 83 total_imgs(0), 84 num_script_tags(0), 85 start_time(start_time_ticks), 86 num_iterations(0) {} 87 88 ~PageFeatureState() {} 89 }; 90 91 // Per-frame state 92 struct PhishingDOMFeatureExtractor::FrameData { 93 // This is our reference to document.all, which is an iterator over all 94 // of the elements in the document. It keeps track of our current position. 95 blink::WebElementCollection elements; 96 // The domain of the document URL, stored here so that we don't need to 97 // recompute it every time it's needed. 98 std::string domain; 99 }; 100 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( 102 content::RenderView* render_view, 103 FeatureExtractorClock* clock) 104 : render_view_(render_view), 105 clock_(clock), 106 weak_factory_(this) { 107 Clear(); 108 } 109 110 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { 111 // The RenderView should have called CancelPendingExtraction() before 112 // we are destroyed. 
113 CheckNoPendingExtraction(); 114 } 115 116 void PhishingDOMFeatureExtractor::ExtractFeatures( 117 FeatureMap* features, 118 const DoneCallback& done_callback) { 119 // The RenderView should have called CancelPendingExtraction() before 120 // starting a new extraction, so DCHECK this. 121 CheckNoPendingExtraction(); 122 // However, in an opt build, we will go ahead and clean up the pending 123 // extraction so that we can start in a known state. 124 CancelPendingExtraction(); 125 126 features_ = features; 127 done_callback_ = done_callback; 128 129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); 130 blink::WebView* web_view = render_view_->GetWebView(); 131 if (web_view && web_view->mainFrame()) { 132 cur_document_ = web_view->mainFrame()->document(); 133 } 134 135 base::MessageLoop::current()->PostTask( 136 FROM_HERE, 137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 138 weak_factory_.GetWeakPtr())); 139 } 140 141 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { 142 // Cancel any pending callbacks, and clear our state. 143 weak_factory_.InvalidateWeakPtrs(); 144 Clear(); 145 } 146 147 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { 148 DCHECK(page_feature_state_.get()); 149 ++page_feature_state_->num_iterations; 150 base::TimeTicks current_chunk_start_time = clock_->Now(); 151 152 if (cur_document_.isNull()) { 153 // This will only happen if we weren't able to get the document for the 154 // main frame. We'll treat this as an extraction failure. 155 RunCallback(false); 156 return; 157 } 158 159 int num_elements = 0; 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { 161 blink::WebElement cur_element; 162 if (cur_frame_data_.get()) { 163 // We're resuming traversal of a frame, so just advance to the next 164 // element. 
165 cur_element = cur_frame_data_->elements.nextItem(); 166 // When we resume the traversal, the first call to nextItem() potentially 167 // has to walk through the document again from the beginning, if it was 168 // modified between our chunks of work. Log how long this takes, so we 169 // can tell if it's too slow. 170 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", 171 clock_->Now() - current_chunk_start_time); 172 } else { 173 // We just moved to a new frame, so update our frame state 174 // and advance to the first element. 175 ResetFrameData(); 176 cur_element = cur_frame_data_->elements.firstItem(); 177 } 178 179 for (; !cur_element.isNull(); 180 cur_element = cur_frame_data_->elements.nextItem()) { 181 if (cur_element.hasHTMLTagName("a")) { 182 HandleLink(cur_element); 183 } else if (cur_element.hasHTMLTagName("form")) { 184 HandleForm(cur_element); 185 } else if (cur_element.hasHTMLTagName("img")) { 186 HandleImage(cur_element); 187 } else if (cur_element.hasHTMLTagName("input")) { 188 HandleInput(cur_element); 189 } else if (cur_element.hasHTMLTagName("script")) { 190 HandleScript(cur_element); 191 } 192 193 if (++num_elements >= kClockCheckGranularity) { 194 num_elements = 0; 195 base::TimeTicks now = clock_->Now(); 196 if (now - page_feature_state_->start_time >= 197 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 198 DLOG(ERROR) << "Feature extraction took too long, giving up"; 199 // We expect this to happen infrequently, so record when it does. 200 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); 201 RunCallback(false); 202 return; 203 } 204 base::TimeDelta chunk_elapsed = now - current_chunk_start_time; 205 if (chunk_elapsed >= 206 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { 207 // The time limit for the current chunk is up, so post a task to 208 // continue extraction. 209 // 210 // Record how much time we actually spent on the chunk. 
If this is 211 // much higher than kMaxTimePerChunkMs, we may need to adjust the 212 // clock granularity. 213 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", 214 chunk_elapsed); 215 base::MessageLoop::current()->PostTask( 216 FROM_HERE, 217 base::Bind( 218 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 219 weak_factory_.GetWeakPtr())); 220 return; 221 } 222 // Otherwise, continue. 223 } 224 } 225 226 // We're done with this frame, recalculate the FrameData when we 227 // advance to the next frame. 228 cur_frame_data_.reset(); 229 } 230 231 InsertFeatures(); 232 RunCallback(true); 233 } 234 235 void PhishingDOMFeatureExtractor::HandleLink( 236 const blink::WebElement& element) { 237 // Count the number of times we link to a different host. 238 if (!element.hasAttribute("href")) { 239 DVLOG(1) << "Skipping anchor tag with no href"; 240 return; 241 } 242 243 // Retrieve the link and resolve the link in case it's relative. 244 blink::WebURL full_url = element.document().completeURL( 245 element.getAttribute("href")); 246 247 std::string domain; 248 bool is_external = IsExternalDomain(full_url, &domain); 249 if (domain.empty()) { 250 DVLOG(1) << "Could not extract domain from link: " << full_url; 251 return; 252 } 253 254 if (is_external) { 255 ++page_feature_state_->external_links; 256 257 // Record each unique domain that we link to. 258 page_feature_state_->external_domains.insert(domain); 259 } 260 261 // Check how many are https links. 262 if (GURL(full_url).SchemeIs("https")) { 263 ++page_feature_state_->secure_links; 264 } 265 266 ++page_feature_state_->total_links; 267 } 268 269 void PhishingDOMFeatureExtractor::HandleForm( 270 const blink::WebElement& element) { 271 // Increment the number of forms on this page. 272 ++page_feature_state_->num_forms; 273 274 // Record whether the action points to a different domain. 
275 if (!element.hasAttribute("action")) { 276 return; 277 } 278 279 blink::WebURL full_url = element.document().completeURL( 280 element.getAttribute("action")); 281 282 std::string domain; 283 bool is_external = IsExternalDomain(full_url, &domain); 284 if (domain.empty()) { 285 DVLOG(1) << "Could not extract domain from form action: " << full_url; 286 return; 287 } 288 289 if (is_external) { 290 ++page_feature_state_->action_other_domain; 291 } 292 ++page_feature_state_->total_actions; 293 } 294 295 void PhishingDOMFeatureExtractor::HandleImage( 296 const blink::WebElement& element) { 297 if (!element.hasAttribute("src")) { 298 DVLOG(1) << "Skipping img tag with no src"; 299 } 300 301 // Record whether the image points to a different domain. 302 blink::WebURL full_url = element.document().completeURL( 303 element.getAttribute("src")); 304 std::string domain; 305 bool is_external = IsExternalDomain(full_url, &domain); 306 if (domain.empty()) { 307 DVLOG(1) << "Could not extract domain from image src: " << full_url; 308 return; 309 } 310 311 if (is_external) { 312 ++page_feature_state_->img_other_domain; 313 } 314 ++page_feature_state_->total_imgs; 315 } 316 317 void PhishingDOMFeatureExtractor::HandleInput( 318 const blink::WebElement& element) { 319 // The HTML spec says that if the type is unspecified, it defaults to text. 320 // In addition, any unrecognized type will be treated as a text input. 321 // 322 // Note that we use the attribute value rather than 323 // WebFormControlElement::formControlType() for consistency with the 324 // way the phishing classification model is created. 
325 std::string type = element.getAttribute("type").utf8(); 326 base::StringToLowerASCII(&type); 327 if (type == "password") { 328 ++page_feature_state_->num_pswd_inputs; 329 } else if (type == "radio") { 330 ++page_feature_state_->num_radio_inputs; 331 } else if (type == "checkbox") { 332 ++page_feature_state_->num_check_inputs; 333 } else if (type != "submit" && type != "reset" && type != "file" && 334 type != "hidden" && type != "image" && type != "button") { 335 // Note that there are a number of new input types in HTML5 that are not 336 // handled above. For now, we will consider these as text inputs since 337 // they could be used to capture user input. 338 ++page_feature_state_->num_text_inputs; 339 } 340 } 341 342 void PhishingDOMFeatureExtractor::HandleScript( 343 const blink::WebElement& element) { 344 ++page_feature_state_->num_script_tags; 345 } 346 347 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { 348 DCHECK(done_callback_.is_null()); 349 DCHECK(!cur_frame_data_.get()); 350 DCHECK(cur_document_.isNull()); 351 if (!done_callback_.is_null() || cur_frame_data_.get() || 352 !cur_document_.isNull()) { 353 LOG(ERROR) << "Extraction in progress, missing call to " 354 << "CancelPendingExtraction"; 355 } 356 } 357 358 void PhishingDOMFeatureExtractor::RunCallback(bool success) { 359 // Record some timing stats that we can use to evaluate feature extraction 360 // performance. These include both successful and failed extractions. 
361 DCHECK(page_feature_state_.get()); 362 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", 363 page_feature_state_->num_iterations); 364 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", 365 clock_->Now() - page_feature_state_->start_time); 366 367 DCHECK(!done_callback_.is_null()); 368 done_callback_.Run(success); 369 Clear(); 370 } 371 372 void PhishingDOMFeatureExtractor::Clear() { 373 features_ = NULL; 374 done_callback_.Reset(); 375 cur_frame_data_.reset(NULL); 376 cur_document_.reset(); 377 } 378 379 void PhishingDOMFeatureExtractor::ResetFrameData() { 380 DCHECK(!cur_document_.isNull()); 381 DCHECK(!cur_frame_data_.get()); 382 383 cur_frame_data_.reset(new FrameData()); 384 cur_frame_data_->elements = cur_document_.all(); 385 cur_frame_data_->domain = 386 net::registry_controlled_domains::GetDomainAndRegistry( 387 cur_document_.url(), 388 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 389 } 390 391 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { 392 DCHECK(!cur_document_.isNull()); 393 blink::WebFrame* frame = cur_document_.frame(); 394 // Advance to the next frame that contains a document, with no wrapping. 395 if (frame) { 396 for (frame = frame->traverseNext(false); frame; 397 frame = frame->traverseNext(false)) { 398 if (!frame->document().isNull()) { 399 return frame->document(); 400 } 401 } 402 } else { 403 // Keep track of how often frame traversal got "stuck" due to the 404 // current subdocument getting removed from the frame tree. 405 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); 406 } 407 return blink::WebDocument(); 408 } 409 410 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, 411 std::string* domain) const { 412 DCHECK(domain); 413 DCHECK(cur_frame_data_.get()); 414 415 if (cur_frame_data_->domain.empty()) { 416 return false; 417 } 418 419 // TODO(bryner): Ensure that the url encoding is consistent with the features 420 // in the model. 
421 if (url.HostIsIPAddress()) { 422 domain->assign(url.host()); 423 } else { 424 domain->assign(net::registry_controlled_domains::GetDomainAndRegistry( 425 url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES)); 426 } 427 428 return !domain->empty() && *domain != cur_frame_data_->domain; 429 } 430 431 void PhishingDOMFeatureExtractor::InsertFeatures() { 432 DCHECK(page_feature_state_.get()); 433 434 if (page_feature_state_->total_links > 0) { 435 // Add a feature for the fraction of times the page links to an external 436 // domain vs. an internal domain. 437 double link_freq = static_cast<double>( 438 page_feature_state_->external_links) / 439 page_feature_state_->total_links; 440 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); 441 442 // Add a feature for each unique domain that we're linking to 443 for (base::hash_set<std::string>::iterator it = 444 page_feature_state_->external_domains.begin(); 445 it != page_feature_state_->external_domains.end(); ++it) { 446 features_->AddBooleanFeature(features::kPageLinkDomain + *it); 447 } 448 449 // Fraction of links that use https. 450 double secure_freq = static_cast<double>( 451 page_feature_state_->secure_links) / page_feature_state_->total_links; 452 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); 453 } 454 455 // Record whether forms appear and whether various form elements appear. 
456 if (page_feature_state_->num_forms > 0) { 457 features_->AddBooleanFeature(features::kPageHasForms); 458 } 459 if (page_feature_state_->num_text_inputs > 0) { 460 features_->AddBooleanFeature(features::kPageHasTextInputs); 461 } 462 if (page_feature_state_->num_pswd_inputs > 0) { 463 features_->AddBooleanFeature(features::kPageHasPswdInputs); 464 } 465 if (page_feature_state_->num_radio_inputs > 0) { 466 features_->AddBooleanFeature(features::kPageHasRadioInputs); 467 } 468 if (page_feature_state_->num_check_inputs > 0) { 469 features_->AddBooleanFeature(features::kPageHasCheckInputs); 470 } 471 472 // Record fraction of form actions that point to a different domain. 473 if (page_feature_state_->total_actions > 0) { 474 double action_freq = static_cast<double>( 475 page_feature_state_->action_other_domain) / 476 page_feature_state_->total_actions; 477 features_->AddRealFeature(features::kPageActionOtherDomainFreq, 478 action_freq); 479 } 480 481 // Record how many image src attributes point to a different domain. 482 if (page_feature_state_->total_imgs > 0) { 483 double img_freq = static_cast<double>( 484 page_feature_state_->img_other_domain) / 485 page_feature_state_->total_imgs; 486 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); 487 } 488 489 // Record number of script tags (discretized for numerical stability.) 490 if (page_feature_state_->num_script_tags > 1) { 491 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); 492 if (page_feature_state_->num_script_tags > 6) { 493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); 494 } 495 } 496 } 497 498 } // namespace safe_browsing 499