Home | History | Annotate | Download | only in safe_browsing
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
      6 
      7 #include "base/bind.h"
      8 #include "base/compiler_specific.h"
      9 #include "base/containers/hash_tables.h"
     10 #include "base/logging.h"
     11 #include "base/message_loop/message_loop.h"
     12 #include "base/metrics/histogram.h"
     13 #include "base/strings/string_util.h"
     14 #include "base/time/time.h"
     15 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
     16 #include "chrome/renderer/safe_browsing/features.h"
     17 #include "content/public/renderer/render_view.h"
     18 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
     19 #include "third_party/WebKit/public/platform/WebString.h"
     20 #include "third_party/WebKit/public/web/WebElement.h"
     21 #include "third_party/WebKit/public/web/WebElementCollection.h"
     22 #include "third_party/WebKit/public/web/WebLocalFrame.h"
     23 #include "third_party/WebKit/public/web/WebView.h"
     24 
     25 namespace safe_browsing {
     26 
     27 // This time should be short enough that it doesn't noticeably disrupt the
     28 // user's interaction with the page.
     29 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
     30 
     31 // Experimenting shows that we get a reasonable gain in performance by
     32 // increasing this up to around 10, but there's not much benefit in
     33 // increasing it past that.
     34 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
     35 
     36 // This should be longer than we expect feature extraction to take on any
     37 // actual phishing page.
     38 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
     39 
     40 // Intermediate state used for computing features.  See features.h for
     41 // descriptions of the DOM features that are computed.
     42 struct PhishingDOMFeatureExtractor::PageFeatureState {
     43   // Link related features
     44   int external_links;
     45   base::hash_set<std::string> external_domains;
     46   int secure_links;
     47   int total_links;
     48 
     49   // Form related features
     50   int num_forms;
     51   int num_text_inputs;
     52   int num_pswd_inputs;
     53   int num_radio_inputs;
     54   int num_check_inputs;
     55   int action_other_domain;
     56   int total_actions;
     57 
     58   // Image related features
     59   int img_other_domain;
     60   int total_imgs;
     61 
     62   // How many script tags
     63   int num_script_tags;
     64 
     65   // The time at which we started feature extraction for the current page.
     66   base::TimeTicks start_time;
     67 
     68   // The number of iterations we've done for the current extraction.
     69   int num_iterations;
     70 
     71   explicit PageFeatureState(base::TimeTicks start_time_ticks)
     72       : external_links(0),
     73         secure_links(0),
     74         total_links(0),
     75         num_forms(0),
     76         num_text_inputs(0),
     77         num_pswd_inputs(0),
     78         num_radio_inputs(0),
     79         num_check_inputs(0),
     80         action_other_domain(0),
     81         total_actions(0),
     82         img_other_domain(0),
     83         total_imgs(0),
     84         num_script_tags(0),
     85         start_time(start_time_ticks),
     86         num_iterations(0) {}
     87 
     88   ~PageFeatureState() {}
     89 };
     90 
     91 // Per-frame state
     92 struct PhishingDOMFeatureExtractor::FrameData {
     93   // This is our reference to document.all, which is an iterator over all
     94   // of the elements in the document.  It keeps track of our current position.
     95   blink::WebElementCollection elements;
     96   // The domain of the document URL, stored here so that we don't need to
     97   // recompute it every time it's needed.
     98   std::string domain;
     99 };
    100 
    101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
    102     content::RenderView* render_view,
    103     FeatureExtractorClock* clock)
    104     : render_view_(render_view),
    105       clock_(clock),
    106       weak_factory_(this) {
    107   Clear();
    108 }
    109 
    110 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
    111   // The RenderView should have called CancelPendingExtraction() before
    112   // we are destroyed.
    113   CheckNoPendingExtraction();
    114 }
    115 
    116 void PhishingDOMFeatureExtractor::ExtractFeatures(
    117     FeatureMap* features,
    118     const DoneCallback& done_callback) {
    119   // The RenderView should have called CancelPendingExtraction() before
    120   // starting a new extraction, so DCHECK this.
    121   CheckNoPendingExtraction();
    122   // However, in an opt build, we will go ahead and clean up the pending
    123   // extraction so that we can start in a known state.
    124   CancelPendingExtraction();
    125 
    126   features_ = features;
    127   done_callback_ = done_callback;
    128 
    129   page_feature_state_.reset(new PageFeatureState(clock_->Now()));
    130   blink::WebView* web_view = render_view_->GetWebView();
    131   if (web_view && web_view->mainFrame()) {
    132     cur_document_ = web_view->mainFrame()->document();
    133   }
    134 
    135   base::MessageLoop::current()->PostTask(
    136       FROM_HERE,
    137       base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
    138                  weak_factory_.GetWeakPtr()));
    139 }
    140 
    141 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
    142   // Cancel any pending callbacks, and clear our state.
    143   weak_factory_.InvalidateWeakPtrs();
    144   Clear();
    145 }
    146 
    147 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
    148   DCHECK(page_feature_state_.get());
    149   ++page_feature_state_->num_iterations;
    150   base::TimeTicks current_chunk_start_time = clock_->Now();
    151 
    152   if (cur_document_.isNull()) {
    153     // This will only happen if we weren't able to get the document for the
    154     // main frame.  We'll treat this as an extraction failure.
    155     RunCallback(false);
    156     return;
    157   }
    158 
    159   int num_elements = 0;
    160   for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
    161     blink::WebElement cur_element;
    162     if (cur_frame_data_.get()) {
    163       // We're resuming traversal of a frame, so just advance to the next
    164       // element.
    165       cur_element = cur_frame_data_->elements.nextItem();
    166       // When we resume the traversal, the first call to nextItem() potentially
    167       // has to walk through the document again from the beginning, if it was
    168       // modified between our chunks of work.  Log how long this takes, so we
    169       // can tell if it's too slow.
    170       UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
    171                           clock_->Now() - current_chunk_start_time);
    172     } else {
    173       // We just moved to a new frame, so update our frame state
    174       // and advance to the first element.
    175       ResetFrameData();
    176       cur_element = cur_frame_data_->elements.firstItem();
    177     }
    178 
    179     for (; !cur_element.isNull();
    180          cur_element = cur_frame_data_->elements.nextItem()) {
    181       if (cur_element.hasHTMLTagName("a")) {
    182         HandleLink(cur_element);
    183       } else if (cur_element.hasHTMLTagName("form")) {
    184         HandleForm(cur_element);
    185       } else if (cur_element.hasHTMLTagName("img")) {
    186         HandleImage(cur_element);
    187       } else if (cur_element.hasHTMLTagName("input")) {
    188         HandleInput(cur_element);
    189       } else if (cur_element.hasHTMLTagName("script")) {
    190         HandleScript(cur_element);
    191       }
    192 
    193       if (++num_elements >= kClockCheckGranularity) {
    194         num_elements = 0;
    195         base::TimeTicks now = clock_->Now();
    196         if (now - page_feature_state_->start_time >=
    197             base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
    198           DLOG(ERROR) << "Feature extraction took too long, giving up";
    199           // We expect this to happen infrequently, so record when it does.
    200           UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
    201           RunCallback(false);
    202           return;
    203         }
    204         base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
    205         if (chunk_elapsed >=
    206             base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
    207           // The time limit for the current chunk is up, so post a task to
    208           // continue extraction.
    209           //
    210           // Record how much time we actually spent on the chunk. If this is
    211           // much higher than kMaxTimePerChunkMs, we may need to adjust the
    212           // clock granularity.
    213           UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
    214                               chunk_elapsed);
    215           base::MessageLoop::current()->PostTask(
    216               FROM_HERE,
    217               base::Bind(
    218                   &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
    219                   weak_factory_.GetWeakPtr()));
    220           return;
    221         }
    222         // Otherwise, continue.
    223       }
    224     }
    225 
    226     // We're done with this frame, recalculate the FrameData when we
    227     // advance to the next frame.
    228     cur_frame_data_.reset();
    229   }
    230 
    231   InsertFeatures();
    232   RunCallback(true);
    233 }
    234 
    235 void PhishingDOMFeatureExtractor::HandleLink(
    236     const blink::WebElement& element) {
    237   // Count the number of times we link to a different host.
    238   if (!element.hasAttribute("href")) {
    239     DVLOG(1) << "Skipping anchor tag with no href";
    240     return;
    241   }
    242 
    243   // Retrieve the link and resolve the link in case it's relative.
    244   blink::WebURL full_url = element.document().completeURL(
    245       element.getAttribute("href"));
    246 
    247   std::string domain;
    248   bool is_external = IsExternalDomain(full_url, &domain);
    249   if (domain.empty()) {
    250     DVLOG(1) << "Could not extract domain from link: " << full_url;
    251     return;
    252   }
    253 
    254   if (is_external) {
    255     ++page_feature_state_->external_links;
    256 
    257     // Record each unique domain that we link to.
    258     page_feature_state_->external_domains.insert(domain);
    259   }
    260 
    261   // Check how many are https links.
    262   if (GURL(full_url).SchemeIs("https")) {
    263     ++page_feature_state_->secure_links;
    264   }
    265 
    266   ++page_feature_state_->total_links;
    267 }
    268 
    269 void PhishingDOMFeatureExtractor::HandleForm(
    270     const blink::WebElement& element) {
    271   // Increment the number of forms on this page.
    272   ++page_feature_state_->num_forms;
    273 
    274   // Record whether the action points to a different domain.
    275   if (!element.hasAttribute("action")) {
    276     return;
    277   }
    278 
    279   blink::WebURL full_url = element.document().completeURL(
    280       element.getAttribute("action"));
    281 
    282   std::string domain;
    283   bool is_external = IsExternalDomain(full_url, &domain);
    284   if (domain.empty()) {
    285     DVLOG(1) << "Could not extract domain from form action: " << full_url;
    286     return;
    287   }
    288 
    289   if (is_external) {
    290     ++page_feature_state_->action_other_domain;
    291   }
    292   ++page_feature_state_->total_actions;
    293 }
    294 
    295 void PhishingDOMFeatureExtractor::HandleImage(
    296     const blink::WebElement& element) {
    297   if (!element.hasAttribute("src")) {
    298     DVLOG(1) << "Skipping img tag with no src";
    299   }
    300 
    301   // Record whether the image points to a different domain.
    302   blink::WebURL full_url = element.document().completeURL(
    303       element.getAttribute("src"));
    304   std::string domain;
    305   bool is_external = IsExternalDomain(full_url, &domain);
    306   if (domain.empty()) {
    307     DVLOG(1) << "Could not extract domain from image src: " << full_url;
    308     return;
    309   }
    310 
    311   if (is_external) {
    312     ++page_feature_state_->img_other_domain;
    313   }
    314   ++page_feature_state_->total_imgs;
    315 }
    316 
    317 void PhishingDOMFeatureExtractor::HandleInput(
    318     const blink::WebElement& element) {
    319   // The HTML spec says that if the type is unspecified, it defaults to text.
    320   // In addition, any unrecognized type will be treated as a text input.
    321   //
    322   // Note that we use the attribute value rather than
    323   // WebFormControlElement::formControlType() for consistency with the
    324   // way the phishing classification model is created.
    325   std::string type = element.getAttribute("type").utf8();
    326   base::StringToLowerASCII(&type);
    327   if (type == "password") {
    328     ++page_feature_state_->num_pswd_inputs;
    329   } else if (type == "radio") {
    330     ++page_feature_state_->num_radio_inputs;
    331   } else if (type == "checkbox") {
    332     ++page_feature_state_->num_check_inputs;
    333   } else if (type != "submit" && type != "reset" && type != "file" &&
    334              type != "hidden" && type != "image" && type != "button") {
    335     // Note that there are a number of new input types in HTML5 that are not
    336     // handled above.  For now, we will consider these as text inputs since
    337     // they could be used to capture user input.
    338     ++page_feature_state_->num_text_inputs;
    339   }
    340 }
    341 
    342 void PhishingDOMFeatureExtractor::HandleScript(
    343     const blink::WebElement& element) {
    344   ++page_feature_state_->num_script_tags;
    345 }
    346 
    347 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
    348   DCHECK(done_callback_.is_null());
    349   DCHECK(!cur_frame_data_.get());
    350   DCHECK(cur_document_.isNull());
    351   if (!done_callback_.is_null() || cur_frame_data_.get() ||
    352       !cur_document_.isNull()) {
    353     LOG(ERROR) << "Extraction in progress, missing call to "
    354                << "CancelPendingExtraction";
    355   }
    356 }
    357 
    358 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
    359   // Record some timing stats that we can use to evaluate feature extraction
    360   // performance.  These include both successful and failed extractions.
    361   DCHECK(page_feature_state_.get());
    362   UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
    363                        page_feature_state_->num_iterations);
    364   UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
    365                       clock_->Now() - page_feature_state_->start_time);
    366 
    367   DCHECK(!done_callback_.is_null());
    368   done_callback_.Run(success);
    369   Clear();
    370 }
    371 
    372 void PhishingDOMFeatureExtractor::Clear() {
    373   features_ = NULL;
    374   done_callback_.Reset();
    375   cur_frame_data_.reset(NULL);
    376   cur_document_.reset();
    377 }
    378 
    379 void PhishingDOMFeatureExtractor::ResetFrameData() {
    380   DCHECK(!cur_document_.isNull());
    381   DCHECK(!cur_frame_data_.get());
    382 
    383   cur_frame_data_.reset(new FrameData());
    384   cur_frame_data_->elements = cur_document_.all();
    385   cur_frame_data_->domain =
    386       net::registry_controlled_domains::GetDomainAndRegistry(
    387           cur_document_.url(),
    388           net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
    389 }
    390 
    391 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
    392   DCHECK(!cur_document_.isNull());
    393   blink::WebFrame* frame = cur_document_.frame();
    394   // Advance to the next frame that contains a document, with no wrapping.
    395   if (frame) {
    396     for (frame = frame->traverseNext(false); frame;
    397          frame = frame->traverseNext(false)) {
    398       if (!frame->document().isNull()) {
    399         return frame->document();
    400       }
    401     }
    402   } else {
    403     // Keep track of how often frame traversal got "stuck" due to the
    404     // current subdocument getting removed from the frame tree.
    405     UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
    406   }
    407   return blink::WebDocument();
    408 }
    409 
    410 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
    411                                                    std::string* domain) const {
    412   DCHECK(domain);
    413   DCHECK(cur_frame_data_.get());
    414 
    415   if (cur_frame_data_->domain.empty()) {
    416     return false;
    417   }
    418 
    419   // TODO(bryner): Ensure that the url encoding is consistent with the features
    420   // in the model.
    421   if (url.HostIsIPAddress()) {
    422     domain->assign(url.host());
    423   } else {
    424     domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
    425         url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
    426   }
    427 
    428   return !domain->empty() && *domain != cur_frame_data_->domain;
    429 }
    430 
    431 void PhishingDOMFeatureExtractor::InsertFeatures() {
    432   DCHECK(page_feature_state_.get());
    433 
    434   if (page_feature_state_->total_links > 0) {
    435     // Add a feature for the fraction of times the page links to an external
    436     // domain vs. an internal domain.
    437     double link_freq = static_cast<double>(
    438         page_feature_state_->external_links) /
    439         page_feature_state_->total_links;
    440     features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
    441 
    442     // Add a feature for each unique domain that we're linking to
    443     for (base::hash_set<std::string>::iterator it =
    444              page_feature_state_->external_domains.begin();
    445          it != page_feature_state_->external_domains.end(); ++it) {
    446       features_->AddBooleanFeature(features::kPageLinkDomain + *it);
    447     }
    448 
    449     // Fraction of links that use https.
    450     double secure_freq = static_cast<double>(
    451         page_feature_state_->secure_links) / page_feature_state_->total_links;
    452     features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
    453   }
    454 
    455   // Record whether forms appear and whether various form elements appear.
    456   if (page_feature_state_->num_forms > 0) {
    457     features_->AddBooleanFeature(features::kPageHasForms);
    458   }
    459   if (page_feature_state_->num_text_inputs > 0) {
    460     features_->AddBooleanFeature(features::kPageHasTextInputs);
    461   }
    462   if (page_feature_state_->num_pswd_inputs > 0) {
    463     features_->AddBooleanFeature(features::kPageHasPswdInputs);
    464   }
    465   if (page_feature_state_->num_radio_inputs > 0) {
    466     features_->AddBooleanFeature(features::kPageHasRadioInputs);
    467   }
    468   if (page_feature_state_->num_check_inputs > 0) {
    469     features_->AddBooleanFeature(features::kPageHasCheckInputs);
    470   }
    471 
    472   // Record fraction of form actions that point to a different domain.
    473   if (page_feature_state_->total_actions > 0) {
    474     double action_freq = static_cast<double>(
    475         page_feature_state_->action_other_domain) /
    476         page_feature_state_->total_actions;
    477     features_->AddRealFeature(features::kPageActionOtherDomainFreq,
    478                               action_freq);
    479   }
    480 
    481   // Record how many image src attributes point to a different domain.
    482   if (page_feature_state_->total_imgs > 0) {
    483     double img_freq = static_cast<double>(
    484         page_feature_state_->img_other_domain) /
    485         page_feature_state_->total_imgs;
    486     features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
    487   }
    488 
    489   // Record number of script tags (discretized for numerical stability.)
    490   if (page_feature_state_->num_script_tags > 1) {
    491     features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
    492     if (page_feature_state_->num_script_tags > 6) {
    493       features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
    494     }
    495   }
    496 }
    497 
    498 }  // namespace safe_browsing
    499