Home | History | Annotate | Download | only in renderer
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "content/renderer/savable_resources.h"
      6 
      7 #include <set>
      8 
      9 #include "base/compiler_specific.h"
     10 #include "base/logging.h"
     11 #include "base/strings/string_util.h"
     12 #include "third_party/WebKit/public/platform/WebString.h"
     13 #include "third_party/WebKit/public/platform/WebVector.h"
     14 #include "third_party/WebKit/public/web/WebDocument.h"
     15 #include "third_party/WebKit/public/web/WebElement.h"
     16 #include "third_party/WebKit/public/web/WebElementCollection.h"
     17 #include "third_party/WebKit/public/web/WebInputElement.h"
     18 #include "third_party/WebKit/public/web/WebLocalFrame.h"
     19 #include "third_party/WebKit/public/web/WebNode.h"
     20 #include "third_party/WebKit/public/web/WebNodeList.h"
     21 #include "third_party/WebKit/public/web/WebView.h"
     22 
     23 using blink::WebDocument;
     24 using blink::WebElement;
     25 using blink::WebElementCollection;
     26 using blink::WebFrame;
     27 using blink::WebInputElement;
     28 using blink::WebLocalFrame;
     29 using blink::WebNode;
     30 using blink::WebNodeList;
     31 using blink::WebString;
     32 using blink::WebVector;
     33 using blink::WebView;
     34 
     35 namespace content {
     36 namespace {
     37 
     38 // Structure for storage the unique set of all savable resource links for
     39 // making sure that no duplicated resource link in final result. The consumer
     40 // of the SavableResourcesUniqueCheck is responsible for keeping these pointers
     41 // valid for the lifetime of the SavableResourcesUniqueCheck instance.
     42 struct SavableResourcesUniqueCheck {
     43   // Unique set of all sub resource links.
     44   std::set<GURL>* resources_set;
     45   // Unique set of all frame links.
     46   std::set<GURL>* frames_set;
     47   // Collection of all frames we go through when getting all savable resource
     48   // links.
     49   std::vector<WebFrame*>* frames;
     50 
     51   SavableResourcesUniqueCheck()
     52       : resources_set(NULL),
     53         frames_set(NULL),
     54         frames(NULL) {}
     55 
     56   SavableResourcesUniqueCheck(std::set<GURL>* resources_set,
     57       std::set<GURL>* frames_set, std::vector<WebFrame*>* frames)
     58       : resources_set(resources_set),
     59         frames_set(frames_set),
     60         frames(frames) {}
     61 };
     62 
     63 // Get all savable resource links from current element. One element might
     64 // have more than one resource link. It is possible to have some links
     65 // in one CSS stylesheet.
     66 void GetSavableResourceLinkForElement(
     67     const WebElement& element,
     68     const WebDocument& current_doc,
     69     SavableResourcesUniqueCheck* unique_check,
     70     SavableResourcesResult* result) {
     71 
     72   // Handle frame and iframe tag.
     73   if (element.hasTagName("iframe") ||
     74       element.hasTagName("frame")) {
     75     WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element);
     76     if (sub_frame)
     77       unique_check->frames->push_back(sub_frame);
     78     return;
     79   }
     80 
     81   // Check whether the node has sub resource URL or not.
     82   WebString value = GetSubResourceLinkFromElement(element);
     83   if (value.isNull())
     84     return;
     85   // Get absolute URL.
     86   GURL u = current_doc.completeURL(value);
     87   // ignore invalid URL
     88   if (!u.is_valid())
     89     return;
     90   // Ignore those URLs which are not standard protocols. Because FTP
     91   // protocol does no have cache mechanism, we will skip all
     92   // sub-resources if they use FTP protocol.
     93   if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs("file"))
     94     return;
     95   // Ignore duplicated resource link.
     96   if (!unique_check->resources_set->insert(u).second)
     97     return;
     98   result->resources_list->push_back(u);
     99   // Insert referrer for above new resource link.
    100   result->referrer_urls_list->push_back(GURL());
    101   result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault);
    102 }
    103 
    104 // Get all savable resource links from current WebFrameImpl object pointer.
    105 void GetAllSavableResourceLinksForFrame(WebFrame* current_frame,
    106     SavableResourcesUniqueCheck* unique_check,
    107     SavableResourcesResult* result,
    108     const char** savable_schemes) {
    109   // Get current frame's URL.
    110   GURL current_frame_url = current_frame->document().url();
    111 
    112   // If url of current frame is invalid, ignore it.
    113   if (!current_frame_url.is_valid())
    114     return;
    115 
    116   // If url of current frame is not a savable protocol, ignore it.
    117   bool is_valid_protocol = false;
    118   for (int i = 0; savable_schemes[i] != NULL; ++i) {
    119     if (current_frame_url.SchemeIs(savable_schemes[i])) {
    120       is_valid_protocol = true;
    121       break;
    122     }
    123   }
    124   if (!is_valid_protocol)
    125     return;
    126 
    127   // If find same frame we have recorded, ignore it.
    128   if (!unique_check->frames_set->insert(current_frame_url).second)
    129     return;
    130 
    131   // Get current using document.
    132   WebDocument current_doc = current_frame->document();
    133   // Go through all descent nodes.
    134   WebElementCollection all = current_doc.all();
    135   // Go through all elements in this frame.
    136   for (WebElement element = all.firstItem(); !element.isNull();
    137        element = all.nextItem()) {
    138     GetSavableResourceLinkForElement(element,
    139                                      current_doc,
    140                                      unique_check,
    141                                      result);
    142   }
    143 }
    144 
    145 }  // namespace
    146 
    147 WebString GetSubResourceLinkFromElement(const WebElement& element) {
    148   const char* attribute_name = NULL;
    149   if (element.hasHTMLTagName("img") ||
    150       element.hasHTMLTagName("script")) {
    151     attribute_name = "src";
    152   } else if (element.hasHTMLTagName("input")) {
    153     const WebInputElement input = element.toConst<WebInputElement>();
    154     if (input.isImageButton()) {
    155       attribute_name = "src";
    156     }
    157   } else if (element.hasHTMLTagName("body") ||
    158              element.hasHTMLTagName("table") ||
    159              element.hasHTMLTagName("tr") ||
    160              element.hasHTMLTagName("td")) {
    161     attribute_name = "background";
    162   } else if (element.hasHTMLTagName("blockquote") ||
    163              element.hasHTMLTagName("q") ||
    164              element.hasHTMLTagName("del") ||
    165              element.hasHTMLTagName("ins")) {
    166     attribute_name = "cite";
    167   } else if (element.hasHTMLTagName("link")) {
    168     // If the link element is not linked to css, ignore it.
    169     if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) {
    170       // TODO(jnd): Add support for extracting links of sub-resources which
    171       // are inside style-sheet such as @import, url(), etc.
    172       // See bug: http://b/issue?id=1111667.
    173       attribute_name = "href";
    174     }
    175   }
    176   if (!attribute_name)
    177     return WebString();
    178   WebString value = element.getAttribute(WebString::fromUTF8(attribute_name));
    179   // If value has content and not start with "javascript:" then return it,
    180   // otherwise return NULL.
    181   if (!value.isNull() && !value.isEmpty() &&
    182       !StartsWithASCII(value.utf8(), "javascript:", false))
    183     return value;
    184 
    185   return WebString();
    186 }
    187 
    188 // Get all savable resource links from current webview, include main
    189 // frame and sub-frame
    190 bool GetAllSavableResourceLinksForCurrentPage(WebView* view,
    191     const GURL& page_url, SavableResourcesResult* result,
    192     const char** savable_schemes) {
    193   WebFrame* main_frame = view->mainFrame();
    194   if (!main_frame)
    195     return false;
    196 
    197   std::set<GURL> resources_set;
    198   std::set<GURL> frames_set;
    199   std::vector<WebFrame*> frames;
    200   SavableResourcesUniqueCheck unique_check(&resources_set,
    201                                            &frames_set,
    202                                            &frames);
    203 
    204   GURL main_page_gurl(main_frame->document().url());
    205 
    206   // Make sure we are saving same page between embedder and webkit.
    207   // If page has being navigated, embedder will get three empty vector,
    208   // which will make the saving page job ended.
    209   if (page_url != main_page_gurl)
    210     return true;
    211 
    212   // First, process main frame.
    213   frames.push_back(main_frame);
    214 
    215   // Check all resource in this page, include sub-frame.
    216   for (int i = 0; i < static_cast<int>(frames.size()); ++i) {
    217     // Get current frame's all savable resource links.
    218     GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result,
    219                                        savable_schemes);
    220   }
    221 
    222   // Since frame's src can also point to sub-resources link, so it is possible
    223   // that some URLs in frames_list are also in resources_list. For those
    224   // URLs, we will remove it from frame_list, only keep them in resources_list.
    225   for (std::set<GURL>::iterator it = frames_set.begin();
    226        it != frames_set.end(); ++it) {
    227     // Append unique frame source to savable frame list.
    228     if (resources_set.find(*it) == resources_set.end())
    229       result->frames_list->push_back(*it);
    230   }
    231 
    232   return true;
    233 }
    234 
    235 }  // namespace content
    236