Home | History | Annotate | Download | only in renderer
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "content/renderer/savable_resources.h"
      6 
      7 #include <set>
      8 
      9 #include "base/compiler_specific.h"
     10 #include "base/logging.h"
     11 #include "base/strings/string_util.h"
     12 #include "third_party/WebKit/public/platform/WebString.h"
     13 #include "third_party/WebKit/public/platform/WebVector.h"
     14 #include "third_party/WebKit/public/web/WebDocument.h"
     15 #include "third_party/WebKit/public/web/WebElement.h"
     16 #include "third_party/WebKit/public/web/WebFrame.h"
     17 #include "third_party/WebKit/public/web/WebInputElement.h"
     18 #include "third_party/WebKit/public/web/WebNode.h"
     19 #include "third_party/WebKit/public/web/WebNodeCollection.h"
     20 #include "third_party/WebKit/public/web/WebNodeList.h"
     21 #include "third_party/WebKit/public/web/WebView.h"
     22 
     23 using WebKit::WebDocument;
     24 using WebKit::WebElement;
     25 using WebKit::WebFrame;
     26 using WebKit::WebInputElement;
     27 using WebKit::WebNode;
     28 using WebKit::WebNodeCollection;
     29 using WebKit::WebNodeList;
     30 using WebKit::WebString;
     31 using WebKit::WebVector;
     32 using WebKit::WebView;
     33 
     34 namespace content {
     35 namespace {
     36 
     37 // Structure for storage the unique set of all savable resource links for
     38 // making sure that no duplicated resource link in final result. The consumer
     39 // of the SavableResourcesUniqueCheck is responsible for keeping these pointers
     40 // valid for the lifetime of the SavableResourcesUniqueCheck instance.
     41 struct SavableResourcesUniqueCheck {
     42   // Unique set of all sub resource links.
     43   std::set<GURL>* resources_set;
     44   // Unique set of all frame links.
     45   std::set<GURL>* frames_set;
     46   // Collection of all frames we go through when getting all savable resource
     47   // links.
     48   std::vector<WebFrame*>* frames;
     49 
     50   SavableResourcesUniqueCheck()
     51       : resources_set(NULL),
     52         frames_set(NULL),
     53         frames(NULL) {}
     54 
     55   SavableResourcesUniqueCheck(std::set<GURL>* resources_set,
     56       std::set<GURL>* frames_set, std::vector<WebFrame*>* frames)
     57       : resources_set(resources_set),
     58         frames_set(frames_set),
     59         frames(frames) {}
     60 };
     61 
     62 // Get all savable resource links from current element. One element might
     63 // have more than one resource link. It is possible to have some links
     64 // in one CSS stylesheet.
     65 void GetSavableResourceLinkForElement(
     66     const WebElement& element,
     67     const WebDocument& current_doc,
     68     SavableResourcesUniqueCheck* unique_check,
     69     SavableResourcesResult* result) {
     70 
     71   // Handle frame and iframe tag.
     72   if (element.hasTagName("iframe") ||
     73       element.hasTagName("frame")) {
     74     WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
     75     if (sub_frame)
     76       unique_check->frames->push_back(sub_frame);
     77     return;
     78   }
     79 
     80   // Check whether the node has sub resource URL or not.
     81   WebString value = GetSubResourceLinkFromElement(element);
     82   if (value.isNull())
     83     return;
     84   // Get absolute URL.
     85   GURL u = current_doc.completeURL(value);
     86   // ignore invalid URL
     87   if (!u.is_valid())
     88     return;
     89   // Ignore those URLs which are not standard protocols. Because FTP
     90   // protocol does no have cache mechanism, we will skip all
     91   // sub-resources if they use FTP protocol.
     92   if (!u.SchemeIs("http") && !u.SchemeIs("https") && !u.SchemeIs("file"))
     93     return;
     94   // Ignore duplicated resource link.
     95   if (!unique_check->resources_set->insert(u).second)
     96     return;
     97   result->resources_list->push_back(u);
     98   // Insert referrer for above new resource link.
     99   result->referrer_urls_list->push_back(GURL());
    100   result->referrer_policies_list->push_back(WebKit::WebReferrerPolicyDefault);
    101 }
    102 
    103 // Get all savable resource links from current WebFrameImpl object pointer.
    104 void GetAllSavableResourceLinksForFrame(WebFrame* current_frame,
    105     SavableResourcesUniqueCheck* unique_check,
    106     SavableResourcesResult* result,
    107     const char** savable_schemes) {
    108   // Get current frame's URL.
    109   GURL current_frame_url = current_frame->document().url();
    110 
    111   // If url of current frame is invalid, ignore it.
    112   if (!current_frame_url.is_valid())
    113     return;
    114 
    115   // If url of current frame is not a savable protocol, ignore it.
    116   bool is_valid_protocol = false;
    117   for (int i = 0; savable_schemes[i] != NULL; ++i) {
    118     if (current_frame_url.SchemeIs(savable_schemes[i])) {
    119       is_valid_protocol = true;
    120       break;
    121     }
    122   }
    123   if (!is_valid_protocol)
    124     return;
    125 
    126   // If find same frame we have recorded, ignore it.
    127   if (!unique_check->frames_set->insert(current_frame_url).second)
    128     return;
    129 
    130   // Get current using document.
    131   WebDocument current_doc = current_frame->document();
    132   // Go through all descent nodes.
    133   WebNodeCollection all = current_doc.all();
    134   // Go through all node in this frame.
    135   for (WebNode node = all.firstItem(); !node.isNull();
    136        node = all.nextItem()) {
    137     // We only save HTML resources.
    138     if (!node.isElementNode())
    139       continue;
    140     WebElement element = node.to<WebElement>();
    141     GetSavableResourceLinkForElement(element,
    142                                      current_doc,
    143                                      unique_check,
    144                                      result);
    145   }
    146 }
    147 
    148 }  // namespace
    149 
    150 WebString GetSubResourceLinkFromElement(const WebElement& element) {
    151   const char* attribute_name = NULL;
    152   if (element.hasHTMLTagName("img") ||
    153       element.hasHTMLTagName("script")) {
    154     attribute_name = "src";
    155   } else if (element.hasHTMLTagName("input")) {
    156     const WebInputElement input = element.toConst<WebInputElement>();
    157     if (input.isImageButton()) {
    158       attribute_name = "src";
    159     }
    160   } else if (element.hasHTMLTagName("body") ||
    161              element.hasHTMLTagName("table") ||
    162              element.hasHTMLTagName("tr") ||
    163              element.hasHTMLTagName("td")) {
    164     attribute_name = "background";
    165   } else if (element.hasHTMLTagName("blockquote") ||
    166              element.hasHTMLTagName("q") ||
    167              element.hasHTMLTagName("del") ||
    168              element.hasHTMLTagName("ins")) {
    169     attribute_name = "cite";
    170   } else if (element.hasHTMLTagName("link")) {
    171     // If the link element is not linked to css, ignore it.
    172     if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) {
    173       // TODO(jnd): Add support for extracting links of sub-resources which
    174       // are inside style-sheet such as @import, url(), etc.
    175       // See bug: http://b/issue?id=1111667.
    176       attribute_name = "href";
    177     }
    178   }
    179   if (!attribute_name)
    180     return WebString();
    181   WebString value = element.getAttribute(WebString::fromUTF8(attribute_name));
    182   // If value has content and not start with "javascript:" then return it,
    183   // otherwise return NULL.
    184   if (!value.isNull() && !value.isEmpty() &&
    185       !StartsWithASCII(value.utf8(), "javascript:", false))
    186     return value;
    187 
    188   return WebString();
    189 }
    190 
    191 // Get all savable resource links from current webview, include main
    192 // frame and sub-frame
    193 bool GetAllSavableResourceLinksForCurrentPage(WebView* view,
    194     const GURL& page_url, SavableResourcesResult* result,
    195     const char** savable_schemes) {
    196   WebFrame* main_frame = view->mainFrame();
    197   if (!main_frame)
    198     return false;
    199 
    200   std::set<GURL> resources_set;
    201   std::set<GURL> frames_set;
    202   std::vector<WebFrame*> frames;
    203   SavableResourcesUniqueCheck unique_check(&resources_set,
    204                                            &frames_set,
    205                                            &frames);
    206 
    207   GURL main_page_gurl(main_frame->document().url());
    208 
    209   // Make sure we are saving same page between embedder and webkit.
    210   // If page has being navigated, embedder will get three empty vector,
    211   // which will make the saving page job ended.
    212   if (page_url != main_page_gurl)
    213     return true;
    214 
    215   // First, process main frame.
    216   frames.push_back(main_frame);
    217 
    218   // Check all resource in this page, include sub-frame.
    219   for (int i = 0; i < static_cast<int>(frames.size()); ++i) {
    220     // Get current frame's all savable resource links.
    221     GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result,
    222                                        savable_schemes);
    223   }
    224 
    225   // Since frame's src can also point to sub-resources link, so it is possible
    226   // that some URLs in frames_list are also in resources_list. For those
    227   // URLs, we will remove it from frame_list, only keep them in resources_list.
    228   for (std::set<GURL>::iterator it = frames_set.begin();
    229        it != frames_set.end(); ++it) {
    230     // Append unique frame source to savable frame list.
    231     if (resources_set.find(*it) == resources_set.end())
    232       result->frames_list->push_back(*it);
    233   }
    234 
    235   return true;
    236 }
    237 
    238 }  // namespace content
    239