1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "content/renderer/savable_resources.h" 6 7 #include <set> 8 9 #include "base/compiler_specific.h" 10 #include "base/logging.h" 11 #include "base/strings/string_util.h" 12 #include "third_party/WebKit/public/platform/WebString.h" 13 #include "third_party/WebKit/public/platform/WebVector.h" 14 #include "third_party/WebKit/public/web/WebDocument.h" 15 #include "third_party/WebKit/public/web/WebElement.h" 16 #include "third_party/WebKit/public/web/WebElementCollection.h" 17 #include "third_party/WebKit/public/web/WebInputElement.h" 18 #include "third_party/WebKit/public/web/WebLocalFrame.h" 19 #include "third_party/WebKit/public/web/WebNode.h" 20 #include "third_party/WebKit/public/web/WebNodeList.h" 21 #include "third_party/WebKit/public/web/WebView.h" 22 23 using blink::WebDocument; 24 using blink::WebElement; 25 using blink::WebElementCollection; 26 using blink::WebFrame; 27 using blink::WebInputElement; 28 using blink::WebLocalFrame; 29 using blink::WebNode; 30 using blink::WebNodeList; 31 using blink::WebString; 32 using blink::WebVector; 33 using blink::WebView; 34 35 namespace content { 36 namespace { 37 38 // Structure for storage the unique set of all savable resource links for 39 // making sure that no duplicated resource link in final result. The consumer 40 // of the SavableResourcesUniqueCheck is responsible for keeping these pointers 41 // valid for the lifetime of the SavableResourcesUniqueCheck instance. 42 struct SavableResourcesUniqueCheck { 43 // Unique set of all sub resource links. 44 std::set<GURL>* resources_set; 45 // Unique set of all frame links. 46 std::set<GURL>* frames_set; 47 // Collection of all frames we go through when getting all savable resource 48 // links. 49 std::vector<WebFrame*>* frames; 50 51 SavableResourcesUniqueCheck() 52 : resources_set(NULL), 53 frames_set(NULL), 54 frames(NULL) {} 55 56 SavableResourcesUniqueCheck(std::set<GURL>* resources_set, 57 std::set<GURL>* frames_set, std::vector<WebFrame*>* frames) 58 : resources_set(resources_set), 59 frames_set(frames_set), 60 frames(frames) {} 61 }; 62 63 // Get all savable resource links from current element. One element might 64 // have more than one resource link. It is possible to have some links 65 // in one CSS stylesheet. 66 void GetSavableResourceLinkForElement( 67 const WebElement& element, 68 const WebDocument& current_doc, 69 SavableResourcesUniqueCheck* unique_check, 70 SavableResourcesResult* result) { 71 72 // Handle frame and iframe tag. 73 if (element.hasHTMLTagName("iframe") || 74 element.hasHTMLTagName("frame")) { 75 WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element); 76 if (sub_frame) 77 unique_check->frames->push_back(sub_frame); 78 return; 79 } 80 81 // Check whether the node has sub resource URL or not. 82 WebString value = GetSubResourceLinkFromElement(element); 83 if (value.isNull()) 84 return; 85 // Get absolute URL. 86 GURL u = current_doc.completeURL(value); 87 // ignore invalid URL 88 if (!u.is_valid()) 89 return; 90 // Ignore those URLs which are not standard protocols. Because FTP 91 // protocol does no have cache mechanism, we will skip all 92 // sub-resources if they use FTP protocol. 93 if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs(url::kFileScheme)) 94 return; 95 // Ignore duplicated resource link. 96 if (!unique_check->resources_set->insert(u).second) 97 return; 98 result->resources_list->push_back(u); 99 // Insert referrer for above new resource link. 100 result->referrer_urls_list->push_back(GURL()); 101 result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault); 102 } 103 104 // Get all savable resource links from current WebFrameImpl object pointer. 105 void GetAllSavableResourceLinksForFrame(WebFrame* current_frame, 106 SavableResourcesUniqueCheck* unique_check, 107 SavableResourcesResult* result, 108 const char** savable_schemes) { 109 // Get current frame's URL. 110 GURL current_frame_url = current_frame->document().url(); 111 112 // If url of current frame is invalid, ignore it. 113 if (!current_frame_url.is_valid()) 114 return; 115 116 // If url of current frame is not a savable protocol, ignore it. 117 bool is_valid_protocol = false; 118 for (int i = 0; savable_schemes[i] != NULL; ++i) { 119 if (current_frame_url.SchemeIs(savable_schemes[i])) { 120 is_valid_protocol = true; 121 break; 122 } 123 } 124 if (!is_valid_protocol) 125 return; 126 127 // If find same frame we have recorded, ignore it. 128 if (!unique_check->frames_set->insert(current_frame_url).second) 129 return; 130 131 // Get current using document. 132 WebDocument current_doc = current_frame->document(); 133 // Go through all descent nodes. 134 WebElementCollection all = current_doc.all(); 135 // Go through all elements in this frame. 136 for (WebElement element = all.firstItem(); !element.isNull(); 137 element = all.nextItem()) { 138 GetSavableResourceLinkForElement(element, 139 current_doc, 140 unique_check, 141 result); 142 } 143 } 144 145 } // namespace 146 147 WebString GetSubResourceLinkFromElement(const WebElement& element) { 148 const char* attribute_name = NULL; 149 if (element.hasHTMLTagName("img") || 150 element.hasHTMLTagName("script")) { 151 attribute_name = "src"; 152 } else if (element.hasHTMLTagName("input")) { 153 const WebInputElement input = element.toConst<WebInputElement>(); 154 if (input.isImageButton()) { 155 attribute_name = "src"; 156 } 157 } else if (element.hasHTMLTagName("body") || 158 element.hasHTMLTagName("table") || 159 element.hasHTMLTagName("tr") || 160 element.hasHTMLTagName("td")) { 161 attribute_name = "background"; 162 } else if (element.hasHTMLTagName("blockquote") || 163 element.hasHTMLTagName("q") || 164 element.hasHTMLTagName("del") || 165 element.hasHTMLTagName("ins")) { 166 attribute_name = "cite"; 167 } else if (element.hasHTMLTagName("link")) { 168 // If the link element is not linked to css, ignore it. 169 if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) { 170 // TODO(jnd): Add support for extracting links of sub-resources which 171 // are inside style-sheet such as @import, url(), etc. 172 // See bug: http://b/issue?id=1111667. 173 attribute_name = "href"; 174 } 175 } 176 if (!attribute_name) 177 return WebString(); 178 WebString value = element.getAttribute(WebString::fromUTF8(attribute_name)); 179 // If value has content and not start with "javascript:" then return it, 180 // otherwise return NULL. 181 if (!value.isNull() && !value.isEmpty() && 182 !StartsWithASCII(value.utf8(), "javascript:", false)) 183 return value; 184 185 return WebString(); 186 } 187 188 // Get all savable resource links from current webview, include main 189 // frame and sub-frame 190 bool GetAllSavableResourceLinksForCurrentPage(WebView* view, 191 const GURL& page_url, SavableResourcesResult* result, 192 const char** savable_schemes) { 193 WebFrame* main_frame = view->mainFrame(); 194 if (!main_frame) 195 return false; 196 197 std::set<GURL> resources_set; 198 std::set<GURL> frames_set; 199 std::vector<WebFrame*> frames; 200 SavableResourcesUniqueCheck unique_check(&resources_set, 201 &frames_set, 202 &frames); 203 204 GURL main_page_gurl(main_frame->document().url()); 205 206 // Make sure we are saving same page between embedder and webkit. 207 // If page has being navigated, embedder will get three empty vector, 208 // which will make the saving page job ended. 209 if (page_url != main_page_gurl) 210 return true; 211 212 // First, process main frame. 213 frames.push_back(main_frame); 214 215 // Check all resource in this page, include sub-frame. 216 for (int i = 0; i < static_cast<int>(frames.size()); ++i) { 217 // Get current frame's all savable resource links. 218 GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result, 219 savable_schemes); 220 } 221 222 // Since frame's src can also point to sub-resources link, so it is possible 223 // that some URLs in frames_list are also in resources_list. For those 224 // URLs, we will remove it from frame_list, only keep them in resources_list. 225 for (std::set<GURL>::iterator it = frames_set.begin(); 226 it != frames_set.end(); ++it) { 227 // Append unique frame source to savable frame list. 228 if (resources_set.find(*it) == resources_set.end()) 229 result->frames_list->push_back(*it); 230 } 231 232 return true; 233 } 234 235 } // namespace content 236