1 /* 2 * Copyright (C) 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "WebPageSerializer.h" 33 34 #include "DocumentLoader.h" 35 #include "Element.h" 36 #include "Frame.h" 37 #include "HTMLAllCollection.h" 38 #include "HTMLFrameOwnerElement.h" 39 #include "HTMLInputElement.h" 40 #include "HTMLNames.h" 41 #include "KURL.h" 42 #include "Vector.h" 43 44 #include "WebCString.h" 45 #include "WebFrame.h" 46 #include "WebFrameImpl.h" 47 #include "WebPageSerializerClient.h" 48 #include "WebPageSerializerImpl.h" 49 #include "WebString.h" 50 #include "WebURL.h" 51 #include "WebVector.h" 52 #include "WebView.h" 53 54 #include <wtf/text/StringConcatenate.h> 55 56 using namespace WebCore; 57 58 namespace { 59 60 KURL getSubResourceURLFromElement(Element* element) 61 { 62 ASSERT(element); 63 const QualifiedName* attributeName = 0; 64 if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag)) 65 attributeName = &HTMLNames::srcAttr; 66 else if (element->hasTagName(HTMLNames::inputTag)) { 67 HTMLInputElement* input = static_cast<HTMLInputElement*>(element); 68 if (input->isImageButton()) 69 attributeName = &HTMLNames::srcAttr; 70 } else if (element->hasTagName(HTMLNames::bodyTag) 71 || element->hasTagName(HTMLNames::tableTag) 72 || element->hasTagName(HTMLNames::trTag) 73 || element->hasTagName(HTMLNames::tdTag)) 74 attributeName = &HTMLNames::backgroundAttr; 75 else if (element->hasTagName(HTMLNames::blockquoteTag) 76 || element->hasTagName(HTMLNames::qTag) 77 || element->hasTagName(HTMLNames::delTag) 78 || element->hasTagName(HTMLNames::insTag)) 79 attributeName = &HTMLNames::citeAttr; 80 else if (element->hasTagName(HTMLNames::linkTag)) { 81 // If the link element is not css, ignore it. 82 if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) { 83 // FIXME: Add support for extracting links of sub-resources which 84 // are inside style-sheet such as @import, @font-face, url(), etc. 85 attributeName = &HTMLNames::hrefAttr; 86 } 87 } else if (element->hasTagName(HTMLNames::objectTag)) 88 attributeName = &HTMLNames::dataAttr; 89 else if (element->hasTagName(HTMLNames::embedTag)) 90 attributeName = &HTMLNames::srcAttr; 91 92 if (!attributeName) 93 return KURL(); 94 95 String value = element->getAttribute(*attributeName); 96 // Ignore javascript content. 97 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false)) 98 return KURL(); 99 100 return element->document()->completeURL(value); 101 } 102 103 void retrieveResourcesForElement(Element* element, 104 Vector<Frame*>* visitedFrames, 105 Vector<Frame*>* framesToVisit, 106 Vector<KURL>* frameURLs, 107 Vector<KURL>* resourceURLs) 108 { 109 // If the node is a frame, we'll process it later in retrieveResourcesForFrame. 110 if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag) 111 || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag)) 112 && element->isFrameOwnerElement()) { 113 Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame(); 114 if (frame) { 115 if (!visitedFrames->contains(frame)) 116 framesToVisit->append(frame); 117 return; 118 } 119 } 120 121 KURL url = getSubResourceURLFromElement(element); 122 if (url.isEmpty() || !url.isValid()) 123 return; // No subresource for this node. 124 125 // Ignore URLs that have a non-standard protocols. Since the FTP protocol 126 // does no have a cache mechanism, we skip it as well. 127 if (!url.protocolInHTTPFamily() && !url.isLocalFile()) 128 return; 129 130 if (!resourceURLs->contains(url)) 131 resourceURLs->append(url); 132 } 133 134 void retrieveResourcesForFrame(Frame* frame, 135 const WebKit::WebVector<WebKit::WebCString>& supportedSchemes, 136 Vector<Frame*>* visitedFrames, 137 Vector<Frame*>* framesToVisit, 138 Vector<KURL>* frameURLs, 139 Vector<KURL>* resourceURLs) 140 { 141 KURL frameURL = frame->loader()->documentLoader()->request().url(); 142 143 // If the frame's URL is invalid, ignore it, it is not retrievable. 144 if (!frameURL.isValid()) 145 return; 146 147 // Ignore frames from unsupported schemes. 148 bool isValidScheme = false; 149 for (size_t i = 0; i < supportedSchemes.size(); ++i) { 150 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) { 151 isValidScheme = true; 152 break; 153 } 154 } 155 if (!isValidScheme) 156 return; 157 158 // If we have already seen that frame, ignore it. 159 if (visitedFrames->contains(frame)) 160 return; 161 visitedFrames->append(frame); 162 if (!frameURLs->contains(frameURL)) 163 frameURLs->append(frameURL); 164 165 // Now get the resources associated with each node of the document. 166 RefPtr<HTMLAllCollection> allNodes = frame->document()->all(); 167 for (unsigned i = 0; i < allNodes->length(); ++i) { 168 Node* node = allNodes->item(i); 169 // We are only interested in HTML resources. 170 if (!node->isElementNode()) 171 continue; 172 retrieveResourcesForElement(static_cast<Element*>(node), 173 visitedFrames, framesToVisit, 174 frameURLs, resourceURLs); 175 } 176 } 177 178 } // namespace 179 180 namespace WebKit { 181 182 bool WebPageSerializer::serialize(WebFrame* frame, 183 bool recursive, 184 WebPageSerializerClient* client, 185 const WebVector<WebURL>& links, 186 const WebVector<WebString>& localPaths, 187 const WebString& localDirectoryName) 188 { 189 WebPageSerializerImpl serializerImpl( 190 frame, recursive, client, links, localPaths, localDirectoryName); 191 return serializerImpl.serialize(); 192 } 193 194 bool WebPageSerializer::retrieveAllResources(WebView* view, 195 const WebVector<WebCString>& supportedSchemes, 196 WebVector<WebURL>* resourceURLs, 197 WebVector<WebURL>* frameURLs) { 198 WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame()); 199 if (!mainFrame) 200 return false; 201 202 Vector<Frame*> framesToVisit; 203 Vector<Frame*> visitedFrames; 204 Vector<KURL> frameKURLs; 205 Vector<KURL> resourceKURLs; 206 207 // Let's retrieve the resources from every frame in this page. 208 framesToVisit.append(mainFrame->frame()); 209 while (!framesToVisit.isEmpty()) { 210 Frame* frame = framesToVisit[0]; 211 framesToVisit.remove(0); 212 retrieveResourcesForFrame(frame, supportedSchemes, 213 &visitedFrames, &framesToVisit, 214 &frameKURLs, &resourceKURLs); 215 } 216 217 // Converts the results to WebURLs. 218 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 219 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 220 resultResourceURLs[i] = resourceKURLs[i]; 221 // A frame's src can point to the same URL as another resource, keep the 222 // resource URL only in such cases. 223 size_t index = frameKURLs.find(resourceKURLs[i]); 224 if (index != notFound) 225 frameKURLs.remove(index); 226 } 227 *resourceURLs = resultResourceURLs; 228 WebVector<WebURL> resultFrameURLs(frameKURLs.size()); 229 for (size_t i = 0; i < frameKURLs.size(); ++i) 230 resultFrameURLs[i] = frameKURLs[i]; 231 *frameURLs = resultFrameURLs; 232 233 return true; 234 } 235 236 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) 237 { 238 return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=", static_cast<const String&>(charset), "\">"); 239 } 240 241 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 242 { 243 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 244 static_cast<int>(url.spec().length()), 245 url.spec().data()); 246 } 247 248 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) 249 { 250 if (baseTarget.isEmpty()) 251 return makeString("<base href=\".\">"); 252 return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">"); 253 } 254 255 } // namespace WebKit 256