1 /* 2 * Copyright (C) 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "WebPageSerializer.h" 33 34 #include "HTMLNames.h" 35 #include "WebFrame.h" 36 #include "WebFrameImpl.h" 37 #include "WebPageSerializerClient.h" 38 #include "WebPageSerializerImpl.h" 39 #include "WebView.h" 40 #include "WebViewImpl.h" 41 #include "core/dom/Document.h" 42 #include "core/dom/Element.h" 43 #include "core/html/HTMLAllCollection.h" 44 #include "core/html/HTMLFrameOwnerElement.h" 45 #include "core/html/HTMLInputElement.h" 46 #include "core/html/HTMLTableElement.h" 47 #include "core/loader/DocumentLoader.h" 48 #include "core/loader/archive/MHTMLArchive.h" 49 #include "core/page/Frame.h" 50 #include "core/page/PageSerializer.h" 51 #include "core/platform/SerializedResource.h" 52 #include "public/platform/WebCString.h" 53 #include "public/platform/WebString.h" 54 #include "public/platform/WebURL.h" 55 #include "public/platform/WebVector.h" 56 #include "weborigin/KURL.h" 57 #include "wtf/Vector.h" 58 #include "wtf/text/StringConcatenate.h" 59 60 using namespace WebCore; 61 62 namespace { 63 64 KURL getSubResourceURLFromElement(Element* element) 65 { 66 ASSERT(element); 67 const QualifiedName* attributeName = 0; 68 if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag)) 69 attributeName = &HTMLNames::srcAttr; 70 else if (element->hasTagName(HTMLNames::inputTag)) { 71 if (toHTMLInputElement(element)->isImageButton()) 72 attributeName = &HTMLNames::srcAttr; 73 } else if (element->hasTagName(HTMLNames::bodyTag) 74 || isHTMLTableElement(element) 75 || element->hasTagName(HTMLNames::trTag) 76 || element->hasTagName(HTMLNames::tdTag)) 77 attributeName = &HTMLNames::backgroundAttr; 78 else if (element->hasTagName(HTMLNames::blockquoteTag) 79 || element->hasTagName(HTMLNames::qTag) 80 || element->hasTagName(HTMLNames::delTag) 81 || element->hasTagName(HTMLNames::insTag)) 82 attributeName = &HTMLNames::citeAttr; 83 else if (element->hasTagName(HTMLNames::linkTag)) { 84 // If the link element is not css, ignore it. 85 if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) { 86 // FIXME: Add support for extracting links of sub-resources which 87 // are inside style-sheet such as @import, @font-face, url(), etc. 88 attributeName = &HTMLNames::hrefAttr; 89 } 90 } else if (element->hasTagName(HTMLNames::objectTag)) 91 attributeName = &HTMLNames::dataAttr; 92 else if (element->hasTagName(HTMLNames::embedTag)) 93 attributeName = &HTMLNames::srcAttr; 94 95 if (!attributeName) 96 return KURL(); 97 98 String value = element->getAttribute(*attributeName); 99 // Ignore javascript content. 100 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false)) 101 return KURL(); 102 103 return element->document()->completeURL(value); 104 } 105 106 void retrieveResourcesForElement(Element* element, 107 Vector<Frame*>* visitedFrames, 108 Vector<Frame*>* framesToVisit, 109 Vector<KURL>* frameURLs, 110 Vector<KURL>* resourceURLs) 111 { 112 // If the node is a frame, we'll process it later in retrieveResourcesForFrame. 113 if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag) 114 || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag)) 115 && element->isFrameOwnerElement()) { 116 Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame(); 117 if (frame) { 118 if (!visitedFrames->contains(frame)) 119 framesToVisit->append(frame); 120 return; 121 } 122 } 123 124 KURL url = getSubResourceURLFromElement(element); 125 if (url.isEmpty() || !url.isValid()) 126 return; // No subresource for this node. 127 128 // Ignore URLs that have a non-standard protocols. Since the FTP protocol 129 // does no have a cache mechanism, we skip it as well. 130 if (!url.protocolIsInHTTPFamily() && !url.isLocalFile()) 131 return; 132 133 if (!resourceURLs->contains(url)) 134 resourceURLs->append(url); 135 } 136 137 void retrieveResourcesForFrame(Frame* frame, 138 const WebKit::WebVector<WebKit::WebCString>& supportedSchemes, 139 Vector<Frame*>* visitedFrames, 140 Vector<Frame*>* framesToVisit, 141 Vector<KURL>* frameURLs, 142 Vector<KURL>* resourceURLs) 143 { 144 KURL frameURL = frame->loader()->documentLoader()->request().url(); 145 146 // If the frame's URL is invalid, ignore it, it is not retrievable. 147 if (!frameURL.isValid()) 148 return; 149 150 // Ignore frames from unsupported schemes. 151 bool isValidScheme = false; 152 for (size_t i = 0; i < supportedSchemes.size(); ++i) { 153 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) { 154 isValidScheme = true; 155 break; 156 } 157 } 158 if (!isValidScheme) 159 return; 160 161 // If we have already seen that frame, ignore it. 162 if (visitedFrames->contains(frame)) 163 return; 164 visitedFrames->append(frame); 165 if (!frameURLs->contains(frameURL)) 166 frameURLs->append(frameURL); 167 168 // Now get the resources associated with each node of the document. 169 RefPtr<HTMLCollection> allNodes = frame->document()->all(); 170 for (unsigned i = 0; i < allNodes->length(); ++i) { 171 Node* node = allNodes->item(i); 172 // We are only interested in HTML resources. 173 if (!node->isElementNode()) 174 continue; 175 retrieveResourcesForElement(toElement(node), 176 visitedFrames, framesToVisit, 177 frameURLs, resourceURLs); 178 } 179 } 180 181 } // namespace 182 183 namespace WebKit { 184 185 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam) 186 { 187 Vector<SerializedResource> resources; 188 PageSerializer serializer(&resources); 189 serializer.serialize(static_cast<WebViewImpl*>(view)->page()); 190 191 Vector<Resource> result; 192 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) { 193 Resource resource; 194 resource.url = iter->url; 195 resource.mimeType = iter->mimeType.ascii(); 196 // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData(). 197 resource.data = WebCString(iter->data->data(), iter->data->size()); 198 result.append(resource); 199 } 200 201 *resourcesParam = result; 202 } 203 204 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy) 205 { 206 Vector<SerializedResource> resources; 207 PageSerializer serializer(&resources); 208 serializer.serialize(page); 209 Document* document = page->mainFrame()->document(); 210 return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType()); 211 } 212 213 WebCString WebPageSerializer::serializeToMHTML(WebView* view) 214 { 215 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(static_cast<WebViewImpl*>(view)->page(), MHTMLArchive::UseDefaultEncoding); 216 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 217 return WebCString(mhtml->data(), mhtml->size()); 218 } 219 220 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view) 221 { 222 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(static_cast<WebViewImpl*>(view)->page(), MHTMLArchive::UseBinaryEncoding); 223 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 224 return WebCString(mhtml->data(), mhtml->size()); 225 } 226 227 bool WebPageSerializer::serialize(WebFrame* frame, 228 bool recursive, 229 WebPageSerializerClient* client, 230 const WebVector<WebURL>& links, 231 const WebVector<WebString>& localPaths, 232 const WebString& localDirectoryName) 233 { 234 WebPageSerializerImpl serializerImpl( 235 frame, recursive, client, links, localPaths, localDirectoryName); 236 return serializerImpl.serialize(); 237 } 238 239 bool WebPageSerializer::retrieveAllResources(WebView* view, 240 const WebVector<WebCString>& supportedSchemes, 241 WebVector<WebURL>* resourceURLs, 242 WebVector<WebURL>* frameURLs) { 243 WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame()); 244 if (!mainFrame) 245 return false; 246 247 Vector<Frame*> framesToVisit; 248 Vector<Frame*> visitedFrames; 249 Vector<KURL> frameKURLs; 250 Vector<KURL> resourceKURLs; 251 252 // Let's retrieve the resources from every frame in this page. 253 framesToVisit.append(mainFrame->frame()); 254 while (!framesToVisit.isEmpty()) { 255 Frame* frame = framesToVisit[0]; 256 framesToVisit.remove(0); 257 retrieveResourcesForFrame(frame, supportedSchemes, 258 &visitedFrames, &framesToVisit, 259 &frameKURLs, &resourceKURLs); 260 } 261 262 // Converts the results to WebURLs. 263 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 264 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 265 resultResourceURLs[i] = resourceKURLs[i]; 266 // A frame's src can point to the same URL as another resource, keep the 267 // resource URL only in such cases. 268 size_t index = frameKURLs.find(resourceKURLs[i]); 269 if (index != notFound) 270 frameKURLs.remove(index); 271 } 272 *resourceURLs = resultResourceURLs; 273 WebVector<WebURL> resultFrameURLs(frameKURLs.size()); 274 for (size_t i = 0; i < frameKURLs.size(); ++i) 275 resultFrameURLs[i] = frameKURLs[i]; 276 *frameURLs = resultFrameURLs; 277 278 return true; 279 } 280 281 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) 282 { 283 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">"; 284 return charsetString; 285 } 286 287 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 288 { 289 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 290 static_cast<int>(url.spec().length()), 291 url.spec().data()); 292 } 293 294 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) 295 { 296 if (baseTarget.isEmpty()) 297 return String("<base href=\".\">"); 298 String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">"; 299 return baseString; 300 } 301 302 } // namespace WebKit 303