1 /* 2 * Copyright (C) 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "WebPageSerializer.h" 33 34 #include "HTMLNames.h" 35 #include "WebFrame.h" 36 #include "WebFrameImpl.h" 37 #include "WebPageSerializerClient.h" 38 #include "WebPageSerializerImpl.h" 39 #include "WebView.h" 40 #include "WebViewImpl.h" 41 #include "core/dom/Document.h" 42 #include "core/dom/Element.h" 43 #include "core/html/HTMLAllCollection.h" 44 #include "core/html/HTMLFrameOwnerElement.h" 45 #include "core/html/HTMLInputElement.h" 46 #include "core/html/HTMLTableElement.h" 47 #include "core/loader/DocumentLoader.h" 48 #include "core/frame/Frame.h" 49 #include "core/page/PageSerializer.h" 50 #include "platform/SerializedResource.h" 51 #include "platform/mhtml/MHTMLArchive.h" 52 #include "platform/weborigin/KURL.h" 53 #include "public/platform/WebCString.h" 54 #include "public/platform/WebString.h" 55 #include "public/platform/WebURL.h" 56 #include "public/platform/WebVector.h" 57 #include "wtf/Vector.h" 58 #include "wtf/text/StringConcatenate.h" 59 60 using namespace WebCore; 61 62 namespace { 63 64 KURL getSubResourceURLFromElement(Element* element) 65 { 66 ASSERT(element); 67 const QualifiedName* attributeName = 0; 68 if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag)) 69 attributeName = &HTMLNames::srcAttr; 70 else if (element->hasTagName(HTMLNames::inputTag)) { 71 if (toHTMLInputElement(element)->isImageButton()) 72 attributeName = &HTMLNames::srcAttr; 73 } else if (element->hasTagName(HTMLNames::bodyTag) 74 || isHTMLTableElement(element) 75 || element->hasTagName(HTMLNames::trTag) 76 || element->hasTagName(HTMLNames::tdTag)) 77 attributeName = &HTMLNames::backgroundAttr; 78 else if (element->hasTagName(HTMLNames::blockquoteTag) 79 || element->hasTagName(HTMLNames::qTag) 80 || element->hasTagName(HTMLNames::delTag) 81 || element->hasTagName(HTMLNames::insTag)) 82 attributeName = &HTMLNames::citeAttr; 83 else if (element->hasTagName(HTMLNames::linkTag)) { 84 // If the link element is not css, ignore it. 85 if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) { 86 // FIXME: Add support for extracting links of sub-resources which 87 // are inside style-sheet such as @import, @font-face, url(), etc. 88 attributeName = &HTMLNames::hrefAttr; 89 } 90 } else if (element->hasTagName(HTMLNames::objectTag)) 91 attributeName = &HTMLNames::dataAttr; 92 else if (element->hasTagName(HTMLNames::embedTag)) 93 attributeName = &HTMLNames::srcAttr; 94 95 if (!attributeName) 96 return KURL(); 97 98 String value = element->getAttribute(*attributeName); 99 // Ignore javascript content. 100 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false)) 101 return KURL(); 102 103 return element->document().completeURL(value); 104 } 105 106 void retrieveResourcesForElement(Element* element, 107 Vector<Frame*>* visitedFrames, 108 Vector<Frame*>* framesToVisit, 109 Vector<KURL>* frameURLs, 110 Vector<KURL>* resourceURLs) 111 { 112 // If the node is a frame, we'll process it later in retrieveResourcesForFrame. 113 if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag) 114 || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag)) 115 && element->isFrameOwnerElement()) { 116 if (Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame()) { 117 if (!visitedFrames->contains(frame)) 118 framesToVisit->append(frame); 119 return; 120 } 121 } 122 123 KURL url = getSubResourceURLFromElement(element); 124 if (url.isEmpty() || !url.isValid()) 125 return; // No subresource for this node. 126 127 // Ignore URLs that have a non-standard protocols. Since the FTP protocol 128 // does no have a cache mechanism, we skip it as well. 129 if (!url.protocolIsInHTTPFamily() && !url.isLocalFile()) 130 return; 131 132 if (!resourceURLs->contains(url)) 133 resourceURLs->append(url); 134 } 135 136 void retrieveResourcesForFrame(Frame* frame, 137 const blink::WebVector<blink::WebCString>& supportedSchemes, 138 Vector<Frame*>* visitedFrames, 139 Vector<Frame*>* framesToVisit, 140 Vector<KURL>* frameURLs, 141 Vector<KURL>* resourceURLs) 142 { 143 KURL frameURL = frame->loader().documentLoader()->request().url(); 144 145 // If the frame's URL is invalid, ignore it, it is not retrievable. 146 if (!frameURL.isValid()) 147 return; 148 149 // Ignore frames from unsupported schemes. 150 bool isValidScheme = false; 151 for (size_t i = 0; i < supportedSchemes.size(); ++i) { 152 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) { 153 isValidScheme = true; 154 break; 155 } 156 } 157 if (!isValidScheme) 158 return; 159 160 // If we have already seen that frame, ignore it. 161 if (visitedFrames->contains(frame)) 162 return; 163 visitedFrames->append(frame); 164 if (!frameURLs->contains(frameURL)) 165 frameURLs->append(frameURL); 166 167 // Now get the resources associated with each node of the document. 168 RefPtr<HTMLCollection> allNodes = frame->document()->all(); 169 for (unsigned i = 0; i < allNodes->length(); ++i) { 170 Node* node = allNodes->item(i); 171 // We are only interested in HTML resources. 172 if (!node->isElementNode()) 173 continue; 174 retrieveResourcesForElement(toElement(node), 175 visitedFrames, framesToVisit, 176 frameURLs, resourceURLs); 177 } 178 } 179 180 } // namespace 181 182 namespace blink { 183 184 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam) 185 { 186 Vector<SerializedResource> resources; 187 PageSerializer serializer(&resources); 188 serializer.serialize(toWebViewImpl(view)->page()); 189 190 Vector<Resource> result; 191 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) { 192 Resource resource; 193 resource.url = iter->url; 194 resource.mimeType = iter->mimeType.ascii(); 195 // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData(). 196 resource.data = WebCString(iter->data->data(), iter->data->size()); 197 result.append(resource); 198 } 199 200 *resourcesParam = result; 201 } 202 203 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy) 204 { 205 Vector<SerializedResource> resources; 206 PageSerializer serializer(&resources); 207 serializer.serialize(page); 208 Document* document = page->mainFrame()->document(); 209 return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType()); 210 } 211 212 WebCString WebPageSerializer::serializeToMHTML(WebView* view) 213 { 214 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding); 215 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 216 return WebCString(mhtml->data(), mhtml->size()); 217 } 218 219 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view) 220 { 221 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding); 222 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 223 return WebCString(mhtml->data(), mhtml->size()); 224 } 225 226 bool WebPageSerializer::serialize(WebFrame* frame, 227 bool recursive, 228 WebPageSerializerClient* client, 229 const WebVector<WebURL>& links, 230 const WebVector<WebString>& localPaths, 231 const WebString& localDirectoryName) 232 { 233 WebPageSerializerImpl serializerImpl( 234 frame, recursive, client, links, localPaths, localDirectoryName); 235 return serializerImpl.serialize(); 236 } 237 238 bool WebPageSerializer::retrieveAllResources(WebView* view, 239 const WebVector<WebCString>& supportedSchemes, 240 WebVector<WebURL>* resourceURLs, 241 WebVector<WebURL>* frameURLs) { 242 WebFrameImpl* mainFrame = toWebFrameImpl(view->mainFrame()); 243 if (!mainFrame) 244 return false; 245 246 Vector<Frame*> framesToVisit; 247 Vector<Frame*> visitedFrames; 248 Vector<KURL> frameKURLs; 249 Vector<KURL> resourceKURLs; 250 251 // Let's retrieve the resources from every frame in this page. 252 framesToVisit.append(mainFrame->frame()); 253 while (!framesToVisit.isEmpty()) { 254 Frame* frame = framesToVisit[0]; 255 framesToVisit.remove(0); 256 retrieveResourcesForFrame(frame, supportedSchemes, 257 &visitedFrames, &framesToVisit, 258 &frameKURLs, &resourceKURLs); 259 } 260 261 // Converts the results to WebURLs. 262 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 263 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 264 resultResourceURLs[i] = resourceKURLs[i]; 265 // A frame's src can point to the same URL as another resource, keep the 266 // resource URL only in such cases. 267 size_t index = frameKURLs.find(resourceKURLs[i]); 268 if (index != kNotFound) 269 frameKURLs.remove(index); 270 } 271 *resourceURLs = resultResourceURLs; 272 WebVector<WebURL> resultFrameURLs(frameKURLs.size()); 273 for (size_t i = 0; i < frameKURLs.size(); ++i) 274 resultFrameURLs[i] = frameKURLs[i]; 275 *frameURLs = resultFrameURLs; 276 277 return true; 278 } 279 280 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) 281 { 282 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">"; 283 return charsetString; 284 } 285 286 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 287 { 288 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 289 static_cast<int>(url.spec().length()), 290 url.spec().data()); 291 } 292 293 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) 294 { 295 if (baseTarget.isEmpty()) 296 return String("<base href=\".\">"); 297 String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">"; 298 return baseString; 299 } 300 301 } // namespace blink 302