1 /* 2 * Copyright (C) 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "public/web/WebPageSerializer.h" 33 34 #include "core/HTMLNames.h" 35 #include "core/dom/Document.h" 36 #include "core/dom/Element.h" 37 #include "core/frame/LocalFrame.h" 38 #include "core/html/HTMLAllCollection.h" 39 #include "core/html/HTMLFrameElementBase.h" 40 #include "core/html/HTMLFrameOwnerElement.h" 41 #include "core/html/HTMLInputElement.h" 42 #include "core/html/HTMLTableElement.h" 43 #include "core/loader/DocumentLoader.h" 44 #include "core/page/Page.h" 45 #include "core/page/PageSerializer.h" 46 #include "platform/SerializedResource.h" 47 #include "platform/mhtml/MHTMLArchive.h" 48 #include "platform/weborigin/KURL.h" 49 #include "public/platform/WebCString.h" 50 #include "public/platform/WebString.h" 51 #include "public/platform/WebURL.h" 52 #include "public/platform/WebVector.h" 53 #include "public/web/WebFrame.h" 54 #include "public/web/WebPageSerializerClient.h" 55 #include "public/web/WebView.h" 56 #include "web/WebLocalFrameImpl.h" 57 #include "web/WebPageSerializerImpl.h" 58 #include "web/WebViewImpl.h" 59 #include "wtf/Vector.h" 60 #include "wtf/text/StringConcatenate.h" 61 62 namespace blink { 63 64 namespace { 65 66 KURL getSubResourceURLFromElement(Element* element) 67 { 68 ASSERT(element); 69 const QualifiedName& attributeName = element->subResourceAttributeName(); 70 if (attributeName == QualifiedName::null()) 71 return KURL(); 72 73 String value = element->getAttribute(attributeName); 74 // Ignore javascript content. 75 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false)) 76 return KURL(); 77 78 return element->document().completeURL(value); 79 } 80 81 void retrieveResourcesForElement(Element* element, 82 Vector<LocalFrame*>* visitedFrames, 83 Vector<LocalFrame*>* framesToVisit, 84 Vector<KURL>* frameURLs, 85 Vector<KURL>* resourceURLs) 86 { 87 ASSERT(element); 88 // If the node is a frame, we'll process it later in retrieveResourcesForFrame. 89 if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isHTMLEmbedElement(*element)) { 90 Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame(); 91 if (frame && frame->isLocalFrame()) { 92 if (!visitedFrames->contains(toLocalFrame(frame))) 93 framesToVisit->append(toLocalFrame(frame)); 94 return; 95 } 96 } 97 98 KURL url = getSubResourceURLFromElement(element); 99 if (url.isEmpty() || !url.isValid()) 100 return; // No subresource for this node. 101 102 // Ignore URLs that have a non-standard protocols. Since the FTP protocol 103 // does no have a cache mechanism, we skip it as well. 104 if (!url.protocolIsInHTTPFamily() && !url.isLocalFile()) 105 return; 106 107 if (!resourceURLs->contains(url)) 108 resourceURLs->append(url); 109 } 110 111 void retrieveResourcesForFrame(LocalFrame* frame, 112 const WebVector<WebCString>& supportedSchemes, 113 Vector<LocalFrame*>* visitedFrames, 114 Vector<LocalFrame*>* framesToVisit, 115 Vector<KURL>* frameURLs, 116 Vector<KURL>* resourceURLs) 117 { 118 KURL frameURL = frame->loader().documentLoader()->request().url(); 119 120 // If the frame's URL is invalid, ignore it, it is not retrievable. 121 if (!frameURL.isValid()) 122 return; 123 124 // Ignore frames from unsupported schemes. 125 bool isValidScheme = false; 126 for (size_t i = 0; i < supportedSchemes.size(); ++i) { 127 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) { 128 isValidScheme = true; 129 break; 130 } 131 } 132 if (!isValidScheme) 133 return; 134 135 // If we have already seen that frame, ignore it. 136 if (visitedFrames->contains(frame)) 137 return; 138 visitedFrames->append(frame); 139 if (!frameURLs->contains(frameURL)) 140 frameURLs->append(frameURL); 141 142 // Now get the resources associated with each node of the document. 143 RefPtrWillBeRawPtr<HTMLAllCollection> allElements = frame->document()->all(); 144 for (unsigned i = 0; i < allElements->length(); ++i) { 145 Element* element = allElements->item(i); 146 retrieveResourcesForElement(element, 147 visitedFrames, framesToVisit, 148 frameURLs, resourceURLs); 149 } 150 } 151 152 } // namespace 153 154 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam) 155 { 156 Vector<SerializedResource> resources; 157 PageSerializer serializer(&resources); 158 serializer.serialize(toWebViewImpl(view)->page()); 159 160 Vector<Resource> result; 161 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) { 162 Resource resource; 163 resource.url = iter->url; 164 resource.mimeType = iter->mimeType.ascii(); 165 // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData(). 166 resource.data = WebCString(iter->data->data(), iter->data->size()); 167 result.append(resource); 168 } 169 170 *resourcesParam = result; 171 } 172 173 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy) 174 { 175 Vector<SerializedResource> resources; 176 PageSerializer serializer(&resources); 177 serializer.serialize(page); 178 Document* document = page->deprecatedLocalMainFrame()->document(); 179 return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType()); 180 } 181 182 WebCString WebPageSerializer::serializeToMHTML(WebView* view) 183 { 184 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding); 185 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 186 return WebCString(mhtml->data(), mhtml->size()); 187 } 188 189 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view) 190 { 191 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding); 192 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 193 return WebCString(mhtml->data(), mhtml->size()); 194 } 195 196 bool WebPageSerializer::serialize(WebLocalFrame* frame, 197 bool recursive, 198 WebPageSerializerClient* client, 199 const WebVector<WebURL>& links, 200 const WebVector<WebString>& localPaths, 201 const WebString& localDirectoryName) 202 { 203 WebPageSerializerImpl serializerImpl( 204 frame, recursive, client, links, localPaths, localDirectoryName); 205 return serializerImpl.serialize(); 206 } 207 208 bool WebPageSerializer::retrieveAllResources(WebView* view, 209 const WebVector<WebCString>& supportedSchemes, 210 WebVector<WebURL>* resourceURLs, 211 WebVector<WebURL>* frameURLs) { 212 WebLocalFrameImpl* mainFrame = toWebLocalFrameImpl(view->mainFrame()); 213 if (!mainFrame) 214 return false; 215 216 Vector<LocalFrame*> framesToVisit; 217 Vector<LocalFrame*> visitedFrames; 218 Vector<KURL> frameKURLs; 219 Vector<KURL> resourceKURLs; 220 221 // Let's retrieve the resources from every frame in this page. 222 framesToVisit.append(mainFrame->frame()); 223 while (!framesToVisit.isEmpty()) { 224 LocalFrame* frame = framesToVisit[0]; 225 framesToVisit.remove(0); 226 retrieveResourcesForFrame(frame, supportedSchemes, 227 &visitedFrames, &framesToVisit, 228 &frameKURLs, &resourceKURLs); 229 } 230 231 // Converts the results to WebURLs. 232 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 233 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 234 resultResourceURLs[i] = resourceKURLs[i]; 235 // A frame's src can point to the same URL as another resource, keep the 236 // resource URL only in such cases. 237 size_t index = frameKURLs.find(resourceKURLs[i]); 238 if (index != kNotFound) 239 frameKURLs.remove(index); 240 } 241 *resourceURLs = resultResourceURLs; 242 WebVector<WebURL> resultFrameURLs(frameKURLs.size()); 243 for (size_t i = 0; i < frameKURLs.size(); ++i) 244 resultFrameURLs[i] = frameKURLs[i]; 245 *frameURLs = resultFrameURLs; 246 247 return true; 248 } 249 250 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) 251 { 252 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">"; 253 return charsetString; 254 } 255 256 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 257 { 258 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 259 static_cast<int>(url.spec().length()), 260 url.spec().data()); 261 } 262 263 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) 264 { 265 if (baseTarget.isEmpty()) 266 return String("<base href=\".\">"); 267 String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">"; 268 return baseString; 269 } 270 271 } // namespace blink 272