1 /* 2 * Copyright (C) 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "public/web/WebPageSerializer.h" 33 34 #include "core/HTMLNames.h" 35 #include "core/dom/Document.h" 36 #include "core/dom/Element.h" 37 #include "core/frame/LocalFrame.h" 38 #include "core/html/HTMLAllCollection.h" 39 #include "core/html/HTMLFrameElementBase.h" 40 #include "core/html/HTMLFrameOwnerElement.h" 41 #include "core/html/HTMLInputElement.h" 42 #include "core/html/HTMLTableElement.h" 43 #include "core/loader/DocumentLoader.h" 44 #include "core/page/PageSerializer.h" 45 #include "platform/SerializedResource.h" 46 #include "platform/mhtml/MHTMLArchive.h" 47 #include "platform/weborigin/KURL.h" 48 #include "public/platform/WebCString.h" 49 #include "public/platform/WebString.h" 50 #include "public/platform/WebURL.h" 51 #include "public/platform/WebVector.h" 52 #include "public/web/WebFrame.h" 53 #include "public/web/WebPageSerializerClient.h" 54 #include "public/web/WebView.h" 55 #include "web/WebLocalFrameImpl.h" 56 #include "web/WebPageSerializerImpl.h" 57 #include "web/WebViewImpl.h" 58 #include "wtf/Vector.h" 59 #include "wtf/text/StringConcatenate.h" 60 61 using namespace WebCore; 62 63 namespace { 64 65 KURL getSubResourceURLFromElement(Element* element) 66 { 67 ASSERT(element); 68 const QualifiedName& attributeName = element->subResourceAttributeName(); 69 if (attributeName == QualifiedName::null()) 70 return KURL(); 71 72 String value = element->getAttribute(attributeName); 73 // Ignore javascript content. 74 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false)) 75 return KURL(); 76 77 return element->document().completeURL(value); 78 } 79 80 void retrieveResourcesForElement(Element* element, 81 Vector<LocalFrame*>* visitedFrames, 82 Vector<LocalFrame*>* framesToVisit, 83 Vector<KURL>* frameURLs, 84 Vector<KURL>* resourceURLs) 85 { 86 ASSERT(element); 87 // If the node is a frame, we'll process it later in retrieveResourcesForFrame. 88 if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isHTMLEmbedElement(*element)) { 89 Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame(); 90 if (frame && frame->isLocalFrame()) { 91 if (!visitedFrames->contains(toLocalFrame(frame))) 92 framesToVisit->append(toLocalFrame(frame)); 93 return; 94 } 95 } 96 97 KURL url = getSubResourceURLFromElement(element); 98 if (url.isEmpty() || !url.isValid()) 99 return; // No subresource for this node. 100 101 // Ignore URLs that have a non-standard protocols. Since the FTP protocol 102 // does no have a cache mechanism, we skip it as well. 103 if (!url.protocolIsInHTTPFamily() && !url.isLocalFile()) 104 return; 105 106 if (!resourceURLs->contains(url)) 107 resourceURLs->append(url); 108 } 109 110 void retrieveResourcesForFrame(LocalFrame* frame, 111 const blink::WebVector<blink::WebCString>& supportedSchemes, 112 Vector<LocalFrame*>* visitedFrames, 113 Vector<LocalFrame*>* framesToVisit, 114 Vector<KURL>* frameURLs, 115 Vector<KURL>* resourceURLs) 116 { 117 KURL frameURL = frame->loader().documentLoader()->request().url(); 118 119 // If the frame's URL is invalid, ignore it, it is not retrievable. 120 if (!frameURL.isValid()) 121 return; 122 123 // Ignore frames from unsupported schemes. 124 bool isValidScheme = false; 125 for (size_t i = 0; i < supportedSchemes.size(); ++i) { 126 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) { 127 isValidScheme = true; 128 break; 129 } 130 } 131 if (!isValidScheme) 132 return; 133 134 // If we have already seen that frame, ignore it. 135 if (visitedFrames->contains(frame)) 136 return; 137 visitedFrames->append(frame); 138 if (!frameURLs->contains(frameURL)) 139 frameURLs->append(frameURL); 140 141 // Now get the resources associated with each node of the document. 142 RefPtrWillBeRawPtr<HTMLAllCollection> allElements = frame->document()->all(); 143 for (unsigned i = 0; i < allElements->length(); ++i) { 144 Element* element = allElements->item(i); 145 retrieveResourcesForElement(element, 146 visitedFrames, framesToVisit, 147 frameURLs, resourceURLs); 148 } 149 } 150 151 } // namespace 152 153 namespace blink { 154 155 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam) 156 { 157 Vector<SerializedResource> resources; 158 PageSerializer serializer(&resources); 159 serializer.serialize(toWebViewImpl(view)->page()); 160 161 Vector<Resource> result; 162 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) { 163 Resource resource; 164 resource.url = iter->url; 165 resource.mimeType = iter->mimeType.ascii(); 166 // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData(). 167 resource.data = WebCString(iter->data->data(), iter->data->size()); 168 result.append(resource); 169 } 170 171 *resourcesParam = result; 172 } 173 174 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy) 175 { 176 Vector<SerializedResource> resources; 177 PageSerializer serializer(&resources); 178 serializer.serialize(page); 179 Document* document = page->deprecatedLocalMainFrame()->document(); 180 return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType()); 181 } 182 183 WebCString WebPageSerializer::serializeToMHTML(WebView* view) 184 { 185 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding); 186 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 187 return WebCString(mhtml->data(), mhtml->size()); 188 } 189 190 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view) 191 { 192 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding); 193 // FIXME: we are copying all the data here. Idealy we would have a WebSharedData(). 194 return WebCString(mhtml->data(), mhtml->size()); 195 } 196 197 bool WebPageSerializer::serialize(WebLocalFrame* frame, 198 bool recursive, 199 WebPageSerializerClient* client, 200 const WebVector<WebURL>& links, 201 const WebVector<WebString>& localPaths, 202 const WebString& localDirectoryName) 203 { 204 WebPageSerializerImpl serializerImpl( 205 frame, recursive, client, links, localPaths, localDirectoryName); 206 return serializerImpl.serialize(); 207 } 208 209 bool WebPageSerializer::retrieveAllResources(WebView* view, 210 const WebVector<WebCString>& supportedSchemes, 211 WebVector<WebURL>* resourceURLs, 212 WebVector<WebURL>* frameURLs) { 213 WebLocalFrameImpl* mainFrame = toWebLocalFrameImpl(view->mainFrame()); 214 if (!mainFrame) 215 return false; 216 217 Vector<LocalFrame*> framesToVisit; 218 Vector<LocalFrame*> visitedFrames; 219 Vector<KURL> frameKURLs; 220 Vector<KURL> resourceKURLs; 221 222 // Let's retrieve the resources from every frame in this page. 223 framesToVisit.append(mainFrame->frame()); 224 while (!framesToVisit.isEmpty()) { 225 LocalFrame* frame = framesToVisit[0]; 226 framesToVisit.remove(0); 227 retrieveResourcesForFrame(frame, supportedSchemes, 228 &visitedFrames, &framesToVisit, 229 &frameKURLs, &resourceKURLs); 230 } 231 232 // Converts the results to WebURLs. 233 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 234 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 235 resultResourceURLs[i] = resourceKURLs[i]; 236 // A frame's src can point to the same URL as another resource, keep the 237 // resource URL only in such cases. 238 size_t index = frameKURLs.find(resourceKURLs[i]); 239 if (index != kNotFound) 240 frameKURLs.remove(index); 241 } 242 *resourceURLs = resultResourceURLs; 243 WebVector<WebURL> resultFrameURLs(frameKURLs.size()); 244 for (size_t i = 0; i < frameKURLs.size(); ++i) 245 resultFrameURLs[i] = frameKURLs[i]; 246 *frameURLs = resultFrameURLs; 247 248 return true; 249 } 250 251 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) 252 { 253 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">"; 254 return charsetString; 255 } 256 257 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 258 { 259 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 260 static_cast<int>(url.spec().length()), 261 url.spec().data()); 262 } 263 264 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) 265 { 266 if (baseTarget.isEmpty()) 267 return String("<base href=\".\">"); 268 String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">"; 269 return baseString; 270 } 271 272 } // namespace blink 273