Home | History | Annotate | Download | only in web
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "public/web/WebPageSerializer.h"
     33 
     34 #include "core/HTMLNames.h"
     35 #include "core/dom/Document.h"
     36 #include "core/dom/Element.h"
     37 #include "core/frame/LocalFrame.h"
     38 #include "core/html/HTMLAllCollection.h"
     39 #include "core/html/HTMLFrameElementBase.h"
     40 #include "core/html/HTMLFrameOwnerElement.h"
     41 #include "core/html/HTMLInputElement.h"
     42 #include "core/html/HTMLTableElement.h"
     43 #include "core/loader/DocumentLoader.h"
     44 #include "core/page/PageSerializer.h"
     45 #include "platform/SerializedResource.h"
     46 #include "platform/mhtml/MHTMLArchive.h"
     47 #include "platform/weborigin/KURL.h"
     48 #include "public/platform/WebCString.h"
     49 #include "public/platform/WebString.h"
     50 #include "public/platform/WebURL.h"
     51 #include "public/platform/WebVector.h"
     52 #include "public/web/WebFrame.h"
     53 #include "public/web/WebPageSerializerClient.h"
     54 #include "public/web/WebView.h"
     55 #include "web/WebLocalFrameImpl.h"
     56 #include "web/WebPageSerializerImpl.h"
     57 #include "web/WebViewImpl.h"
     58 #include "wtf/Vector.h"
     59 #include "wtf/text/StringConcatenate.h"
     60 
     61 using namespace WebCore;
     62 
     63 namespace {
     64 
     65 KURL getSubResourceURLFromElement(Element* element)
     66 {
     67     ASSERT(element);
     68     const QualifiedName& attributeName = element->subResourceAttributeName();
     69     if (attributeName == QualifiedName::null())
     70         return KURL();
     71 
     72     String value = element->getAttribute(attributeName);
     73     // Ignore javascript content.
     74     if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
     75         return KURL();
     76 
     77     return element->document().completeURL(value);
     78 }
     79 
     80 void retrieveResourcesForElement(Element* element,
     81                                  Vector<LocalFrame*>* visitedFrames,
     82                                  Vector<LocalFrame*>* framesToVisit,
     83                                  Vector<KURL>* frameURLs,
     84                                  Vector<KURL>* resourceURLs)
     85 {
     86     ASSERT(element);
     87     // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
     88     if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isHTMLEmbedElement(*element)) {
     89         Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame();
     90         if (frame && frame->isLocalFrame()) {
     91             if (!visitedFrames->contains(toLocalFrame(frame)))
     92                 framesToVisit->append(toLocalFrame(frame));
     93             return;
     94         }
     95     }
     96 
     97     KURL url = getSubResourceURLFromElement(element);
     98     if (url.isEmpty() || !url.isValid())
     99         return; // No subresource for this node.
    100 
    101     // Ignore URLs that have a non-standard protocols. Since the FTP protocol
    102     // does no have a cache mechanism, we skip it as well.
    103     if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
    104         return;
    105 
    106     if (!resourceURLs->contains(url))
    107         resourceURLs->append(url);
    108 }
    109 
    110 void retrieveResourcesForFrame(LocalFrame* frame,
    111                                const blink::WebVector<blink::WebCString>& supportedSchemes,
    112                                Vector<LocalFrame*>* visitedFrames,
    113                                Vector<LocalFrame*>* framesToVisit,
    114                                Vector<KURL>* frameURLs,
    115                                Vector<KURL>* resourceURLs)
    116 {
    117     KURL frameURL = frame->loader().documentLoader()->request().url();
    118 
    119     // If the frame's URL is invalid, ignore it, it is not retrievable.
    120     if (!frameURL.isValid())
    121         return;
    122 
    123     // Ignore frames from unsupported schemes.
    124     bool isValidScheme = false;
    125     for (size_t i = 0; i < supportedSchemes.size(); ++i) {
    126         if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
    127             isValidScheme = true;
    128             break;
    129         }
    130     }
    131     if (!isValidScheme)
    132         return;
    133 
    134     // If we have already seen that frame, ignore it.
    135     if (visitedFrames->contains(frame))
    136         return;
    137     visitedFrames->append(frame);
    138     if (!frameURLs->contains(frameURL))
    139         frameURLs->append(frameURL);
    140 
    141     // Now get the resources associated with each node of the document.
    142     RefPtrWillBeRawPtr<HTMLAllCollection> allElements = frame->document()->all();
    143     for (unsigned i = 0; i < allElements->length(); ++i) {
    144         Element* element = allElements->item(i);
    145         retrieveResourcesForElement(element,
    146                                     visitedFrames, framesToVisit,
    147                                     frameURLs, resourceURLs);
    148     }
    149 }
    150 
    151 } // namespace
    152 
    153 namespace blink {
    154 
    155 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
    156 {
    157     Vector<SerializedResource> resources;
    158     PageSerializer serializer(&resources);
    159     serializer.serialize(toWebViewImpl(view)->page());
    160 
    161     Vector<Resource> result;
    162     for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
    163         Resource resource;
    164         resource.url = iter->url;
    165         resource.mimeType = iter->mimeType.ascii();
    166         // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData().
    167         resource.data = WebCString(iter->data->data(), iter->data->size());
    168         result.append(resource);
    169     }
    170 
    171     *resourcesParam = result;
    172 }
    173 
    174 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy)
    175 {
    176     Vector<SerializedResource> resources;
    177     PageSerializer serializer(&resources);
    178     serializer.serialize(page);
    179     Document* document = page->deprecatedLocalMainFrame()->document();
    180     return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType());
    181 }
    182 
    183 WebCString WebPageSerializer::serializeToMHTML(WebView* view)
    184 {
    185     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding);
    186     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
    187     return WebCString(mhtml->data(), mhtml->size());
    188 }
    189 
    190 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
    191 {
    192     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding);
    193     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
    194     return WebCString(mhtml->data(), mhtml->size());
    195 }
    196 
    197 bool WebPageSerializer::serialize(WebLocalFrame* frame,
    198                                   bool recursive,
    199                                   WebPageSerializerClient* client,
    200                                   const WebVector<WebURL>& links,
    201                                   const WebVector<WebString>& localPaths,
    202                                   const WebString& localDirectoryName)
    203 {
    204     WebPageSerializerImpl serializerImpl(
    205         frame, recursive, client, links, localPaths, localDirectoryName);
    206     return serializerImpl.serialize();
    207 }
    208 
    209 bool WebPageSerializer::retrieveAllResources(WebView* view,
    210                                              const WebVector<WebCString>& supportedSchemes,
    211                                              WebVector<WebURL>* resourceURLs,
    212                                              WebVector<WebURL>* frameURLs) {
    213     WebLocalFrameImpl* mainFrame = toWebLocalFrameImpl(view->mainFrame());
    214     if (!mainFrame)
    215         return false;
    216 
    217     Vector<LocalFrame*> framesToVisit;
    218     Vector<LocalFrame*> visitedFrames;
    219     Vector<KURL> frameKURLs;
    220     Vector<KURL> resourceKURLs;
    221 
    222     // Let's retrieve the resources from every frame in this page.
    223     framesToVisit.append(mainFrame->frame());
    224     while (!framesToVisit.isEmpty()) {
    225         LocalFrame* frame = framesToVisit[0];
    226         framesToVisit.remove(0);
    227         retrieveResourcesForFrame(frame, supportedSchemes,
    228                                   &visitedFrames, &framesToVisit,
    229                                   &frameKURLs, &resourceKURLs);
    230     }
    231 
    232     // Converts the results to WebURLs.
    233     WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
    234     for (size_t i = 0; i < resourceKURLs.size(); ++i) {
    235         resultResourceURLs[i] = resourceKURLs[i];
    236         // A frame's src can point to the same URL as another resource, keep the
    237         // resource URL only in such cases.
    238         size_t index = frameKURLs.find(resourceKURLs[i]);
    239         if (index != kNotFound)
    240             frameKURLs.remove(index);
    241     }
    242     *resourceURLs = resultResourceURLs;
    243     WebVector<WebURL> resultFrameURLs(frameKURLs.size());
    244     for (size_t i = 0; i < frameKURLs.size(); ++i)
    245         resultFrameURLs[i] = frameKURLs[i];
    246     *frameURLs = resultFrameURLs;
    247 
    248     return true;
    249 }
    250 
    251 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
    252 {
    253     String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">";
    254     return charsetString;
    255 }
    256 
    257 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
    258 {
    259     return String::format("\n<!-- saved from url=(%04d)%s -->\n",
    260                           static_cast<int>(url.spec().length()),
    261                           url.spec().data());
    262 }
    263 
    264 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
    265 {
    266     if (baseTarget.isEmpty())
    267         return String("<base href=\".\">");
    268     String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">";
    269     return baseString;
    270 }
    271 
    272 } // namespace blink
    273