Home | History | Annotate | Download | only in web
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "public/web/WebPageSerializer.h"
     33 
     34 #include "core/HTMLNames.h"
     35 #include "core/dom/Document.h"
     36 #include "core/dom/Element.h"
     37 #include "core/frame/LocalFrame.h"
     38 #include "core/html/HTMLAllCollection.h"
     39 #include "core/html/HTMLFrameElementBase.h"
     40 #include "core/html/HTMLFrameOwnerElement.h"
     41 #include "core/html/HTMLInputElement.h"
     42 #include "core/html/HTMLTableElement.h"
     43 #include "core/loader/DocumentLoader.h"
     44 #include "core/page/Page.h"
     45 #include "core/page/PageSerializer.h"
     46 #include "platform/SerializedResource.h"
     47 #include "platform/mhtml/MHTMLArchive.h"
     48 #include "platform/weborigin/KURL.h"
     49 #include "public/platform/WebCString.h"
     50 #include "public/platform/WebString.h"
     51 #include "public/platform/WebURL.h"
     52 #include "public/platform/WebVector.h"
     53 #include "public/web/WebFrame.h"
     54 #include "public/web/WebPageSerializerClient.h"
     55 #include "public/web/WebView.h"
     56 #include "web/WebLocalFrameImpl.h"
     57 #include "web/WebPageSerializerImpl.h"
     58 #include "web/WebViewImpl.h"
     59 #include "wtf/Vector.h"
     60 #include "wtf/text/StringConcatenate.h"
     61 
     62 namespace blink {
     63 
     64 namespace {
     65 
     66 KURL getSubResourceURLFromElement(Element* element)
     67 {
     68     ASSERT(element);
     69     const QualifiedName& attributeName = element->subResourceAttributeName();
     70     if (attributeName == QualifiedName::null())
     71         return KURL();
     72 
     73     String value = element->getAttribute(attributeName);
     74     // Ignore javascript content.
     75     if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
     76         return KURL();
     77 
     78     return element->document().completeURL(value);
     79 }
     80 
     81 void retrieveResourcesForElement(Element* element,
     82                                  Vector<LocalFrame*>* visitedFrames,
     83                                  Vector<LocalFrame*>* framesToVisit,
     84                                  Vector<KURL>* frameURLs,
     85                                  Vector<KURL>* resourceURLs)
     86 {
     87     ASSERT(element);
     88     // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
     89     if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isHTMLEmbedElement(*element)) {
     90         Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame();
     91         if (frame && frame->isLocalFrame()) {
     92             if (!visitedFrames->contains(toLocalFrame(frame)))
     93                 framesToVisit->append(toLocalFrame(frame));
     94             return;
     95         }
     96     }
     97 
     98     KURL url = getSubResourceURLFromElement(element);
     99     if (url.isEmpty() || !url.isValid())
    100         return; // No subresource for this node.
    101 
    102     // Ignore URLs that have a non-standard protocols. Since the FTP protocol
    103     // does no have a cache mechanism, we skip it as well.
    104     if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
    105         return;
    106 
    107     if (!resourceURLs->contains(url))
    108         resourceURLs->append(url);
    109 }
    110 
    111 void retrieveResourcesForFrame(LocalFrame* frame,
    112     const WebVector<WebCString>& supportedSchemes,
    113     Vector<LocalFrame*>* visitedFrames,
    114     Vector<LocalFrame*>* framesToVisit,
    115     Vector<KURL>* frameURLs,
    116     Vector<KURL>* resourceURLs)
    117 {
    118     KURL frameURL = frame->loader().documentLoader()->request().url();
    119 
    120     // If the frame's URL is invalid, ignore it, it is not retrievable.
    121     if (!frameURL.isValid())
    122         return;
    123 
    124     // Ignore frames from unsupported schemes.
    125     bool isValidScheme = false;
    126     for (size_t i = 0; i < supportedSchemes.size(); ++i) {
    127         if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
    128             isValidScheme = true;
    129             break;
    130         }
    131     }
    132     if (!isValidScheme)
    133         return;
    134 
    135     // If we have already seen that frame, ignore it.
    136     if (visitedFrames->contains(frame))
    137         return;
    138     visitedFrames->append(frame);
    139     if (!frameURLs->contains(frameURL))
    140         frameURLs->append(frameURL);
    141 
    142     // Now get the resources associated with each node of the document.
    143     RefPtrWillBeRawPtr<HTMLAllCollection> allElements = frame->document()->all();
    144     for (unsigned i = 0; i < allElements->length(); ++i) {
    145         Element* element = allElements->item(i);
    146         retrieveResourcesForElement(element,
    147                                     visitedFrames, framesToVisit,
    148                                     frameURLs, resourceURLs);
    149     }
    150 }
    151 
    152 } // namespace
    153 
    154 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
    155 {
    156     Vector<SerializedResource> resources;
    157     PageSerializer serializer(&resources);
    158     serializer.serialize(toWebViewImpl(view)->page());
    159 
    160     Vector<Resource> result;
    161     for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
    162         Resource resource;
    163         resource.url = iter->url;
    164         resource.mimeType = iter->mimeType.ascii();
    165         // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData().
    166         resource.data = WebCString(iter->data->data(), iter->data->size());
    167         result.append(resource);
    168     }
    169 
    170     *resourcesParam = result;
    171 }
    172 
    173 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy)
    174 {
    175     Vector<SerializedResource> resources;
    176     PageSerializer serializer(&resources);
    177     serializer.serialize(page);
    178     Document* document = page->deprecatedLocalMainFrame()->document();
    179     return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType());
    180 }
    181 
    182 WebCString WebPageSerializer::serializeToMHTML(WebView* view)
    183 {
    184     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseDefaultEncoding);
    185     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
    186     return WebCString(mhtml->data(), mhtml->size());
    187 }
    188 
    189 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
    190 {
    191     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page(), MHTMLArchive::UseBinaryEncoding);
    192     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
    193     return WebCString(mhtml->data(), mhtml->size());
    194 }
    195 
    196 bool WebPageSerializer::serialize(WebLocalFrame* frame,
    197                                   bool recursive,
    198                                   WebPageSerializerClient* client,
    199                                   const WebVector<WebURL>& links,
    200                                   const WebVector<WebString>& localPaths,
    201                                   const WebString& localDirectoryName)
    202 {
    203     WebPageSerializerImpl serializerImpl(
    204         frame, recursive, client, links, localPaths, localDirectoryName);
    205     return serializerImpl.serialize();
    206 }
    207 
    208 bool WebPageSerializer::retrieveAllResources(WebView* view,
    209                                              const WebVector<WebCString>& supportedSchemes,
    210                                              WebVector<WebURL>* resourceURLs,
    211                                              WebVector<WebURL>* frameURLs) {
    212     WebLocalFrameImpl* mainFrame = toWebLocalFrameImpl(view->mainFrame());
    213     if (!mainFrame)
    214         return false;
    215 
    216     Vector<LocalFrame*> framesToVisit;
    217     Vector<LocalFrame*> visitedFrames;
    218     Vector<KURL> frameKURLs;
    219     Vector<KURL> resourceKURLs;
    220 
    221     // Let's retrieve the resources from every frame in this page.
    222     framesToVisit.append(mainFrame->frame());
    223     while (!framesToVisit.isEmpty()) {
    224         LocalFrame* frame = framesToVisit[0];
    225         framesToVisit.remove(0);
    226         retrieveResourcesForFrame(frame, supportedSchemes,
    227                                   &visitedFrames, &framesToVisit,
    228                                   &frameKURLs, &resourceKURLs);
    229     }
    230 
    231     // Converts the results to WebURLs.
    232     WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
    233     for (size_t i = 0; i < resourceKURLs.size(); ++i) {
    234         resultResourceURLs[i] = resourceKURLs[i];
    235         // A frame's src can point to the same URL as another resource, keep the
    236         // resource URL only in such cases.
    237         size_t index = frameKURLs.find(resourceKURLs[i]);
    238         if (index != kNotFound)
    239             frameKURLs.remove(index);
    240     }
    241     *resourceURLs = resultResourceURLs;
    242     WebVector<WebURL> resultFrameURLs(frameKURLs.size());
    243     for (size_t i = 0; i < frameKURLs.size(); ++i)
    244         resultFrameURLs[i] = frameKURLs[i];
    245     *frameURLs = resultFrameURLs;
    246 
    247     return true;
    248 }
    249 
    250 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
    251 {
    252     String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">";
    253     return charsetString;
    254 }
    255 
    256 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
    257 {
    258     return String::format("\n<!-- saved from url=(%04d)%s -->\n",
    259                           static_cast<int>(url.spec().length()),
    260                           url.spec().data());
    261 }
    262 
    263 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
    264 {
    265     if (baseTarget.isEmpty())
    266         return String("<base href=\".\">");
    267     String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">";
    268     return baseString;
    269 }
    270 
    271 } // namespace blink
    272