Home | History | Annotate | Download | only in web
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "WebPageSerializer.h"
     33 
     34 #include "HTMLNames.h"
     35 #include "WebFrame.h"
     36 #include "WebFrameImpl.h"
     37 #include "WebPageSerializerClient.h"
     38 #include "WebPageSerializerImpl.h"
     39 #include "WebView.h"
     40 #include "WebViewImpl.h"
     41 #include "core/dom/Document.h"
     42 #include "core/dom/Element.h"
     43 #include "core/html/HTMLAllCollection.h"
     44 #include "core/html/HTMLFrameOwnerElement.h"
     45 #include "core/html/HTMLInputElement.h"
     46 #include "core/html/HTMLTableElement.h"
     47 #include "core/loader/DocumentLoader.h"
     48 #include "core/loader/archive/MHTMLArchive.h"
     49 #include "core/page/Frame.h"
     50 #include "core/page/PageSerializer.h"
     51 #include "core/platform/SerializedResource.h"
     52 #include "public/platform/WebCString.h"
     53 #include "public/platform/WebString.h"
     54 #include "public/platform/WebURL.h"
     55 #include "public/platform/WebVector.h"
     56 #include "weborigin/KURL.h"
     57 #include "wtf/Vector.h"
     58 #include "wtf/text/StringConcatenate.h"
     59 
     60 using namespace WebCore;
     61 
     62 namespace {
     63 
     64 KURL getSubResourceURLFromElement(Element* element)
     65 {
     66     ASSERT(element);
     67     const QualifiedName* attributeName = 0;
     68     if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
     69         attributeName = &HTMLNames::srcAttr;
     70     else if (element->hasTagName(HTMLNames::inputTag)) {
     71         if (toHTMLInputElement(element)->isImageButton())
     72             attributeName = &HTMLNames::srcAttr;
     73     } else if (element->hasTagName(HTMLNames::bodyTag)
     74         || isHTMLTableElement(element)
     75         || element->hasTagName(HTMLNames::trTag)
     76         || element->hasTagName(HTMLNames::tdTag))
     77         attributeName = &HTMLNames::backgroundAttr;
     78     else if (element->hasTagName(HTMLNames::blockquoteTag)
     79              || element->hasTagName(HTMLNames::qTag)
     80              || element->hasTagName(HTMLNames::delTag)
     81              || element->hasTagName(HTMLNames::insTag))
     82         attributeName = &HTMLNames::citeAttr;
     83     else if (element->hasTagName(HTMLNames::linkTag)) {
     84         // If the link element is not css, ignore it.
     85         if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
     86             // FIXME: Add support for extracting links of sub-resources which
     87             // are inside style-sheet such as @import, @font-face, url(), etc.
     88             attributeName = &HTMLNames::hrefAttr;
     89         }
     90     } else if (element->hasTagName(HTMLNames::objectTag))
     91         attributeName = &HTMLNames::dataAttr;
     92     else if (element->hasTagName(HTMLNames::embedTag))
     93         attributeName = &HTMLNames::srcAttr;
     94 
     95     if (!attributeName)
     96         return KURL();
     97 
     98     String value = element->getAttribute(*attributeName);
     99     // Ignore javascript content.
    100     if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
    101         return KURL();
    102 
    103     return element->document()->completeURL(value);
    104 }
    105 
    106 void retrieveResourcesForElement(Element* element,
    107                                  Vector<Frame*>* visitedFrames,
    108                                  Vector<Frame*>* framesToVisit,
    109                                  Vector<KURL>* frameURLs,
    110                                  Vector<KURL>* resourceURLs)
    111 {
    112     // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
    113     if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
    114         || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
    115             && element->isFrameOwnerElement()) {
    116         Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
    117         if (frame) {
    118             if (!visitedFrames->contains(frame))
    119                 framesToVisit->append(frame);
    120             return;
    121         }
    122     }
    123 
    124     KURL url = getSubResourceURLFromElement(element);
    125     if (url.isEmpty() || !url.isValid())
    126         return; // No subresource for this node.
    127 
    128     // Ignore URLs that have a non-standard protocols. Since the FTP protocol
    129     // does no have a cache mechanism, we skip it as well.
    130     if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
    131         return;
    132 
    133     if (!resourceURLs->contains(url))
    134         resourceURLs->append(url);
    135 }
    136 
    137 void retrieveResourcesForFrame(Frame* frame,
    138                                const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
    139                                Vector<Frame*>* visitedFrames,
    140                                Vector<Frame*>* framesToVisit,
    141                                Vector<KURL>* frameURLs,
    142                                Vector<KURL>* resourceURLs)
    143 {
    144     KURL frameURL = frame->loader()->documentLoader()->request().url();
    145 
    146     // If the frame's URL is invalid, ignore it, it is not retrievable.
    147     if (!frameURL.isValid())
    148         return;
    149 
    150     // Ignore frames from unsupported schemes.
    151     bool isValidScheme = false;
    152     for (size_t i = 0; i < supportedSchemes.size(); ++i) {
    153         if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
    154             isValidScheme = true;
    155             break;
    156         }
    157     }
    158     if (!isValidScheme)
    159         return;
    160 
    161     // If we have already seen that frame, ignore it.
    162     if (visitedFrames->contains(frame))
    163         return;
    164     visitedFrames->append(frame);
    165     if (!frameURLs->contains(frameURL))
    166         frameURLs->append(frameURL);
    167 
    168     // Now get the resources associated with each node of the document.
    169     RefPtr<HTMLCollection> allNodes = frame->document()->all();
    170     for (unsigned i = 0; i < allNodes->length(); ++i) {
    171         Node* node = allNodes->item(i);
    172         // We are only interested in HTML resources.
    173         if (!node->isElementNode())
    174             continue;
    175         retrieveResourcesForElement(toElement(node),
    176                                     visitedFrames, framesToVisit,
    177                                     frameURLs, resourceURLs);
    178     }
    179 }
    180 
    181 } // namespace
    182 
    183 namespace WebKit {
    184 
    185 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
    186 {
    187     Vector<SerializedResource> resources;
    188     PageSerializer serializer(&resources);
    189     serializer.serialize(static_cast<WebViewImpl*>(view)->page());
    190 
    191     Vector<Resource> result;
    192     for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
    193         Resource resource;
    194         resource.url = iter->url;
    195         resource.mimeType = iter->mimeType.ascii();
    196         // FIXME: we are copying all the resource data here. Idealy we would have a WebSharedData().
    197         resource.data = WebCString(iter->data->data(), iter->data->size());
    198         result.append(resource);
    199     }
    200 
    201     *resourcesParam = result;
    202 }
    203 
    204 static PassRefPtr<SharedBuffer> serializePageToMHTML(Page* page, MHTMLArchive::EncodingPolicy encodingPolicy)
    205 {
    206     Vector<SerializedResource> resources;
    207     PageSerializer serializer(&resources);
    208     serializer.serialize(page);
    209     Document* document = page->mainFrame()->document();
    210     return MHTMLArchive::generateMHTMLData(resources, encodingPolicy, document->title(), document->suggestedMIMEType());
    211 }
    212 
    213 WebCString WebPageSerializer::serializeToMHTML(WebView* view)
    214 {
    215     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(static_cast<WebViewImpl*>(view)->page(), MHTMLArchive::UseDefaultEncoding);
    216     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
    217     return WebCString(mhtml->data(), mhtml->size());
    218 }
    219 
    220 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
    221 {
    222     RefPtr<SharedBuffer> mhtml = serializePageToMHTML(static_cast<WebViewImpl*>(view)->page(), MHTMLArchive::UseBinaryEncoding);
    223     // FIXME: we are copying all the data here. Idealy we would have a WebSharedData().
    224     return WebCString(mhtml->data(), mhtml->size());
    225 }
    226 
    227 bool WebPageSerializer::serialize(WebFrame* frame,
    228                                   bool recursive,
    229                                   WebPageSerializerClient* client,
    230                                   const WebVector<WebURL>& links,
    231                                   const WebVector<WebString>& localPaths,
    232                                   const WebString& localDirectoryName)
    233 {
    234     WebPageSerializerImpl serializerImpl(
    235         frame, recursive, client, links, localPaths, localDirectoryName);
    236     return serializerImpl.serialize();
    237 }
    238 
    239 bool WebPageSerializer::retrieveAllResources(WebView* view,
    240                                              const WebVector<WebCString>& supportedSchemes,
    241                                              WebVector<WebURL>* resourceURLs,
    242                                              WebVector<WebURL>* frameURLs) {
    243     WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
    244     if (!mainFrame)
    245         return false;
    246 
    247     Vector<Frame*> framesToVisit;
    248     Vector<Frame*> visitedFrames;
    249     Vector<KURL> frameKURLs;
    250     Vector<KURL> resourceKURLs;
    251 
    252     // Let's retrieve the resources from every frame in this page.
    253     framesToVisit.append(mainFrame->frame());
    254     while (!framesToVisit.isEmpty()) {
    255         Frame* frame = framesToVisit[0];
    256         framesToVisit.remove(0);
    257         retrieveResourcesForFrame(frame, supportedSchemes,
    258                                   &visitedFrames, &framesToVisit,
    259                                   &frameKURLs, &resourceKURLs);
    260     }
    261 
    262     // Converts the results to WebURLs.
    263     WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
    264     for (size_t i = 0; i < resourceKURLs.size(); ++i) {
    265         resultResourceURLs[i] = resourceKURLs[i];
    266         // A frame's src can point to the same URL as another resource, keep the
    267         // resource URL only in such cases.
    268         size_t index = frameKURLs.find(resourceKURLs[i]);
    269         if (index != notFound)
    270             frameKURLs.remove(index);
    271     }
    272     *resourceURLs = resultResourceURLs;
    273     WebVector<WebURL> resultFrameURLs(frameKURLs.size());
    274     for (size_t i = 0; i < frameKURLs.size(); ++i)
    275         resultFrameURLs[i] = frameKURLs[i];
    276     *frameURLs = resultFrameURLs;
    277 
    278     return true;
    279 }
    280 
    281 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
    282 {
    283     String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">";
    284     return charsetString;
    285 }
    286 
    287 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
    288 {
    289     return String::format("\n<!-- saved from url=(%04d)%s -->\n",
    290                           static_cast<int>(url.spec().length()),
    291                           url.spec().data());
    292 }
    293 
    294 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
    295 {
    296     if (baseTarget.isEmpty())
    297         return String("<base href=\".\">");
    298     String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">";
    299     return baseString;
    300 }
    301 
    302 } // namespace WebKit
    303