Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "WebPageSerializer.h"
     33 
     34 #include "DocumentLoader.h"
     35 #include "Element.h"
     36 #include "Frame.h"
     37 #include "HTMLAllCollection.h"
     38 #include "HTMLFrameOwnerElement.h"
     39 #include "HTMLInputElement.h"
     40 #include "HTMLNames.h"
     41 #include "KURL.h"
     42 #include "Vector.h"
     43 
     44 #include "WebCString.h"
     45 #include "WebFrame.h"
     46 #include "WebFrameImpl.h"
     47 #include "WebPageSerializerClient.h"
     48 #include "WebPageSerializerImpl.h"
     49 #include "WebString.h"
     50 #include "WebURL.h"
     51 #include "WebVector.h"
     52 #include "WebView.h"
     53 
     54 #include <wtf/text/StringConcatenate.h>
     55 
     56 using namespace WebCore;
     57 
     58 namespace {
     59 
     60 KURL getSubResourceURLFromElement(Element* element)
     61 {
     62     ASSERT(element);
     63     const QualifiedName* attributeName = 0;
     64     if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
     65         attributeName = &HTMLNames::srcAttr;
     66     else if (element->hasTagName(HTMLNames::inputTag)) {
     67         HTMLInputElement* input = static_cast<HTMLInputElement*>(element);
     68         if (input->isImageButton())
     69             attributeName = &HTMLNames::srcAttr;
     70     } else if (element->hasTagName(HTMLNames::bodyTag)
     71                || element->hasTagName(HTMLNames::tableTag)
     72                || element->hasTagName(HTMLNames::trTag)
     73                || element->hasTagName(HTMLNames::tdTag))
     74         attributeName = &HTMLNames::backgroundAttr;
     75     else if (element->hasTagName(HTMLNames::blockquoteTag)
     76              || element->hasTagName(HTMLNames::qTag)
     77              || element->hasTagName(HTMLNames::delTag)
     78              || element->hasTagName(HTMLNames::insTag))
     79         attributeName = &HTMLNames::citeAttr;
     80     else if (element->hasTagName(HTMLNames::linkTag)) {
     81         // If the link element is not css, ignore it.
     82         if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
     83             // FIXME: Add support for extracting links of sub-resources which
     84             // are inside style-sheet such as @import, @font-face, url(), etc.
     85             attributeName = &HTMLNames::hrefAttr;
     86         }
     87     } else if (element->hasTagName(HTMLNames::objectTag))
     88         attributeName = &HTMLNames::dataAttr;
     89     else if (element->hasTagName(HTMLNames::embedTag))
     90         attributeName = &HTMLNames::srcAttr;
     91 
     92     if (!attributeName)
     93         return KURL();
     94 
     95     String value = element->getAttribute(*attributeName);
     96     // Ignore javascript content.
     97     if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
     98         return KURL();
     99 
    100     return element->document()->completeURL(value);
    101 }
    102 
    103 void retrieveResourcesForElement(Element* element,
    104                                  Vector<Frame*>* visitedFrames,
    105                                  Vector<Frame*>* framesToVisit,
    106                                  Vector<KURL>* frameURLs,
    107                                  Vector<KURL>* resourceURLs)
    108 {
    109     // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
    110     if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
    111         || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
    112             && element->isFrameOwnerElement()) {
    113         Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
    114         if (frame) {
    115             if (!visitedFrames->contains(frame))
    116                 framesToVisit->append(frame);
    117             return;
    118         }
    119     }
    120 
    121     KURL url = getSubResourceURLFromElement(element);
    122     if (url.isEmpty() || !url.isValid())
    123         return; // No subresource for this node.
    124 
    125     // Ignore URLs that have a non-standard protocols. Since the FTP protocol
    126     // does no have a cache mechanism, we skip it as well.
    127     if (!url.protocolInHTTPFamily() && !url.isLocalFile())
    128         return;
    129 
    130     if (!resourceURLs->contains(url))
    131         resourceURLs->append(url);
    132 }
    133 
    134 void retrieveResourcesForFrame(Frame* frame,
    135                                const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
    136                                Vector<Frame*>* visitedFrames,
    137                                Vector<Frame*>* framesToVisit,
    138                                Vector<KURL>* frameURLs,
    139                                Vector<KURL>* resourceURLs)
    140 {
    141     KURL frameURL = frame->loader()->documentLoader()->request().url();
    142 
    143     // If the frame's URL is invalid, ignore it, it is not retrievable.
    144     if (!frameURL.isValid())
    145         return;
    146 
    147     // Ignore frames from unsupported schemes.
    148     bool isValidScheme = false;
    149     for (size_t i = 0; i < supportedSchemes.size(); ++i) {
    150         if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
    151             isValidScheme = true;
    152             break;
    153         }
    154     }
    155     if (!isValidScheme)
    156         return;
    157 
    158     // If we have already seen that frame, ignore it.
    159     if (visitedFrames->contains(frame))
    160         return;
    161     visitedFrames->append(frame);
    162     if (!frameURLs->contains(frameURL))
    163         frameURLs->append(frameURL);
    164 
    165     // Now get the resources associated with each node of the document.
    166     RefPtr<HTMLAllCollection> allNodes = frame->document()->all();
    167     for (unsigned i = 0; i < allNodes->length(); ++i) {
    168         Node* node = allNodes->item(i);
    169         // We are only interested in HTML resources.
    170         if (!node->isElementNode())
    171             continue;
    172         retrieveResourcesForElement(static_cast<Element*>(node),
    173                                     visitedFrames, framesToVisit,
    174                                     frameURLs, resourceURLs);
    175     }
    176 }
    177 
    178 } // namespace
    179 
    180 namespace WebKit {
    181 
    182 bool WebPageSerializer::serialize(WebFrame* frame,
    183                                   bool recursive,
    184                                   WebPageSerializerClient* client,
    185                                   const WebVector<WebURL>& links,
    186                                   const WebVector<WebString>& localPaths,
    187                                   const WebString& localDirectoryName)
    188 {
    189     WebPageSerializerImpl serializerImpl(
    190         frame, recursive, client, links, localPaths, localDirectoryName);
    191     return serializerImpl.serialize();
    192 }
    193 
    194 bool WebPageSerializer::retrieveAllResources(WebView* view,
    195                                              const WebVector<WebCString>& supportedSchemes,
    196                                              WebVector<WebURL>* resourceURLs,
    197                                              WebVector<WebURL>* frameURLs) {
    198     WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
    199     if (!mainFrame)
    200         return false;
    201 
    202     Vector<Frame*> framesToVisit;
    203     Vector<Frame*> visitedFrames;
    204     Vector<KURL> frameKURLs;
    205     Vector<KURL> resourceKURLs;
    206 
    207     // Let's retrieve the resources from every frame in this page.
    208     framesToVisit.append(mainFrame->frame());
    209     while (!framesToVisit.isEmpty()) {
    210         Frame* frame = framesToVisit[0];
    211         framesToVisit.remove(0);
    212         retrieveResourcesForFrame(frame, supportedSchemes,
    213                                   &visitedFrames, &framesToVisit,
    214                                   &frameKURLs, &resourceKURLs);
    215     }
    216 
    217     // Converts the results to WebURLs.
    218     WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
    219     for (size_t i = 0; i < resourceKURLs.size(); ++i) {
    220         resultResourceURLs[i] = resourceKURLs[i];
    221         // A frame's src can point to the same URL as another resource, keep the
    222         // resource URL only in such cases.
    223         size_t index = frameKURLs.find(resourceKURLs[i]);
    224         if (index != notFound)
    225             frameKURLs.remove(index);
    226     }
    227     *resourceURLs = resultResourceURLs;
    228     WebVector<WebURL> resultFrameURLs(frameKURLs.size());
    229     for (size_t i = 0; i < frameKURLs.size(); ++i)
    230         resultFrameURLs[i] = frameKURLs[i];
    231     *frameURLs = resultFrameURLs;
    232 
    233     return true;
    234 }
    235 
    236 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
    237 {
    238     return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=", static_cast<const String&>(charset), "\">");
    239 }
    240 
    241 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
    242 {
    243     return String::format("\n<!-- saved from url=(%04d)%s -->\n",
    244                           static_cast<int>(url.spec().length()),
    245                           url.spec().data());
    246 }
    247 
    248 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
    249 {
    250     if (baseTarget.isEmpty())
    251         return makeString("<base href=\".\">");
    252     return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">");
    253 }
    254 
    255 } // namespace WebKit
    256