Home | History | Annotate | Download | only in page
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "core/page/PageSerializer.h"
     33 
     34 #include "core/HTMLNames.h"
     35 #include "core/css/CSSFontFaceRule.h"
     36 #include "core/css/CSSFontFaceSrcValue.h"
     37 #include "core/css/CSSImageValue.h"
     38 #include "core/css/CSSImportRule.h"
     39 #include "core/css/CSSStyleDeclaration.h"
     40 #include "core/css/CSSStyleRule.h"
     41 #include "core/css/CSSValueList.h"
     42 #include "core/css/StylePropertySet.h"
     43 #include "core/css/StyleRule.h"
     44 #include "core/css/StyleSheetContents.h"
     45 #include "core/dom/Document.h"
     46 #include "core/dom/Element.h"
     47 #include "core/dom/Text.h"
     48 #include "core/editing/MarkupAccumulator.h"
     49 #include "core/fetch/FontResource.h"
     50 #include "core/fetch/ImageResource.h"
     51 #include "core/frame/LocalFrame.h"
     52 #include "core/html/HTMLFrameOwnerElement.h"
     53 #include "core/html/HTMLImageElement.h"
     54 #include "core/html/HTMLInputElement.h"
     55 #include "core/html/HTMLLinkElement.h"
     56 #include "core/html/HTMLMetaElement.h"
     57 #include "core/html/HTMLStyleElement.h"
     58 #include "core/html/parser/HTMLParserIdioms.h"
     59 #include "core/page/Page.h"
     60 #include "core/rendering/RenderImage.h"
     61 #include "core/rendering/style/StyleFetchedImage.h"
     62 #include "core/rendering/style/StyleImage.h"
     63 #include "platform/SerializedResource.h"
     64 #include "platform/graphics/Image.h"
     65 #include "wtf/text/CString.h"
     66 #include "wtf/text/StringBuilder.h"
     67 #include "wtf/text/TextEncoding.h"
     68 #include "wtf/text/WTFString.h"
     69 
     70 namespace WebCore {
     71 
     72 static bool isCharsetSpecifyingNode(const Node& node)
     73 {
     74     if (!isHTMLMetaElement(node))
     75         return false;
     76 
     77     const HTMLMetaElement& element = toHTMLMetaElement(node);
     78     HTMLAttributeList attributeList;
     79     if (element.hasAttributes()) {
     80         AttributeCollection attributes = element.attributes();
     81         AttributeCollection::const_iterator end = attributes.end();
     82         for (AttributeCollection::const_iterator it = attributes.begin(); it != end; ++it) {
     83             // FIXME: We should deal appropriately with the attribute if they have a namespace.
     84             attributeList.append(std::make_pair(it->name().localName(), it->value().string()));
     85         }
     86     }
     87     WTF::TextEncoding textEncoding = encodingFromMetaAttributes(attributeList);
     88     return textEncoding.isValid();
     89 }
     90 
     91 static bool shouldIgnoreElement(const Element& element)
     92 {
     93     return isHTMLScriptElement(element) || isHTMLNoScriptElement(element) || isCharsetSpecifyingNode(element);
     94 }
     95 
     96 static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner)
     97 {
     98     // FIXME: We should support all frame owners including applets.
     99     return isHTMLObjectElement(frameOwner) ? HTMLNames::dataAttr : HTMLNames::srcAttr;
    100 }
    101 
    102 class SerializerMarkupAccumulator FINAL : public MarkupAccumulator {
    103 public:
    104     SerializerMarkupAccumulator(PageSerializer*, const Document&, WillBeHeapVector<RawPtrWillBeMember<Node> >*);
    105     virtual ~SerializerMarkupAccumulator();
    106 
    107 protected:
    108     virtual void appendText(StringBuilder& out, Text&) OVERRIDE;
    109     virtual void appendElement(StringBuilder& out, Element&, Namespaces*) OVERRIDE;
    110     virtual void appendCustomAttributes(StringBuilder& out, const Element&, Namespaces*) OVERRIDE;
    111     virtual void appendEndTag(const Node&) OVERRIDE;
    112 
    113 private:
    114     PageSerializer* m_serializer;
    115     const Document& m_document;
    116 };
    117 
    118 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node> >* nodes)
    119     : MarkupAccumulator(nodes, ResolveAllURLs, nullptr)
    120     , m_serializer(serializer)
    121     , m_document(document)
    122 {
    123 }
    124 
    125 SerializerMarkupAccumulator::~SerializerMarkupAccumulator()
    126 {
    127 }
    128 
    129 void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text& text)
    130 {
    131     Element* parent = text.parentElement();
    132     if (parent && !shouldIgnoreElement(*parent))
    133         MarkupAccumulator::appendText(out, text);
    134 }
    135 
    136 void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element& element, Namespaces* namespaces)
    137 {
    138     if (!shouldIgnoreElement(element))
    139         MarkupAccumulator::appendElement(out, element, namespaces);
    140 
    141     if (isHTMLHeadElement(element)) {
    142         out.append("<meta charset=\"");
    143         out.append(m_document.charset());
    144         out.append("\">");
    145     }
    146 
    147     // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents.
    148 }
    149 
    150 void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, const Element& element, Namespaces* namespaces)
    151 {
    152     if (!element.isFrameOwnerElement())
    153         return;
    154 
    155     const HTMLFrameOwnerElement& frameOwner = toHTMLFrameOwnerElement(element);
    156     Frame* frame = frameOwner.contentFrame();
    157     // FIXME: RemoteFrames not currently supported here.
    158     if (!frame || !frame->isLocalFrame())
    159         return;
    160 
    161     KURL url = toLocalFrame(frame)->document()->url();
    162     if (url.isValid() && !url.protocolIsAbout())
    163         return;
    164 
    165     // We need to give a fake location to blank frames so they can be referenced by the serialized frame.
    166     url = m_serializer->urlForBlankFrame(toLocalFrame(frame));
    167     appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(frameOwner), AtomicString(url.string())), namespaces);
    168 }
    169 
    170 void SerializerMarkupAccumulator::appendEndTag(const Node& node)
    171 {
    172     if (node.isElementNode() && !shouldIgnoreElement(toElement(node)))
    173         MarkupAccumulator::appendEndTag(node);
    174 }
    175 
    176 PageSerializer::PageSerializer(Vector<SerializedResource>* resources)
    177     : m_resources(resources)
    178     , m_blankFrameCounter(0)
    179 {
    180 }
    181 
    182 void PageSerializer::serialize(Page* page)
    183 {
    184     serializeFrame(page->deprecatedLocalMainFrame());
    185 }
    186 
    187 void PageSerializer::serializeFrame(LocalFrame* frame)
    188 {
    189     ASSERT(frame->document());
    190     Document& document = *frame->document();
    191     KURL url = document.url();
    192     // FIXME: This probably wants isAboutBlankURL? to exclude other about: urls (like about:srcdoc)?
    193     if (!url.isValid() || url.protocolIsAbout()) {
    194         // For blank frames we generate a fake URL so they can be referenced by their containing frame.
    195         url = urlForBlankFrame(frame);
    196     }
    197 
    198     if (m_resourceURLs.contains(url)) {
    199         // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now
    200         // different content. So we should serialize both and somehow rename the frame src in the containing
    201         // frame. Arg!
    202         return;
    203     }
    204 
    205     WTF::TextEncoding textEncoding(document.charset());
    206     if (!textEncoding.isValid()) {
    207         // FIXME: iframes used as images trigger this. We should deal with them correctly.
    208         return;
    209     }
    210 
    211     WillBeHeapVector<RawPtrWillBeMember<Node> > serializedNodes;
    212     SerializerMarkupAccumulator accumulator(this, document, &serializedNodes);
    213     String text = accumulator.serializeNodes(document, IncludeNode);
    214     CString frameHTML = textEncoding.normalizeAndEncode(text, WTF::EntitiesForUnencodables);
    215     m_resources->append(SerializedResource(url, document.suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length())));
    216     m_resourceURLs.add(url);
    217 
    218     for (WillBeHeapVector<RawPtrWillBeMember<Node> >::iterator iter = serializedNodes.begin(); iter != serializedNodes.end(); ++iter) {
    219         ASSERT(*iter);
    220         Node& node = **iter;
    221         if (!node.isElementNode())
    222             continue;
    223 
    224         Element& element = toElement(node);
    225         // We have to process in-line style as it might contain some resources (typically background images).
    226         if (element.isStyledElement())
    227             retrieveResourcesForProperties(element.inlineStyle(), document);
    228 
    229         if (isHTMLImageElement(element)) {
    230             HTMLImageElement& imageElement = toHTMLImageElement(element);
    231             KURL url = document.completeURL(imageElement.getAttribute(HTMLNames::srcAttr));
    232             ImageResource* cachedImage = imageElement.cachedImage();
    233             addImageToResources(cachedImage, imageElement.renderer(), url);
    234         } else if (isHTMLInputElement(element)) {
    235             HTMLInputElement& inputElement = toHTMLInputElement(element);
    236             if (inputElement.isImageButton() && inputElement.hasImageLoader()) {
    237                 KURL url = inputElement.src();
    238                 ImageResource* cachedImage = inputElement.imageLoader()->image();
    239                 addImageToResources(cachedImage, inputElement.renderer(), url);
    240             }
    241         } else if (isHTMLLinkElement(element)) {
    242             HTMLLinkElement& linkElement = toHTMLLinkElement(element);
    243             if (CSSStyleSheet* sheet = linkElement.sheet()) {
    244                 KURL url = document.completeURL(linkElement.getAttribute(HTMLNames::hrefAttr));
    245                 serializeCSSStyleSheet(*sheet, url);
    246                 ASSERT(m_resourceURLs.contains(url));
    247             }
    248         } else if (isHTMLStyleElement(element)) {
    249             HTMLStyleElement& styleElement = toHTMLStyleElement(element);
    250             if (CSSStyleSheet* sheet = styleElement.sheet())
    251                 serializeCSSStyleSheet(*sheet, KURL());
    252         }
    253     }
    254 
    255     for (Frame* childFrame = frame->tree().firstChild(); childFrame; childFrame = childFrame->tree().nextSibling()) {
    256         if (childFrame->isLocalFrame())
    257             serializeFrame(toLocalFrame(childFrame));
    258     }
    259 }
    260 
    261 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, const KURL& url)
    262 {
    263     StringBuilder cssText;
    264     for (unsigned i = 0; i < styleSheet.length(); ++i) {
    265         CSSRule* rule = styleSheet.item(i);
    266         String itemText = rule->cssText();
    267         if (!itemText.isEmpty()) {
    268             cssText.append(itemText);
    269             if (i < styleSheet.length() - 1)
    270                 cssText.append("\n\n");
    271         }
    272         ASSERT(styleSheet.ownerDocument());
    273         Document& document = *styleSheet.ownerDocument();
    274         // Some rules have resources associated with them that we need to retrieve.
    275         if (rule->type() == CSSRule::IMPORT_RULE) {
    276             CSSImportRule* importRule = toCSSImportRule(rule);
    277             KURL importURL = document.completeURL(importRule->href());
    278             if (m_resourceURLs.contains(importURL))
    279                 continue;
    280             if (importRule->styleSheet())
    281                 serializeCSSStyleSheet(*importRule->styleSheet(), importURL);
    282         } else if (rule->type() == CSSRule::FONT_FACE_RULE) {
    283             retrieveResourcesForProperties(&toCSSFontFaceRule(rule)->styleRule()->properties(), document);
    284         } else if (rule->type() == CSSRule::STYLE_RULE) {
    285             retrieveResourcesForProperties(&toCSSStyleRule(rule)->styleRule()->properties(), document);
    286         }
    287     }
    288 
    289     if (url.isValid() && !m_resourceURLs.contains(url)) {
    290         // FIXME: We should check whether a charset has been specified and if none was found add one.
    291         WTF::TextEncoding textEncoding(styleSheet.contents()->charset());
    292         ASSERT(textEncoding.isValid());
    293         String textString = cssText.toString();
    294         CString text = textEncoding.normalizeAndEncode(textString, WTF::EntitiesForUnencodables);
    295         m_resources->append(SerializedResource(url, String("text/css"), SharedBuffer::create(text.data(), text.length())));
    296         m_resourceURLs.add(url);
    297     }
    298 }
    299 
    300 bool PageSerializer::shouldAddURL(const KURL& url)
    301 {
    302     return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData();
    303 }
    304 
    305 void PageSerializer::addToResources(Resource* resource, PassRefPtr<SharedBuffer> data, const KURL& url)
    306 {
    307     if (!data) {
    308         WTF_LOG_ERROR("No data for resource %s", url.string().utf8().data());
    309         return;
    310     }
    311 
    312     String mimeType = resource->response().mimeType();
    313     m_resources->append(SerializedResource(url, mimeType, data));
    314     m_resourceURLs.add(url);
    315 }
    316 
    317 void PageSerializer::addImageToResources(ImageResource* image, RenderObject* imageRenderer, const KURL& url)
    318 {
    319     if (!shouldAddURL(url))
    320         return;
    321 
    322     if (!image || image->image() == Image::nullImage() || image->errorOccurred())
    323         return;
    324 
    325     RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0;
    326     if (!data)
    327         data = image->image()->data();
    328 
    329     addToResources(image, data, url);
    330 }
    331 
    332 void PageSerializer::addFontToResources(FontResource* font)
    333 {
    334     if (!font || !shouldAddURL(font->url()) || !font->isLoaded() || !font->resourceBuffer()) {
    335         return;
    336     }
    337     RefPtr<SharedBuffer> data(font->resourceBuffer());
    338 
    339     addToResources(font, data, font->url());
    340 }
    341 
    342 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document& document)
    343 {
    344     if (!styleDeclaration)
    345         return;
    346 
    347     // The background-image and list-style-image (for ul or ol) are the CSS properties
    348     // that make use of images. We iterate to make sure we include any other
    349     // image properties there might be.
    350     unsigned propertyCount = styleDeclaration->propertyCount();
    351     for (unsigned i = 0; i < propertyCount; ++i) {
    352         RefPtrWillBeRawPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value();
    353         retrieveResourcesForCSSValue(cssValue.get(), document);
    354     }
    355 }
    356 
    357 void PageSerializer::retrieveResourcesForCSSValue(CSSValue* cssValue, Document& document)
    358 {
    359     if (cssValue->isImageValue()) {
    360         CSSImageValue* imageValue = toCSSImageValue(cssValue);
    361         StyleImage* styleImage = imageValue->cachedOrPendingImage();
    362         // Non cached-images are just place-holders and do not contain data.
    363         if (!styleImage || !styleImage->isImageResource())
    364             return;
    365 
    366         addImageToResources(styleImage->cachedImage(), 0, styleImage->cachedImage()->url());
    367     } else if (cssValue->isFontFaceSrcValue()) {
    368         CSSFontFaceSrcValue* fontFaceSrcValue = toCSSFontFaceSrcValue(cssValue);
    369         if (fontFaceSrcValue->isLocal()) {
    370             return;
    371         }
    372 
    373         addFontToResources(fontFaceSrcValue->fetch(&document));
    374     } else if (cssValue->isValueList()) {
    375         CSSValueList* cssValueList = toCSSValueList(cssValue);
    376         for (unsigned i = 0; i < cssValueList->length(); i++)
    377             retrieveResourcesForCSSValue(cssValueList->item(i), document);
    378     }
    379 }
    380 
    381 KURL PageSerializer::urlForBlankFrame(LocalFrame* frame)
    382 {
    383     HashMap<LocalFrame*, KURL>::iterator iter = m_blankFrameURLs.find(frame);
    384     if (iter != m_blankFrameURLs.end())
    385         return iter->value;
    386     String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++);
    387     KURL fakeURL(ParsedURLString, url);
    388     m_blankFrameURLs.add(frame, fakeURL);
    389 
    390     return fakeURL;
    391 }
    392 
    393 }
    394