Home | History | Annotate | Download | only in page
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "core/page/PageSerializer.h"
     33 
     34 #include "core/HTMLNames.h"
     35 #include "core/InputTypeNames.h"
     36 #include "core/css/CSSFontFaceRule.h"
     37 #include "core/css/CSSFontFaceSrcValue.h"
     38 #include "core/css/CSSImageValue.h"
     39 #include "core/css/CSSImportRule.h"
     40 #include "core/css/CSSStyleDeclaration.h"
     41 #include "core/css/CSSStyleRule.h"
     42 #include "core/css/CSSValueList.h"
     43 #include "core/css/StylePropertySet.h"
     44 #include "core/css/StyleRule.h"
     45 #include "core/css/StyleSheetContents.h"
     46 #include "core/dom/Document.h"
     47 #include "core/dom/Element.h"
     48 #include "core/dom/Text.h"
     49 #include "core/editing/MarkupAccumulator.h"
     50 #include "core/fetch/FontResource.h"
     51 #include "core/fetch/ImageResource.h"
     52 #include "core/frame/LocalFrame.h"
     53 #include "core/html/HTMLFrameOwnerElement.h"
     54 #include "core/html/HTMLImageElement.h"
     55 #include "core/html/HTMLInputElement.h"
     56 #include "core/html/HTMLLinkElement.h"
     57 #include "core/html/HTMLMetaElement.h"
     58 #include "core/html/HTMLStyleElement.h"
     59 #include "core/html/parser/HTMLParserIdioms.h"
     60 #include "core/page/Page.h"
     61 #include "core/rendering/RenderImage.h"
     62 #include "core/rendering/style/StyleFetchedImage.h"
     63 #include "core/rendering/style/StyleImage.h"
     64 #include "platform/SerializedResource.h"
     65 #include "platform/graphics/Image.h"
     66 #include "wtf/text/CString.h"
     67 #include "wtf/text/StringBuilder.h"
     68 #include "wtf/text/TextEncoding.h"
     69 #include "wtf/text/WTFString.h"
     70 
     71 namespace blink {
     72 
     73 static bool isCharsetSpecifyingNode(const Node& node)
     74 {
     75     if (!isHTMLMetaElement(node))
     76         return false;
     77 
     78     const HTMLMetaElement& element = toHTMLMetaElement(node);
     79     HTMLAttributeList attributeList;
     80     AttributeCollection attributes = element.attributes();
     81     AttributeCollection::iterator end = attributes.end();
     82     for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) {
     83         // FIXME: We should deal appropriately with the attribute if they have a namespace.
     84         attributeList.append(std::make_pair(it->name().localName(), it->value().string()));
     85     }
     86     WTF::TextEncoding textEncoding = encodingFromMetaAttributes(attributeList);
     87     return textEncoding.isValid();
     88 }
     89 
     90 static bool shouldIgnoreElement(const Element& element)
     91 {
     92     return isHTMLScriptElement(element) || isHTMLNoScriptElement(element) || isCharsetSpecifyingNode(element);
     93 }
     94 
     95 static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner)
     96 {
     97     // FIXME: We should support all frame owners including applets.
     98     return isHTMLObjectElement(frameOwner) ? HTMLNames::dataAttr : HTMLNames::srcAttr;
     99 }
    100 
// MarkupAccumulator subclass that PageSerializer::serializeFrame() uses to
// turn a document into markup. The overrides defined in this file skip the
// elements rejected by shouldIgnoreElement(), write a <meta charset> right
// after the <head> start tag, and add a generated URL attribute to frame
// owners whose local content frame has an invalid or about: URL.
class SerializerMarkupAccumulator FINAL : public MarkupAccumulator {
public:
    // The Node vector pointer is forwarded to the MarkupAccumulator base;
    // serializeFrame() reads the serialized nodes back from it afterwards.
    SerializerMarkupAccumulator(PageSerializer*, const Document&, WillBeHeapVector<RawPtrWillBeMember<Node> >*);
    virtual ~SerializerMarkupAccumulator();

protected:
    // Drops text whose parent element is missing or ignored.
    virtual void appendText(StringBuilder& out, Text&) OVERRIDE;
    // Drops ignored elements and injects the <meta charset> after <head>.
    virtual void appendElement(StringBuilder& out, Element&, Namespaces*) OVERRIDE;
    // Adds the generated URL attribute for blank local sub-frames.
    virtual void appendCustomAttributes(StringBuilder& out, const Element&, Namespaces*) OVERRIDE;
    // Drops the end tag of ignored elements.
    virtual void appendEndTag(const Element&) OVERRIDE;

private:
    // Queried for the generated URLs of blank frames (urlForBlankFrame()).
    PageSerializer* m_serializer;
    // Document being serialized; its charset() is written into the injected <meta>.
    const Document& m_document;
};
    116 
// Builds an accumulator for |document|. |nodes| is handed to the
// MarkupAccumulator base together with ResolveAllURLs and a null third
// argument; |serializer| and |document| are stored for use by the overrides.
SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node> >* nodes)
    : MarkupAccumulator(nodes, ResolveAllURLs, nullptr)
    , m_serializer(serializer)
    , m_document(document)
{
}
    123 
// Nothing to release: the destructor body is intentionally empty.
SerializerMarkupAccumulator::~SerializerMarkupAccumulator()
{
}
    127 
    128 void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text& text)
    129 {
    130     Element* parent = text.parentElement();
    131     if (parent && !shouldIgnoreElement(*parent))
    132         MarkupAccumulator::appendText(out, text);
    133 }
    134 
    135 void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element& element, Namespaces* namespaces)
    136 {
    137     if (!shouldIgnoreElement(element))
    138         MarkupAccumulator::appendElement(out, element, namespaces);
    139 
    140     if (isHTMLHeadElement(element)) {
    141         out.appendLiteral("<meta charset=\"");
    142         out.append(m_document.charset());
    143         out.appendLiteral("\">");
    144     }
    145 
    146     // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents.
    147 }
    148 
    149 void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, const Element& element, Namespaces* namespaces)
    150 {
    151     if (!element.isFrameOwnerElement())
    152         return;
    153 
    154     const HTMLFrameOwnerElement& frameOwner = toHTMLFrameOwnerElement(element);
    155     Frame* frame = frameOwner.contentFrame();
    156     // FIXME: RemoteFrames not currently supported here.
    157     if (!frame || !frame->isLocalFrame())
    158         return;
    159 
    160     KURL url = toLocalFrame(frame)->document()->url();
    161     if (url.isValid() && !url.protocolIsAbout())
    162         return;
    163 
    164     // We need to give a fake location to blank frames so they can be referenced by the serialized frame.
    165     url = m_serializer->urlForBlankFrame(toLocalFrame(frame));
    166     appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(frameOwner), AtomicString(url.string())), namespaces);
    167 }
    168 
    169 void SerializerMarkupAccumulator::appendEndTag(const Element& element)
    170 {
    171     if (!shouldIgnoreElement(element))
    172         MarkupAccumulator::appendEndTag(element);
    173 }
    174 
// Creates a serializer that appends every serialized resource to |resources|.
// The counter used to number generated blank-frame URLs starts at zero.
PageSerializer::PageSerializer(Vector<SerializedResource>* resources)
    : m_resources(resources)
    , m_blankFrameCounter(0)
{
}
    180 
    181 void PageSerializer::serialize(Page* page)
    182 {
    183     serializeFrame(page->deprecatedLocalMainFrame());
    184 }
    185 
    186 void PageSerializer::serializeFrame(LocalFrame* frame)
    187 {
    188     ASSERT(frame->document());
    189     Document& document = *frame->document();
    190     KURL url = document.url();
    191     // FIXME: This probably wants isAboutBlankURL? to exclude other about: urls (like about:srcdoc)?
    192     if (!url.isValid() || url.protocolIsAbout()) {
    193         // For blank frames we generate a fake URL so they can be referenced by their containing frame.
    194         url = urlForBlankFrame(frame);
    195     }
    196 
    197     if (m_resourceURLs.contains(url)) {
    198         // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now
    199         // different content. So we should serialize both and somehow rename the frame src in the containing
    200         // frame. Arg!
    201         return;
    202     }
    203 
    204     WTF::TextEncoding textEncoding(document.charset());
    205     if (!textEncoding.isValid()) {
    206         // FIXME: iframes used as images trigger this. We should deal with them correctly.
    207         return;
    208     }
    209 
    210     WillBeHeapVector<RawPtrWillBeMember<Node> > serializedNodes;
    211     SerializerMarkupAccumulator accumulator(this, document, &serializedNodes);
    212     String text = accumulator.serializeNodes(document, IncludeNode);
    213     CString frameHTML = textEncoding.normalizeAndEncode(text, WTF::EntitiesForUnencodables);
    214     m_resources->append(SerializedResource(url, document.suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length())));
    215     m_resourceURLs.add(url);
    216 
    217     for (WillBeHeapVector<RawPtrWillBeMember<Node> >::iterator iter = serializedNodes.begin(); iter != serializedNodes.end(); ++iter) {
    218         ASSERT(*iter);
    219         Node& node = **iter;
    220         if (!node.isElementNode())
    221             continue;
    222 
    223         Element& element = toElement(node);
    224         // We have to process in-line style as it might contain some resources (typically background images).
    225         if (element.isStyledElement())
    226             retrieveResourcesForProperties(element.inlineStyle(), document);
    227 
    228         if (isHTMLImageElement(element)) {
    229             HTMLImageElement& imageElement = toHTMLImageElement(element);
    230             KURL url = document.completeURL(imageElement.getAttribute(HTMLNames::srcAttr));
    231             ImageResource* cachedImage = imageElement.cachedImage();
    232             addImageToResources(cachedImage, imageElement.renderer(), url);
    233         } else if (isHTMLInputElement(element)) {
    234             HTMLInputElement& inputElement = toHTMLInputElement(element);
    235             if (inputElement.type() == InputTypeNames::image && inputElement.hasImageLoader()) {
    236                 KURL url = inputElement.src();
    237                 ImageResource* cachedImage = inputElement.imageLoader()->image();
    238                 addImageToResources(cachedImage, inputElement.renderer(), url);
    239             }
    240         } else if (isHTMLLinkElement(element)) {
    241             HTMLLinkElement& linkElement = toHTMLLinkElement(element);
    242             if (CSSStyleSheet* sheet = linkElement.sheet()) {
    243                 KURL url = document.completeURL(linkElement.getAttribute(HTMLNames::hrefAttr));
    244                 serializeCSSStyleSheet(*sheet, url);
    245                 ASSERT(m_resourceURLs.contains(url));
    246             }
    247         } else if (isHTMLStyleElement(element)) {
    248             HTMLStyleElement& styleElement = toHTMLStyleElement(element);
    249             if (CSSStyleSheet* sheet = styleElement.sheet())
    250                 serializeCSSStyleSheet(*sheet, KURL());
    251         }
    252     }
    253 
    254     for (Frame* childFrame = frame->tree().firstChild(); childFrame; childFrame = childFrame->tree().nextSibling()) {
    255         if (childFrame->isLocalFrame())
    256             serializeFrame(toLocalFrame(childFrame));
    257     }
    258 }
    259 
    260 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, const KURL& url)
    261 {
    262     StringBuilder cssText;
    263     for (unsigned i = 0; i < styleSheet.length(); ++i) {
    264         CSSRule* rule = styleSheet.item(i);
    265         String itemText = rule->cssText();
    266         if (!itemText.isEmpty()) {
    267             cssText.append(itemText);
    268             if (i < styleSheet.length() - 1)
    269                 cssText.appendLiteral("\n\n");
    270         }
    271         ASSERT(styleSheet.ownerDocument());
    272         Document& document = *styleSheet.ownerDocument();
    273         // Some rules have resources associated with them that we need to retrieve.
    274         if (rule->type() == CSSRule::IMPORT_RULE) {
    275             CSSImportRule* importRule = toCSSImportRule(rule);
    276             KURL importURL = document.completeURL(importRule->href());
    277             if (m_resourceURLs.contains(importURL))
    278                 continue;
    279             if (importRule->styleSheet())
    280                 serializeCSSStyleSheet(*importRule->styleSheet(), importURL);
    281         } else if (rule->type() == CSSRule::FONT_FACE_RULE) {
    282             retrieveResourcesForProperties(&toCSSFontFaceRule(rule)->styleRule()->properties(), document);
    283         } else if (rule->type() == CSSRule::STYLE_RULE) {
    284             retrieveResourcesForProperties(&toCSSStyleRule(rule)->styleRule()->properties(), document);
    285         }
    286     }
    287 
    288     if (url.isValid() && !m_resourceURLs.contains(url)) {
    289         // FIXME: We should check whether a charset has been specified and if none was found add one.
    290         WTF::TextEncoding textEncoding(styleSheet.contents()->charset());
    291         ASSERT(textEncoding.isValid());
    292         String textString = cssText.toString();
    293         CString text = textEncoding.normalizeAndEncode(textString, WTF::EntitiesForUnencodables);
    294         m_resources->append(SerializedResource(url, String("text/css"), SharedBuffer::create(text.data(), text.length())));
    295         m_resourceURLs.add(url);
    296     }
    297 }
    298 
    299 bool PageSerializer::shouldAddURL(const KURL& url)
    300 {
    301     return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData();
    302 }
    303 
    304 void PageSerializer::addToResources(Resource* resource, PassRefPtr<SharedBuffer> data, const KURL& url)
    305 {
    306     if (!data) {
    307         WTF_LOG_ERROR("No data for resource %s", url.string().utf8().data());
    308         return;
    309     }
    310 
    311     String mimeType = resource->response().mimeType();
    312     m_resources->append(SerializedResource(url, mimeType, data));
    313     m_resourceURLs.add(url);
    314 }
    315 
    316 void PageSerializer::addImageToResources(ImageResource* image, RenderObject* imageRenderer, const KURL& url)
    317 {
    318     if (!shouldAddURL(url))
    319         return;
    320 
    321     if (!image || image->image() == Image::nullImage() || image->errorOccurred())
    322         return;
    323 
    324     RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0;
    325     if (!data)
    326         data = image->image()->data();
    327 
    328     addToResources(image, data, url);
    329 }
    330 
    331 void PageSerializer::addFontToResources(FontResource* font)
    332 {
    333     if (!font || !shouldAddURL(font->url()) || !font->isLoaded() || !font->resourceBuffer()) {
    334         return;
    335     }
    336     RefPtr<SharedBuffer> data(font->resourceBuffer());
    337 
    338     addToResources(font, data, font->url());
    339 }
    340 
    341 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document& document)
    342 {
    343     if (!styleDeclaration)
    344         return;
    345 
    346     // The background-image and list-style-image (for ul or ol) are the CSS properties
    347     // that make use of images. We iterate to make sure we include any other
    348     // image properties there might be.
    349     unsigned propertyCount = styleDeclaration->propertyCount();
    350     for (unsigned i = 0; i < propertyCount; ++i) {
    351         RefPtrWillBeRawPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value();
    352         retrieveResourcesForCSSValue(cssValue.get(), document);
    353     }
    354 }
    355 
    356 void PageSerializer::retrieveResourcesForCSSValue(CSSValue* cssValue, Document& document)
    357 {
    358     if (cssValue->isImageValue()) {
    359         CSSImageValue* imageValue = toCSSImageValue(cssValue);
    360         StyleImage* styleImage = imageValue->cachedOrPendingImage();
    361         // Non cached-images are just place-holders and do not contain data.
    362         if (!styleImage || !styleImage->isImageResource())
    363             return;
    364 
    365         addImageToResources(styleImage->cachedImage(), 0, styleImage->cachedImage()->url());
    366     } else if (cssValue->isFontFaceSrcValue()) {
    367         CSSFontFaceSrcValue* fontFaceSrcValue = toCSSFontFaceSrcValue(cssValue);
    368         if (fontFaceSrcValue->isLocal()) {
    369             return;
    370         }
    371 
    372         addFontToResources(fontFaceSrcValue->fetch(&document));
    373     } else if (cssValue->isValueList()) {
    374         CSSValueList* cssValueList = toCSSValueList(cssValue);
    375         for (unsigned i = 0; i < cssValueList->length(); i++)
    376             retrieveResourcesForCSSValue(cssValueList->item(i), document);
    377     }
    378 }
    379 
    380 KURL PageSerializer::urlForBlankFrame(LocalFrame* frame)
    381 {
    382     HashMap<LocalFrame*, KURL>::iterator iter = m_blankFrameURLs.find(frame);
    383     if (iter != m_blankFrameURLs.end())
    384         return iter->value;
    385     String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++);
    386     KURL fakeURL(ParsedURLString, url);
    387     m_blankFrameURLs.add(frame, fakeURL);
    388 
    389     return fakeURL;
    390 }
    391 
    392 }
    393