Home | History | Annotate | Download | only in page
      1 /*
      2  * Copyright (C) 2011 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "core/page/PageSerializer.h"
     33 
     34 #include "HTMLNames.h"
     35 #include "core/css/CSSImageValue.h"
     36 #include "core/css/CSSImportRule.h"
     37 #include "core/css/CSSStyleRule.h"
     38 #include "core/css/StylePropertySet.h"
     39 #include "core/css/StyleRule.h"
     40 #include "core/css/StyleSheetContents.h"
     41 #include "core/dom/Document.h"
     42 #include "core/dom/Element.h"
     43 #include "core/dom/Text.h"
     44 #include "core/editing/MarkupAccumulator.h"
     45 #include "core/html/HTMLFrameOwnerElement.h"
     46 #include "core/html/HTMLImageElement.h"
     47 #include "core/html/HTMLInputElement.h"
     48 #include "core/html/HTMLLinkElement.h"
     49 #include "core/html/HTMLStyleElement.h"
     50 #include "core/html/parser/HTMLMetaCharsetParser.h"
     51 #include "core/loader/cache/ImageResource.h"
     52 #include "core/page/Frame.h"
     53 #include "core/page/Page.h"
     54 #include "core/platform/SerializedResource.h"
     55 #include "core/platform/graphics/Image.h"
     56 #include "core/rendering/RenderImage.h"
     57 #include "core/rendering/style/StyleFetchedImage.h"
     58 #include "core/rendering/style/StyleImage.h"
     59 #include "wtf/text/CString.h"
     60 #include "wtf/text/StringBuilder.h"
     61 #include "wtf/text/TextEncoding.h"
     62 #include "wtf/text/WTFString.h"
     63 
     64 namespace WebCore {
     65 
     66 static bool isCharsetSpecifyingNode(Node* node)
     67 {
     68     if (!node->isHTMLElement())
     69         return false;
     70 
     71     HTMLElement* element = toHTMLElement(node);
     72     if (!element->hasTagName(HTMLNames::metaTag))
     73         return false;
     74     HTMLMetaCharsetParser::AttributeList attributes;
     75     if (element->hasAttributes()) {
     76         for (unsigned i = 0; i < element->attributeCount(); ++i) {
     77             const Attribute* attribute = element->attributeItem(i);
     78             // FIXME: We should deal appropriately with the attribute if they have a namespace.
     79             attributes.append(std::make_pair(attribute->name().toString(), attribute->value().string()));
     80         }
     81     }
     82     WTF::TextEncoding textEncoding = HTMLMetaCharsetParser::encodingFromMetaAttributes(attributes);
     83     return textEncoding.isValid();
     84 }
     85 
     86 static bool shouldIgnoreElement(Element* element)
     87 {
     88     return element->hasTagName(HTMLNames::scriptTag) || element->hasTagName(HTMLNames::noscriptTag) || isCharsetSpecifyingNode(element);
     89 }
     90 
     91 static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner)
     92 {
     93     // FIXME: We should support all frame owners including applets.
     94     return frameOwner.hasTagName(HTMLNames::objectTag) ? HTMLNames::dataAttr : HTMLNames::srcAttr;
     95 }
     96 
     97 class SerializerMarkupAccumulator : public WebCore::MarkupAccumulator {
     98 public:
     99     SerializerMarkupAccumulator(PageSerializer*, Document*, Vector<Node*>*);
    100     virtual ~SerializerMarkupAccumulator();
    101 
    102 protected:
    103     virtual void appendText(StringBuilder& out, Text*);
    104     virtual void appendElement(StringBuilder& out, Element*, Namespaces*);
    105     virtual void appendCustomAttributes(StringBuilder& out, Element*, Namespaces*);
    106     virtual void appendEndTag(Node*);
    107 
    108 private:
    109     PageSerializer* m_serializer;
    110     Document* m_document;
    111 };
    112 
    113 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, Document* document, Vector<Node*>* nodes)
    114     : MarkupAccumulator(nodes, ResolveAllURLs)
    115     , m_serializer(serializer)
    116     , m_document(document)
    117 {
    118     // MarkupAccumulator does not serialize the <?xml ... line, so we add it explicitely to ensure the right encoding is specified.
    119     if (m_document->isXHTMLDocument() || m_document->xmlStandalone() || m_document->isSVGDocument())
    120         appendString("<?xml version=\"" + m_document->xmlVersion() + "\" encoding=\"" + m_document->charset() + "\"?>");
    121 }
    122 
    123 SerializerMarkupAccumulator::~SerializerMarkupAccumulator()
    124 {
    125 }
    126 
    127 void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text* text)
    128 {
    129     Element* parent = text->parentElement();
    130     if (parent && !shouldIgnoreElement(parent))
    131         MarkupAccumulator::appendText(out, text);
    132 }
    133 
    134 void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element* element, Namespaces* namespaces)
    135 {
    136     if (!shouldIgnoreElement(element))
    137         MarkupAccumulator::appendElement(out, element, namespaces);
    138 
    139     if (element->hasTagName(HTMLNames::headTag)) {
    140         out.append("<meta charset=\"");
    141         out.append(m_document->charset());
    142         out.append("\">");
    143     }
    144 
    145     // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents.
    146 }
    147 
    148 void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, Element* element, Namespaces* namespaces)
    149 {
    150     if (!element->isFrameOwnerElement())
    151         return;
    152 
    153     HTMLFrameOwnerElement* frameOwner = toFrameOwnerElement(element);
    154     Frame* frame = frameOwner->contentFrame();
    155     if (!frame)
    156         return;
    157 
    158     KURL url = frame->document()->url();
    159     if (url.isValid() && !url.isBlankURL())
    160         return;
    161 
    162     // We need to give a fake location to blank frames so they can be referenced by the serialized frame.
    163     url = m_serializer->urlForBlankFrame(frame);
    164     appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(*frameOwner), url.string()), namespaces);
    165 }
    166 
    167 void SerializerMarkupAccumulator::appendEndTag(Node* node)
    168 {
    169     if (node->isElementNode() && !shouldIgnoreElement(toElement(node)))
    170         MarkupAccumulator::appendEndTag(node);
    171 }
    172 
    173 PageSerializer::PageSerializer(Vector<SerializedResource>* resources)
    174     : m_resources(resources)
    175     , m_blankFrameCounter(0)
    176 {
    177 }
    178 
    179 void PageSerializer::serialize(Page* page)
    180 {
    181     serializeFrame(page->mainFrame());
    182 }
    183 
    184 void PageSerializer::serializeFrame(Frame* frame)
    185 {
    186     Document* document = frame->document();
    187     KURL url = document->url();
    188     if (!url.isValid() || url.isBlankURL()) {
    189         // For blank frames we generate a fake URL so they can be referenced by their containing frame.
    190         url = urlForBlankFrame(frame);
    191     }
    192 
    193     if (m_resourceURLs.contains(url)) {
    194         // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now
    195         // different content. So we should serialize both and somehow rename the frame src in the containing
    196         // frame. Arg!
    197         return;
    198     }
    199 
    200     Vector<Node*> nodes;
    201     SerializerMarkupAccumulator accumulator(this, document, &nodes);
    202     WTF::TextEncoding textEncoding(document->charset());
    203     CString data;
    204     if (!textEncoding.isValid()) {
    205         // FIXME: iframes used as images trigger this. We should deal with them correctly.
    206         return;
    207     }
    208     String text = accumulator.serializeNodes(document->documentElement(), IncludeNode);
    209     CString frameHTML = textEncoding.normalizeAndEncode(text, WTF::EntitiesForUnencodables);
    210     m_resources->append(SerializedResource(url, document->suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length())));
    211     m_resourceURLs.add(url);
    212 
    213     for (Vector<Node*>::iterator iter = nodes.begin(); iter != nodes.end(); ++iter) {
    214         Node* node = *iter;
    215         if (!node->isElementNode())
    216             continue;
    217 
    218         Element* element = toElement(node);
    219         // We have to process in-line style as it might contain some resources (typically background images).
    220         if (element->isStyledElement())
    221             retrieveResourcesForProperties(element->inlineStyle(), document);
    222 
    223         if (element->hasTagName(HTMLNames::imgTag)) {
    224             HTMLImageElement* imageElement = toHTMLImageElement(element);
    225             KURL url = document->completeURL(imageElement->getAttribute(HTMLNames::srcAttr));
    226             ImageResource* cachedImage = imageElement->cachedImage();
    227             addImageToResources(cachedImage, imageElement->renderer(), url);
    228         } else if (element->hasTagName(HTMLNames::inputTag)) {
    229             HTMLInputElement* inputElement = toHTMLInputElement(element);
    230             if (inputElement->isImageButton() && inputElement->hasImageLoader()) {
    231                 KURL url = inputElement->src();
    232                 ImageResource* cachedImage = inputElement->imageLoader()->image();
    233                 addImageToResources(cachedImage, inputElement->renderer(), url);
    234             }
    235         } else if (element->hasTagName(HTMLNames::linkTag)) {
    236             HTMLLinkElement* linkElement = toHTMLLinkElement(element);
    237             if (CSSStyleSheet* sheet = linkElement->sheet()) {
    238                 KURL url = document->completeURL(linkElement->getAttribute(HTMLNames::hrefAttr));
    239                 serializeCSSStyleSheet(sheet, url);
    240                 ASSERT(m_resourceURLs.contains(url));
    241             }
    242         } else if (element->hasTagName(HTMLNames::styleTag)) {
    243             HTMLStyleElement* styleElement = toHTMLStyleElement(element);
    244             if (CSSStyleSheet* sheet = styleElement->sheet())
    245                 serializeCSSStyleSheet(sheet, KURL());
    246         }
    247     }
    248 
    249     for (Frame* childFrame = frame->tree()->firstChild(); childFrame; childFrame = childFrame->tree()->nextSibling())
    250         serializeFrame(childFrame);
    251 }
    252 
    253 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet* styleSheet, const KURL& url)
    254 {
    255     StringBuilder cssText;
    256     for (unsigned i = 0; i < styleSheet->length(); ++i) {
    257         CSSRule* rule = styleSheet->item(i);
    258         String itemText = rule->cssText();
    259         if (!itemText.isEmpty()) {
    260             cssText.append(itemText);
    261             if (i < styleSheet->length() - 1)
    262                 cssText.append("\n\n");
    263         }
    264         Document* document = styleSheet->ownerDocument();
    265         // Some rules have resources associated with them that we need to retrieve.
    266         if (rule->type() == CSSRule::IMPORT_RULE) {
    267             CSSImportRule* importRule = static_cast<CSSImportRule*>(rule);
    268             KURL importURL = document->completeURL(importRule->href());
    269             if (m_resourceURLs.contains(importURL))
    270                 continue;
    271             serializeCSSStyleSheet(importRule->styleSheet(), importURL);
    272         } else if (rule->type() == CSSRule::FONT_FACE_RULE) {
    273             // FIXME: Add support for font face rule. It is not clear to me at this point if the actual otf/eot file can
    274             // be retrieved from the CSSFontFaceRule object.
    275         } else if (rule->type() == CSSRule::STYLE_RULE) {
    276             retrieveResourcesForRule(static_cast<CSSStyleRule*>(rule)->styleRule(), document);
    277         }
    278     }
    279 
    280     if (url.isValid() && !m_resourceURLs.contains(url)) {
    281         // FIXME: We should check whether a charset has been specified and if none was found add one.
    282         WTF::TextEncoding textEncoding(styleSheet->contents()->charset());
    283         ASSERT(textEncoding.isValid());
    284         String textString = cssText.toString();
    285         CString text = textEncoding.normalizeAndEncode(textString, WTF::EntitiesForUnencodables);
    286         m_resources->append(SerializedResource(url, String("text/css"), SharedBuffer::create(text.data(), text.length())));
    287         m_resourceURLs.add(url);
    288     }
    289 }
    290 
    291 void PageSerializer::addImageToResources(ImageResource* image, RenderObject* imageRenderer, const KURL& url)
    292 {
    293     if (!url.isValid() || m_resourceURLs.contains(url) || url.protocolIsData())
    294         return;
    295 
    296     if (!image || image->image() == Image::nullImage())
    297         return;
    298 
    299     RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0;
    300     if (!data)
    301         data = image->image()->data();
    302 
    303     if (!data) {
    304         LOG_ERROR("No data for image %s", url.string().utf8().data());
    305         return;
    306     }
    307 
    308     String mimeType = image->response().mimeType();
    309     m_resources->append(SerializedResource(url, mimeType, data));
    310     m_resourceURLs.add(url);
    311 }
    312 
    313 void PageSerializer::retrieveResourcesForRule(StyleRule* rule, Document* document)
    314 {
    315     retrieveResourcesForProperties(rule->properties(), document);
    316 }
    317 
    318 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document* document)
    319 {
    320     if (!styleDeclaration)
    321         return;
    322 
    323     // The background-image and list-style-image (for ul or ol) are the CSS properties
    324     // that make use of images. We iterate to make sure we include any other
    325     // image properties there might be.
    326     unsigned propertyCount = styleDeclaration->propertyCount();
    327     for (unsigned i = 0; i < propertyCount; ++i) {
    328         RefPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value();
    329         if (!cssValue->isImageValue())
    330             continue;
    331 
    332         CSSImageValue* imageValue = toCSSImageValue(cssValue.get());
    333         StyleImage* styleImage = imageValue->cachedOrPendingImage();
    334         // Non cached-images are just place-holders and do not contain data.
    335         if (!styleImage || !styleImage->isImageResource())
    336             continue;
    337 
    338         ImageResource* image = static_cast<StyleFetchedImage*>(styleImage)->cachedImage();
    339         addImageToResources(image, 0, image->url());
    340     }
    341 }
    342 
    343 KURL PageSerializer::urlForBlankFrame(Frame* frame)
    344 {
    345     HashMap<Frame*, KURL>::iterator iter = m_blankFrameURLs.find(frame);
    346     if (iter != m_blankFrameURLs.end())
    347         return iter->value;
    348     String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++);
    349     KURL fakeURL(ParsedURLString, url);
    350     m_blankFrameURLs.add(frame, fakeURL);
    351 
    352     return fakeURL;
    353 }
    354 
    355 }
    356