1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "core/page/PageSerializer.h" 33 34 #include "core/HTMLNames.h" 35 #include "core/css/CSSFontFaceRule.h" 36 #include "core/css/CSSFontFaceSrcValue.h" 37 #include "core/css/CSSImageValue.h" 38 #include "core/css/CSSImportRule.h" 39 #include "core/css/CSSStyleDeclaration.h" 40 #include "core/css/CSSStyleRule.h" 41 #include "core/css/CSSValueList.h" 42 #include "core/css/StylePropertySet.h" 43 #include "core/css/StyleRule.h" 44 #include "core/css/StyleSheetContents.h" 45 #include "core/dom/Document.h" 46 #include "core/dom/Element.h" 47 #include "core/dom/Text.h" 48 #include "core/editing/MarkupAccumulator.h" 49 #include "core/fetch/FontResource.h" 50 #include "core/fetch/ImageResource.h" 51 #include "core/frame/LocalFrame.h" 52 #include "core/html/HTMLFrameOwnerElement.h" 53 #include "core/html/HTMLImageElement.h" 54 #include "core/html/HTMLInputElement.h" 55 #include "core/html/HTMLLinkElement.h" 56 #include "core/html/HTMLMetaElement.h" 57 #include "core/html/HTMLStyleElement.h" 58 #include "core/html/parser/HTMLParserIdioms.h" 59 #include "core/page/Page.h" 60 #include "core/rendering/RenderImage.h" 61 #include "core/rendering/style/StyleFetchedImage.h" 62 #include "core/rendering/style/StyleImage.h" 63 #include "platform/SerializedResource.h" 64 #include "platform/graphics/Image.h" 65 #include "wtf/text/CString.h" 66 #include "wtf/text/StringBuilder.h" 67 #include "wtf/text/TextEncoding.h" 68 #include "wtf/text/WTFString.h" 69 70 namespace WebCore { 71 72 static bool isCharsetSpecifyingNode(const Node& node) 73 { 74 if (!isHTMLMetaElement(node)) 75 return false; 76 77 const HTMLMetaElement& element = toHTMLMetaElement(node); 78 HTMLAttributeList attributeList; 79 if (element.hasAttributes()) { 80 AttributeCollection attributes = element.attributes(); 81 AttributeCollection::const_iterator end = attributes.end(); 82 for (AttributeCollection::const_iterator it = attributes.begin(); it != end; ++it) { 83 // FIXME: We should deal appropriately with the attribute if they have a namespace. 84 attributeList.append(std::make_pair(it->name().localName(), it->value().string())); 85 } 86 } 87 WTF::TextEncoding textEncoding = encodingFromMetaAttributes(attributeList); 88 return textEncoding.isValid(); 89 } 90 91 static bool shouldIgnoreElement(const Element& element) 92 { 93 return isHTMLScriptElement(element) || isHTMLNoScriptElement(element) || isCharsetSpecifyingNode(element); 94 } 95 96 static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner) 97 { 98 // FIXME: We should support all frame owners including applets. 99 return isHTMLObjectElement(frameOwner) ? HTMLNames::dataAttr : HTMLNames::srcAttr; 100 } 101 102 class SerializerMarkupAccumulator FINAL : public MarkupAccumulator { 103 public: 104 SerializerMarkupAccumulator(PageSerializer*, const Document&, WillBeHeapVector<RawPtrWillBeMember<Node> >*); 105 virtual ~SerializerMarkupAccumulator(); 106 107 protected: 108 virtual void appendText(StringBuilder& out, Text&) OVERRIDE; 109 virtual void appendElement(StringBuilder& out, Element&, Namespaces*) OVERRIDE; 110 virtual void appendCustomAttributes(StringBuilder& out, const Element&, Namespaces*) OVERRIDE; 111 virtual void appendEndTag(const Node&) OVERRIDE; 112 113 private: 114 PageSerializer* m_serializer; 115 const Document& m_document; 116 }; 117 118 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node> >* nodes) 119 : MarkupAccumulator(nodes, ResolveAllURLs, nullptr) 120 , m_serializer(serializer) 121 , m_document(document) 122 { 123 } 124 125 SerializerMarkupAccumulator::~SerializerMarkupAccumulator() 126 { 127 } 128 129 void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text& text) 130 { 131 Element* parent = text.parentElement(); 132 if (parent && !shouldIgnoreElement(*parent)) 133 MarkupAccumulator::appendText(out, text); 134 } 135 136 void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element& element, Namespaces* namespaces) 137 { 138 if (!shouldIgnoreElement(element)) 139 MarkupAccumulator::appendElement(out, element, namespaces); 140 141 if (isHTMLHeadElement(element)) { 142 out.append("<meta charset=\""); 143 out.append(m_document.charset()); 144 out.append("\">"); 145 } 146 147 // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents. 148 } 149 150 void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, const Element& element, Namespaces* namespaces) 151 { 152 if (!element.isFrameOwnerElement()) 153 return; 154 155 const HTMLFrameOwnerElement& frameOwner = toHTMLFrameOwnerElement(element); 156 Frame* frame = frameOwner.contentFrame(); 157 // FIXME: RemoteFrames not currently supported here. 158 if (!frame || !frame->isLocalFrame()) 159 return; 160 161 KURL url = toLocalFrame(frame)->document()->url(); 162 if (url.isValid() && !url.protocolIsAbout()) 163 return; 164 165 // We need to give a fake location to blank frames so they can be referenced by the serialized frame. 166 url = m_serializer->urlForBlankFrame(toLocalFrame(frame)); 167 appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(frameOwner), AtomicString(url.string())), namespaces); 168 } 169 170 void SerializerMarkupAccumulator::appendEndTag(const Node& node) 171 { 172 if (node.isElementNode() && !shouldIgnoreElement(toElement(node))) 173 MarkupAccumulator::appendEndTag(node); 174 } 175 176 PageSerializer::PageSerializer(Vector<SerializedResource>* resources) 177 : m_resources(resources) 178 , m_blankFrameCounter(0) 179 { 180 } 181 182 void PageSerializer::serialize(Page* page) 183 { 184 serializeFrame(page->deprecatedLocalMainFrame()); 185 } 186 187 void PageSerializer::serializeFrame(LocalFrame* frame) 188 { 189 ASSERT(frame->document()); 190 Document& document = *frame->document(); 191 KURL url = document.url(); 192 // FIXME: This probably wants isAboutBlankURL? to exclude other about: urls (like about:srcdoc)? 193 if (!url.isValid() || url.protocolIsAbout()) { 194 // For blank frames we generate a fake URL so they can be referenced by their containing frame. 195 url = urlForBlankFrame(frame); 196 } 197 198 if (m_resourceURLs.contains(url)) { 199 // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now 200 // different content. So we should serialize both and somehow rename the frame src in the containing 201 // frame. Arg! 202 return; 203 } 204 205 WTF::TextEncoding textEncoding(document.charset()); 206 if (!textEncoding.isValid()) { 207 // FIXME: iframes used as images trigger this. We should deal with them correctly. 208 return; 209 } 210 211 WillBeHeapVector<RawPtrWillBeMember<Node> > serializedNodes; 212 SerializerMarkupAccumulator accumulator(this, document, &serializedNodes); 213 String text = accumulator.serializeNodes(document, IncludeNode); 214 CString frameHTML = textEncoding.normalizeAndEncode(text, WTF::EntitiesForUnencodables); 215 m_resources->append(SerializedResource(url, document.suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length()))); 216 m_resourceURLs.add(url); 217 218 for (WillBeHeapVector<RawPtrWillBeMember<Node> >::iterator iter = serializedNodes.begin(); iter != serializedNodes.end(); ++iter) { 219 ASSERT(*iter); 220 Node& node = **iter; 221 if (!node.isElementNode()) 222 continue; 223 224 Element& element = toElement(node); 225 // We have to process in-line style as it might contain some resources (typically background images). 226 if (element.isStyledElement()) 227 retrieveResourcesForProperties(element.inlineStyle(), document); 228 229 if (isHTMLImageElement(element)) { 230 HTMLImageElement& imageElement = toHTMLImageElement(element); 231 KURL url = document.completeURL(imageElement.getAttribute(HTMLNames::srcAttr)); 232 ImageResource* cachedImage = imageElement.cachedImage(); 233 addImageToResources(cachedImage, imageElement.renderer(), url); 234 } else if (isHTMLInputElement(element)) { 235 HTMLInputElement& inputElement = toHTMLInputElement(element); 236 if (inputElement.isImageButton() && inputElement.hasImageLoader()) { 237 KURL url = inputElement.src(); 238 ImageResource* cachedImage = inputElement.imageLoader()->image(); 239 addImageToResources(cachedImage, inputElement.renderer(), url); 240 } 241 } else if (isHTMLLinkElement(element)) { 242 HTMLLinkElement& linkElement = toHTMLLinkElement(element); 243 if (CSSStyleSheet* sheet = linkElement.sheet()) { 244 KURL url = document.completeURL(linkElement.getAttribute(HTMLNames::hrefAttr)); 245 serializeCSSStyleSheet(*sheet, url); 246 ASSERT(m_resourceURLs.contains(url)); 247 } 248 } else if (isHTMLStyleElement(element)) { 249 HTMLStyleElement& styleElement = toHTMLStyleElement(element); 250 if (CSSStyleSheet* sheet = styleElement.sheet()) 251 serializeCSSStyleSheet(*sheet, KURL()); 252 } 253 } 254 255 for (Frame* childFrame = frame->tree().firstChild(); childFrame; childFrame = childFrame->tree().nextSibling()) { 256 if (childFrame->isLocalFrame()) 257 serializeFrame(toLocalFrame(childFrame)); 258 } 259 } 260 261 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, const KURL& url) 262 { 263 StringBuilder cssText; 264 for (unsigned i = 0; i < styleSheet.length(); ++i) { 265 CSSRule* rule = styleSheet.item(i); 266 String itemText = rule->cssText(); 267 if (!itemText.isEmpty()) { 268 cssText.append(itemText); 269 if (i < styleSheet.length() - 1) 270 cssText.append("\n\n"); 271 } 272 ASSERT(styleSheet.ownerDocument()); 273 Document& document = *styleSheet.ownerDocument(); 274 // Some rules have resources associated with them that we need to retrieve. 275 if (rule->type() == CSSRule::IMPORT_RULE) { 276 CSSImportRule* importRule = toCSSImportRule(rule); 277 KURL importURL = document.completeURL(importRule->href()); 278 if (m_resourceURLs.contains(importURL)) 279 continue; 280 if (importRule->styleSheet()) 281 serializeCSSStyleSheet(*importRule->styleSheet(), importURL); 282 } else if (rule->type() == CSSRule::FONT_FACE_RULE) { 283 retrieveResourcesForProperties(&toCSSFontFaceRule(rule)->styleRule()->properties(), document); 284 } else if (rule->type() == CSSRule::STYLE_RULE) { 285 retrieveResourcesForProperties(&toCSSStyleRule(rule)->styleRule()->properties(), document); 286 } 287 } 288 289 if (url.isValid() && !m_resourceURLs.contains(url)) { 290 // FIXME: We should check whether a charset has been specified and if none was found add one. 291 WTF::TextEncoding textEncoding(styleSheet.contents()->charset()); 292 ASSERT(textEncoding.isValid()); 293 String textString = cssText.toString(); 294 CString text = textEncoding.normalizeAndEncode(textString, WTF::EntitiesForUnencodables); 295 m_resources->append(SerializedResource(url, String("text/css"), SharedBuffer::create(text.data(), text.length()))); 296 m_resourceURLs.add(url); 297 } 298 } 299 300 bool PageSerializer::shouldAddURL(const KURL& url) 301 { 302 return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData(); 303 } 304 305 void PageSerializer::addToResources(Resource* resource, PassRefPtr<SharedBuffer> data, const KURL& url) 306 { 307 if (!data) { 308 WTF_LOG_ERROR("No data for resource %s", url.string().utf8().data()); 309 return; 310 } 311 312 String mimeType = resource->response().mimeType(); 313 m_resources->append(SerializedResource(url, mimeType, data)); 314 m_resourceURLs.add(url); 315 } 316 317 void PageSerializer::addImageToResources(ImageResource* image, RenderObject* imageRenderer, const KURL& url) 318 { 319 if (!shouldAddURL(url)) 320 return; 321 322 if (!image || image->image() == Image::nullImage() || image->errorOccurred()) 323 return; 324 325 RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0; 326 if (!data) 327 data = image->image()->data(); 328 329 addToResources(image, data, url); 330 } 331 332 void PageSerializer::addFontToResources(FontResource* font) 333 { 334 if (!font || !shouldAddURL(font->url()) || !font->isLoaded() || !font->resourceBuffer()) { 335 return; 336 } 337 RefPtr<SharedBuffer> data(font->resourceBuffer()); 338 339 addToResources(font, data, font->url()); 340 } 341 342 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document& document) 343 { 344 if (!styleDeclaration) 345 return; 346 347 // The background-image and list-style-image (for ul or ol) are the CSS properties 348 // that make use of images. We iterate to make sure we include any other 349 // image properties there might be. 350 unsigned propertyCount = styleDeclaration->propertyCount(); 351 for (unsigned i = 0; i < propertyCount; ++i) { 352 RefPtrWillBeRawPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value(); 353 retrieveResourcesForCSSValue(cssValue.get(), document); 354 } 355 } 356 357 void PageSerializer::retrieveResourcesForCSSValue(CSSValue* cssValue, Document& document) 358 { 359 if (cssValue->isImageValue()) { 360 CSSImageValue* imageValue = toCSSImageValue(cssValue); 361 StyleImage* styleImage = imageValue->cachedOrPendingImage(); 362 // Non cached-images are just place-holders and do not contain data. 363 if (!styleImage || !styleImage->isImageResource()) 364 return; 365 366 addImageToResources(styleImage->cachedImage(), 0, styleImage->cachedImage()->url()); 367 } else if (cssValue->isFontFaceSrcValue()) { 368 CSSFontFaceSrcValue* fontFaceSrcValue = toCSSFontFaceSrcValue(cssValue); 369 if (fontFaceSrcValue->isLocal()) { 370 return; 371 } 372 373 addFontToResources(fontFaceSrcValue->fetch(&document)); 374 } else if (cssValue->isValueList()) { 375 CSSValueList* cssValueList = toCSSValueList(cssValue); 376 for (unsigned i = 0; i < cssValueList->length(); i++) 377 retrieveResourcesForCSSValue(cssValueList->item(i), document); 378 } 379 } 380 381 KURL PageSerializer::urlForBlankFrame(LocalFrame* frame) 382 { 383 HashMap<LocalFrame*, KURL>::iterator iter = m_blankFrameURLs.find(frame); 384 if (iter != m_blankFrameURLs.end()) 385 return iter->value; 386 String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++); 387 KURL fakeURL(ParsedURLString, url); 388 m_blankFrameURLs.add(frame, fakeURL); 389 390 return fakeURL; 391 } 392 393 } 394