1 /* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "core/page/PageSerializer.h" 33 34 #include "core/HTMLNames.h" 35 #include "core/InputTypeNames.h" 36 #include "core/css/CSSFontFaceRule.h" 37 #include "core/css/CSSFontFaceSrcValue.h" 38 #include "core/css/CSSImageValue.h" 39 #include "core/css/CSSImportRule.h" 40 #include "core/css/CSSStyleDeclaration.h" 41 #include "core/css/CSSStyleRule.h" 42 #include "core/css/CSSValueList.h" 43 #include "core/css/StylePropertySet.h" 44 #include "core/css/StyleRule.h" 45 #include "core/css/StyleSheetContents.h" 46 #include "core/dom/Document.h" 47 #include "core/dom/Element.h" 48 #include "core/dom/Text.h" 49 #include "core/editing/MarkupAccumulator.h" 50 #include "core/fetch/FontResource.h" 51 #include "core/fetch/ImageResource.h" 52 #include "core/frame/LocalFrame.h" 53 #include "core/html/HTMLFrameOwnerElement.h" 54 #include "core/html/HTMLImageElement.h" 55 #include "core/html/HTMLInputElement.h" 56 #include "core/html/HTMLLinkElement.h" 57 #include "core/html/HTMLMetaElement.h" 58 #include "core/html/HTMLStyleElement.h" 59 #include "core/html/parser/HTMLParserIdioms.h" 60 #include "core/page/Page.h" 61 #include "core/rendering/RenderImage.h" 62 #include "core/rendering/style/StyleFetchedImage.h" 63 #include "core/rendering/style/StyleImage.h" 64 #include "platform/SerializedResource.h" 65 #include "platform/graphics/Image.h" 66 #include "wtf/text/CString.h" 67 #include "wtf/text/StringBuilder.h" 68 #include "wtf/text/TextEncoding.h" 69 #include "wtf/text/WTFString.h" 70 71 namespace blink { 72 73 static bool isCharsetSpecifyingNode(const Node& node) 74 { 75 if (!isHTMLMetaElement(node)) 76 return false; 77 78 const HTMLMetaElement& element = toHTMLMetaElement(node); 79 HTMLAttributeList attributeList; 80 AttributeCollection attributes = element.attributes(); 81 AttributeCollection::iterator end = attributes.end(); 82 for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) { 83 // FIXME: We should deal appropriately with the attribute if they have a namespace. 84 attributeList.append(std::make_pair(it->name().localName(), it->value().string())); 85 } 86 WTF::TextEncoding textEncoding = encodingFromMetaAttributes(attributeList); 87 return textEncoding.isValid(); 88 } 89 90 static bool shouldIgnoreElement(const Element& element) 91 { 92 return isHTMLScriptElement(element) || isHTMLNoScriptElement(element) || isCharsetSpecifyingNode(element); 93 } 94 95 static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner) 96 { 97 // FIXME: We should support all frame owners including applets. 98 return isHTMLObjectElement(frameOwner) ? HTMLNames::dataAttr : HTMLNames::srcAttr; 99 } 100 101 class SerializerMarkupAccumulator FINAL : public MarkupAccumulator { 102 public: 103 SerializerMarkupAccumulator(PageSerializer*, const Document&, WillBeHeapVector<RawPtrWillBeMember<Node> >*); 104 virtual ~SerializerMarkupAccumulator(); 105 106 protected: 107 virtual void appendText(StringBuilder& out, Text&) OVERRIDE; 108 virtual void appendElement(StringBuilder& out, Element&, Namespaces*) OVERRIDE; 109 virtual void appendCustomAttributes(StringBuilder& out, const Element&, Namespaces*) OVERRIDE; 110 virtual void appendEndTag(const Element&) OVERRIDE; 111 112 private: 113 PageSerializer* m_serializer; 114 const Document& m_document; 115 }; 116 117 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node> >* nodes) 118 : MarkupAccumulator(nodes, ResolveAllURLs, nullptr) 119 , m_serializer(serializer) 120 , m_document(document) 121 { 122 } 123 124 SerializerMarkupAccumulator::~SerializerMarkupAccumulator() 125 { 126 } 127 128 void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text& text) 129 { 130 Element* parent = text.parentElement(); 131 if (parent && !shouldIgnoreElement(*parent)) 132 MarkupAccumulator::appendText(out, text); 133 } 134 135 void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element& element, Namespaces* namespaces) 136 { 137 if (!shouldIgnoreElement(element)) 138 MarkupAccumulator::appendElement(out, element, namespaces); 139 140 if (isHTMLHeadElement(element)) { 141 out.appendLiteral("<meta charset=\""); 142 out.append(m_document.charset()); 143 out.appendLiteral("\">"); 144 } 145 146 // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents. 147 } 148 149 void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, const Element& element, Namespaces* namespaces) 150 { 151 if (!element.isFrameOwnerElement()) 152 return; 153 154 const HTMLFrameOwnerElement& frameOwner = toHTMLFrameOwnerElement(element); 155 Frame* frame = frameOwner.contentFrame(); 156 // FIXME: RemoteFrames not currently supported here. 157 if (!frame || !frame->isLocalFrame()) 158 return; 159 160 KURL url = toLocalFrame(frame)->document()->url(); 161 if (url.isValid() && !url.protocolIsAbout()) 162 return; 163 164 // We need to give a fake location to blank frames so they can be referenced by the serialized frame. 165 url = m_serializer->urlForBlankFrame(toLocalFrame(frame)); 166 appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(frameOwner), AtomicString(url.string())), namespaces); 167 } 168 169 void SerializerMarkupAccumulator::appendEndTag(const Element& element) 170 { 171 if (!shouldIgnoreElement(element)) 172 MarkupAccumulator::appendEndTag(element); 173 } 174 175 PageSerializer::PageSerializer(Vector<SerializedResource>* resources) 176 : m_resources(resources) 177 , m_blankFrameCounter(0) 178 { 179 } 180 181 void PageSerializer::serialize(Page* page) 182 { 183 serializeFrame(page->deprecatedLocalMainFrame()); 184 } 185 186 void PageSerializer::serializeFrame(LocalFrame* frame) 187 { 188 ASSERT(frame->document()); 189 Document& document = *frame->document(); 190 KURL url = document.url(); 191 // FIXME: This probably wants isAboutBlankURL? to exclude other about: urls (like about:srcdoc)? 192 if (!url.isValid() || url.protocolIsAbout()) { 193 // For blank frames we generate a fake URL so they can be referenced by their containing frame. 194 url = urlForBlankFrame(frame); 195 } 196 197 if (m_resourceURLs.contains(url)) { 198 // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now 199 // different content. So we should serialize both and somehow rename the frame src in the containing 200 // frame. Arg! 201 return; 202 } 203 204 WTF::TextEncoding textEncoding(document.charset()); 205 if (!textEncoding.isValid()) { 206 // FIXME: iframes used as images trigger this. We should deal with them correctly. 207 return; 208 } 209 210 WillBeHeapVector<RawPtrWillBeMember<Node> > serializedNodes; 211 SerializerMarkupAccumulator accumulator(this, document, &serializedNodes); 212 String text = accumulator.serializeNodes(document, IncludeNode); 213 CString frameHTML = textEncoding.normalizeAndEncode(text, WTF::EntitiesForUnencodables); 214 m_resources->append(SerializedResource(url, document.suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length()))); 215 m_resourceURLs.add(url); 216 217 for (WillBeHeapVector<RawPtrWillBeMember<Node> >::iterator iter = serializedNodes.begin(); iter != serializedNodes.end(); ++iter) { 218 ASSERT(*iter); 219 Node& node = **iter; 220 if (!node.isElementNode()) 221 continue; 222 223 Element& element = toElement(node); 224 // We have to process in-line style as it might contain some resources (typically background images). 225 if (element.isStyledElement()) 226 retrieveResourcesForProperties(element.inlineStyle(), document); 227 228 if (isHTMLImageElement(element)) { 229 HTMLImageElement& imageElement = toHTMLImageElement(element); 230 KURL url = document.completeURL(imageElement.getAttribute(HTMLNames::srcAttr)); 231 ImageResource* cachedImage = imageElement.cachedImage(); 232 addImageToResources(cachedImage, imageElement.renderer(), url); 233 } else if (isHTMLInputElement(element)) { 234 HTMLInputElement& inputElement = toHTMLInputElement(element); 235 if (inputElement.type() == InputTypeNames::image && inputElement.hasImageLoader()) { 236 KURL url = inputElement.src(); 237 ImageResource* cachedImage = inputElement.imageLoader()->image(); 238 addImageToResources(cachedImage, inputElement.renderer(), url); 239 } 240 } else if (isHTMLLinkElement(element)) { 241 HTMLLinkElement& linkElement = toHTMLLinkElement(element); 242 if (CSSStyleSheet* sheet = linkElement.sheet()) { 243 KURL url = document.completeURL(linkElement.getAttribute(HTMLNames::hrefAttr)); 244 serializeCSSStyleSheet(*sheet, url); 245 ASSERT(m_resourceURLs.contains(url)); 246 } 247 } else if (isHTMLStyleElement(element)) { 248 HTMLStyleElement& styleElement = toHTMLStyleElement(element); 249 if (CSSStyleSheet* sheet = styleElement.sheet()) 250 serializeCSSStyleSheet(*sheet, KURL()); 251 } 252 } 253 254 for (Frame* childFrame = frame->tree().firstChild(); childFrame; childFrame = childFrame->tree().nextSibling()) { 255 if (childFrame->isLocalFrame()) 256 serializeFrame(toLocalFrame(childFrame)); 257 } 258 } 259 260 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, const KURL& url) 261 { 262 StringBuilder cssText; 263 for (unsigned i = 0; i < styleSheet.length(); ++i) { 264 CSSRule* rule = styleSheet.item(i); 265 String itemText = rule->cssText(); 266 if (!itemText.isEmpty()) { 267 cssText.append(itemText); 268 if (i < styleSheet.length() - 1) 269 cssText.appendLiteral("\n\n"); 270 } 271 ASSERT(styleSheet.ownerDocument()); 272 Document& document = *styleSheet.ownerDocument(); 273 // Some rules have resources associated with them that we need to retrieve. 274 if (rule->type() == CSSRule::IMPORT_RULE) { 275 CSSImportRule* importRule = toCSSImportRule(rule); 276 KURL importURL = document.completeURL(importRule->href()); 277 if (m_resourceURLs.contains(importURL)) 278 continue; 279 if (importRule->styleSheet()) 280 serializeCSSStyleSheet(*importRule->styleSheet(), importURL); 281 } else if (rule->type() == CSSRule::FONT_FACE_RULE) { 282 retrieveResourcesForProperties(&toCSSFontFaceRule(rule)->styleRule()->properties(), document); 283 } else if (rule->type() == CSSRule::STYLE_RULE) { 284 retrieveResourcesForProperties(&toCSSStyleRule(rule)->styleRule()->properties(), document); 285 } 286 } 287 288 if (url.isValid() && !m_resourceURLs.contains(url)) { 289 // FIXME: We should check whether a charset has been specified and if none was found add one. 290 WTF::TextEncoding textEncoding(styleSheet.contents()->charset()); 291 ASSERT(textEncoding.isValid()); 292 String textString = cssText.toString(); 293 CString text = textEncoding.normalizeAndEncode(textString, WTF::EntitiesForUnencodables); 294 m_resources->append(SerializedResource(url, String("text/css"), SharedBuffer::create(text.data(), text.length()))); 295 m_resourceURLs.add(url); 296 } 297 } 298 299 bool PageSerializer::shouldAddURL(const KURL& url) 300 { 301 return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData(); 302 } 303 304 void PageSerializer::addToResources(Resource* resource, PassRefPtr<SharedBuffer> data, const KURL& url) 305 { 306 if (!data) { 307 WTF_LOG_ERROR("No data for resource %s", url.string().utf8().data()); 308 return; 309 } 310 311 String mimeType = resource->response().mimeType(); 312 m_resources->append(SerializedResource(url, mimeType, data)); 313 m_resourceURLs.add(url); 314 } 315 316 void PageSerializer::addImageToResources(ImageResource* image, RenderObject* imageRenderer, const KURL& url) 317 { 318 if (!shouldAddURL(url)) 319 return; 320 321 if (!image || image->image() == Image::nullImage() || image->errorOccurred()) 322 return; 323 324 RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0; 325 if (!data) 326 data = image->image()->data(); 327 328 addToResources(image, data, url); 329 } 330 331 void PageSerializer::addFontToResources(FontResource* font) 332 { 333 if (!font || !shouldAddURL(font->url()) || !font->isLoaded() || !font->resourceBuffer()) { 334 return; 335 } 336 RefPtr<SharedBuffer> data(font->resourceBuffer()); 337 338 addToResources(font, data, font->url()); 339 } 340 341 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document& document) 342 { 343 if (!styleDeclaration) 344 return; 345 346 // The background-image and list-style-image (for ul or ol) are the CSS properties 347 // that make use of images. We iterate to make sure we include any other 348 // image properties there might be. 349 unsigned propertyCount = styleDeclaration->propertyCount(); 350 for (unsigned i = 0; i < propertyCount; ++i) { 351 RefPtrWillBeRawPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value(); 352 retrieveResourcesForCSSValue(cssValue.get(), document); 353 } 354 } 355 356 void PageSerializer::retrieveResourcesForCSSValue(CSSValue* cssValue, Document& document) 357 { 358 if (cssValue->isImageValue()) { 359 CSSImageValue* imageValue = toCSSImageValue(cssValue); 360 StyleImage* styleImage = imageValue->cachedOrPendingImage(); 361 // Non cached-images are just place-holders and do not contain data. 362 if (!styleImage || !styleImage->isImageResource()) 363 return; 364 365 addImageToResources(styleImage->cachedImage(), 0, styleImage->cachedImage()->url()); 366 } else if (cssValue->isFontFaceSrcValue()) { 367 CSSFontFaceSrcValue* fontFaceSrcValue = toCSSFontFaceSrcValue(cssValue); 368 if (fontFaceSrcValue->isLocal()) { 369 return; 370 } 371 372 addFontToResources(fontFaceSrcValue->fetch(&document)); 373 } else if (cssValue->isValueList()) { 374 CSSValueList* cssValueList = toCSSValueList(cssValue); 375 for (unsigned i = 0; i < cssValueList->length(); i++) 376 retrieveResourcesForCSSValue(cssValueList->item(i), document); 377 } 378 } 379 380 KURL PageSerializer::urlForBlankFrame(LocalFrame* frame) 381 { 382 HashMap<LocalFrame*, KURL>::iterator iter = m_blankFrameURLs.find(frame); 383 if (iter != m_blankFrameURLs.end()) 384 return iter->value; 385 String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++); 386 KURL fakeURL(ParsedURLString, url); 387 m_blankFrameURLs.add(frame, fakeURL); 388 389 return fakeURL; 390 } 391 392 } 393