1 /* 2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2009, 2010 Google Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 18 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 20 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "MarkupAccumulator.h" 29 30 #include "CDATASection.h" 31 #include "Comment.h" 32 #include "DocumentFragment.h" 33 #include "DocumentType.h" 34 #include "Editor.h" 35 #include "HTMLElement.h" 36 #include "HTMLNames.h" 37 #include "KURL.h" 38 #include "ProcessingInstruction.h" 39 #include "XMLNSNames.h" 40 #include <wtf/unicode/CharacterNames.h> 41 42 namespace WebCore { 43 44 using namespace HTMLNames; 45 46 void appendCharactersReplacingEntities(Vector<UChar>& out, const UChar* content, size_t length, EntityMask entityMask) 47 { 48 DEFINE_STATIC_LOCAL(const String, ampReference, ("&")); 49 DEFINE_STATIC_LOCAL(const String, ltReference, ("<")); 50 DEFINE_STATIC_LOCAL(const String, gtReference, (">")); 51 DEFINE_STATIC_LOCAL(const String, quotReference, (""")); 52 DEFINE_STATIC_LOCAL(const String, nbspReference, (" ")); 53 54 static const EntityDescription entityMaps[] = { 55 { '&', ampReference, EntityAmp }, 56 { '<', ltReference, EntityLt }, 57 { '>', gtReference, EntityGt }, 58 { '"', quotReference, EntityQuot }, 59 { noBreakSpace, nbspReference, EntityNbsp }, 60 }; 61 62 size_t positionAfterLastEntity = 0; 63 for (size_t i = 0; i < length; ++i) { 64 for (size_t m = 0; m < WTF_ARRAY_LENGTH(entityMaps); ++m) { 65 if (content[i] == entityMaps[m].entity && entityMaps[m].mask & entityMask) { 66 out.append(content + positionAfterLastEntity, i - positionAfterLastEntity); 67 append(out, entityMaps[m].reference); 68 positionAfterLastEntity = i + 1; 69 break; 70 } 71 } 72 } 73 out.append(content + positionAfterLastEntity, length - positionAfterLastEntity); 74 } 75 76 MarkupAccumulator::MarkupAccumulator(Vector<Node*>* nodes, EAbsoluteURLs shouldResolveURLs, const Range* range) 77 : m_nodes(nodes) 78 , m_range(range) 79 , m_shouldResolveURLs(shouldResolveURLs) 80 { 81 } 82 83 MarkupAccumulator::~MarkupAccumulator() 84 { 85 } 86 87 String MarkupAccumulator::serializeNodes(Node* node, Node* nodeToSkip, EChildrenOnly childrenOnly) 88 { 89 Vector<UChar> out; 90 serializeNodesWithNamespaces(node, nodeToSkip, childrenOnly, 0); 91 out.reserveInitialCapacity(length()); 92 concatenateMarkup(out); 93 return String::adopt(out); 94 } 95 96 void MarkupAccumulator::serializeNodesWithNamespaces(Node* node, Node* nodeToSkip, EChildrenOnly childrenOnly, const Namespaces* namespaces) 97 { 98 if (node == nodeToSkip) 99 return; 100 101 Namespaces namespaceHash; 102 if (namespaces) 103 namespaceHash = *namespaces; 104 105 if (!childrenOnly) 106 appendStartTag(node, &namespaceHash); 107 108 if (!(node->document()->isHTMLDocument() && elementCannotHaveEndTag(node))) { 109 for (Node* current = node->firstChild(); current; current = current->nextSibling()) 110 serializeNodesWithNamespaces(current, nodeToSkip, IncludeNode, &namespaceHash); 111 } 112 113 if (!childrenOnly) 114 appendEndTag(node); 115 } 116 117 void MarkupAccumulator::appendString(const String& string) 118 { 119 m_succeedingMarkup.append(string); 120 } 121 122 void MarkupAccumulator::appendStartTag(Node* node, Namespaces* namespaces) 123 { 124 Vector<UChar> markup; 125 appendStartMarkup(markup, node, namespaces); 126 appendString(String::adopt(markup)); 127 if (m_nodes) 128 m_nodes->append(node); 129 } 130 131 void MarkupAccumulator::appendEndTag(Node* node) 132 { 133 Vector<UChar> markup; 134 appendEndMarkup(markup, node); 135 appendString(String::adopt(markup)); 136 } 137 138 size_t MarkupAccumulator::totalLength(const Vector<String>& strings) 139 { 140 size_t length = 0; 141 for (size_t i = 0; i < strings.size(); ++i) 142 length += strings[i].length(); 143 return length; 144 } 145 146 // FIXME: This is a very inefficient way of accumulating the markup. 147 // We're converting results of appendStartMarkup and appendEndMarkup from Vector<UChar> to String 148 // and then back to Vector<UChar> and again to String here. 149 void MarkupAccumulator::concatenateMarkup(Vector<UChar>& out) 150 { 151 for (size_t i = 0; i < m_succeedingMarkup.size(); ++i) 152 append(out, m_succeedingMarkup[i]); 153 } 154 155 void MarkupAccumulator::appendAttributeValue(Vector<UChar>& result, const String& attribute, bool documentIsHTML) 156 { 157 appendCharactersReplacingEntities(result, attribute.characters(), attribute.length(), 158 documentIsHTML ? EntityMaskInHTMLAttributeValue : EntityMaskInAttributeValue); 159 } 160 161 void MarkupAccumulator::appendQuotedURLAttributeValue(Vector<UChar>& result, const String& urlString) 162 { 163 UChar quoteChar = '\"'; 164 String strippedURLString = urlString.stripWhiteSpace(); 165 if (protocolIsJavaScript(strippedURLString)) { 166 // minimal escaping for javascript urls 167 if (strippedURLString.contains('"')) { 168 if (strippedURLString.contains('\'')) 169 strippedURLString.replace('\"', """); 170 else 171 quoteChar = '\''; 172 } 173 result.append(quoteChar); 174 append(result, strippedURLString); 175 result.append(quoteChar); 176 return; 177 } 178 179 // FIXME: This does not fully match other browsers. Firefox percent-escapes non-ASCII characters for innerHTML. 180 result.append(quoteChar); 181 appendAttributeValue(result, urlString, false); 182 result.append(quoteChar); 183 } 184 185 void MarkupAccumulator::appendNodeValue(Vector<UChar>& out, const Node* node, const Range* range, EntityMask entityMask) 186 { 187 String str = node->nodeValue(); 188 const UChar* characters = str.characters(); 189 size_t length = str.length(); 190 191 if (range) { 192 ExceptionCode ec; 193 if (node == range->endContainer(ec)) 194 length = range->endOffset(ec); 195 if (node == range->startContainer(ec)) { 196 size_t start = range->startOffset(ec); 197 characters += start; 198 length -= start; 199 } 200 } 201 202 appendCharactersReplacingEntities(out, characters, length, entityMask); 203 } 204 205 bool MarkupAccumulator::shouldAddNamespaceElement(const Element* element) 206 { 207 // Don't add namespace attribute if it is already defined for this elem. 208 const AtomicString& prefix = element->prefix(); 209 AtomicString attr = !prefix.isEmpty() ? "xmlns:" + prefix : "xmlns"; 210 return !element->hasAttribute(attr); 211 } 212 213 bool MarkupAccumulator::shouldAddNamespaceAttribute(const Attribute& attribute, Namespaces& namespaces) 214 { 215 namespaces.checkConsistency(); 216 217 // Don't add namespace attributes twice 218 if (attribute.name() == XMLNSNames::xmlnsAttr) { 219 namespaces.set(emptyAtom.impl(), attribute.value().impl()); 220 return false; 221 } 222 223 QualifiedName xmlnsPrefixAttr(xmlnsAtom, attribute.localName(), XMLNSNames::xmlnsNamespaceURI); 224 if (attribute.name() == xmlnsPrefixAttr) { 225 namespaces.set(attribute.localName().impl(), attribute.value().impl()); 226 return false; 227 } 228 229 return true; 230 } 231 232 void MarkupAccumulator::appendNamespace(Vector<UChar>& result, const AtomicString& prefix, const AtomicString& namespaceURI, Namespaces& namespaces) 233 { 234 namespaces.checkConsistency(); 235 if (namespaceURI.isEmpty()) 236 return; 237 238 // Use emptyAtoms's impl() for both null and empty strings since the HashMap can't handle 0 as a key 239 AtomicStringImpl* pre = prefix.isEmpty() ? emptyAtom.impl() : prefix.impl(); 240 AtomicStringImpl* foundNS = namespaces.get(pre); 241 if (foundNS != namespaceURI.impl()) { 242 namespaces.set(pre, namespaceURI.impl()); 243 result.append(' '); 244 append(result, xmlnsAtom.string()); 245 if (!prefix.isEmpty()) { 246 result.append(':'); 247 append(result, prefix); 248 } 249 250 result.append('='); 251 result.append('"'); 252 appendAttributeValue(result, namespaceURI, false); 253 result.append('"'); 254 } 255 } 256 257 EntityMask MarkupAccumulator::entityMaskForText(Text* text) const 258 { 259 const QualifiedName* parentName = 0; 260 if (text->parentElement()) 261 parentName = &static_cast<Element*>(text->parentElement())->tagQName(); 262 263 if (parentName && (*parentName == scriptTag || *parentName == styleTag || *parentName == xmpTag)) 264 return EntityMaskInCDATA; 265 266 return text->document()->isHTMLDocument() ? EntityMaskInHTMLPCDATA : EntityMaskInPCDATA; 267 } 268 269 void MarkupAccumulator::appendText(Vector<UChar>& out, Text* text) 270 { 271 appendNodeValue(out, text, m_range, entityMaskForText(text)); 272 } 273 274 void MarkupAccumulator::appendComment(Vector<UChar>& out, const String& comment) 275 { 276 // FIXME: Comment content is not escaped, but XMLSerializer (and possibly other callers) should raise an exception if it includes "-->". 277 append(out, "<!--"); 278 append(out, comment); 279 append(out, "-->"); 280 } 281 282 void MarkupAccumulator::appendDocumentType(Vector<UChar>& result, const DocumentType* n) 283 { 284 if (n->name().isEmpty()) 285 return; 286 287 append(result, "<!DOCTYPE "); 288 append(result, n->name()); 289 if (!n->publicId().isEmpty()) { 290 append(result, " PUBLIC \""); 291 append(result, n->publicId()); 292 append(result, "\""); 293 if (!n->systemId().isEmpty()) { 294 append(result, " \""); 295 append(result, n->systemId()); 296 append(result, "\""); 297 } 298 } else if (!n->systemId().isEmpty()) { 299 append(result, " SYSTEM \""); 300 append(result, n->systemId()); 301 append(result, "\""); 302 } 303 if (!n->internalSubset().isEmpty()) { 304 append(result, " ["); 305 append(result, n->internalSubset()); 306 append(result, "]"); 307 } 308 append(result, ">"); 309 } 310 311 void MarkupAccumulator::appendProcessingInstruction(Vector<UChar>& out, const String& target, const String& data) 312 { 313 // FIXME: PI data is not escaped, but XMLSerializer (and possibly other callers) this should raise an exception if it includes "?>". 314 append(out, "<?"); 315 append(out, target); 316 append(out, " "); 317 append(out, data); 318 append(out, "?>"); 319 } 320 321 void MarkupAccumulator::appendElement(Vector<UChar>& out, Element* element, Namespaces* namespaces) 322 { 323 appendOpenTag(out, element, namespaces); 324 325 NamedNodeMap* attributes = element->attributes(); 326 unsigned length = attributes->length(); 327 for (unsigned int i = 0; i < length; i++) 328 appendAttribute(out, element, *attributes->attributeItem(i), namespaces); 329 330 appendCloseTag(out, element); 331 } 332 333 void MarkupAccumulator::appendOpenTag(Vector<UChar>& out, Element* element, Namespaces* namespaces) 334 { 335 out.append('<'); 336 append(out, element->nodeNamePreservingCase()); 337 if (!element->document()->isHTMLDocument() && namespaces && shouldAddNamespaceElement(element)) 338 appendNamespace(out, element->prefix(), element->namespaceURI(), *namespaces); 339 } 340 341 void MarkupAccumulator::appendCloseTag(Vector<UChar>& out, Element* element) 342 { 343 if (shouldSelfClose(element)) { 344 if (element->isHTMLElement()) 345 out.append(' '); // XHTML 1.0 <-> HTML compatibility. 346 out.append('/'); 347 } 348 out.append('>'); 349 } 350 351 void MarkupAccumulator::appendAttribute(Vector<UChar>& out, Element* element, const Attribute& attribute, Namespaces* namespaces) 352 { 353 bool documentIsHTML = element->document()->isHTMLDocument(); 354 355 out.append(' '); 356 357 if (documentIsHTML) 358 append(out, attribute.name().localName()); 359 else 360 append(out, attribute.name().toString()); 361 362 out.append('='); 363 364 if (element->isURLAttribute(const_cast<Attribute*>(&attribute))) { 365 // We don't want to complete file:/// URLs because it may contain sensitive information 366 // about the user's system. 367 if (shouldResolveURLs() && !element->document()->url().isLocalFile()) 368 appendQuotedURLAttributeValue(out, element->document()->completeURL(attribute.value()).string()); 369 else 370 appendQuotedURLAttributeValue(out, attribute.value()); 371 } else { 372 out.append('\"'); 373 appendAttributeValue(out, attribute.value(), documentIsHTML); 374 out.append('\"'); 375 } 376 377 if (!documentIsHTML && namespaces && shouldAddNamespaceAttribute(attribute, *namespaces)) 378 appendNamespace(out, attribute.prefix(), attribute.namespaceURI(), *namespaces); 379 } 380 381 void MarkupAccumulator::appendCDATASection(Vector<UChar>& out, const String& section) 382 { 383 // FIXME: CDATA content is not escaped, but XMLSerializer (and possibly other callers) should raise an exception if it includes "]]>". 384 append(out, "<![CDATA["); 385 append(out, section); 386 append(out, "]]>"); 387 } 388 389 void MarkupAccumulator::appendStartMarkup(Vector<UChar>& result, const Node* node, Namespaces* namespaces) 390 { 391 if (namespaces) 392 namespaces->checkConsistency(); 393 394 switch (node->nodeType()) { 395 case Node::TEXT_NODE: 396 appendText(result, static_cast<Text*>(const_cast<Node*>(node))); 397 break; 398 case Node::COMMENT_NODE: 399 appendComment(result, static_cast<const Comment*>(node)->data()); 400 break; 401 case Node::DOCUMENT_NODE: 402 case Node::DOCUMENT_FRAGMENT_NODE: 403 break; 404 case Node::DOCUMENT_TYPE_NODE: 405 appendDocumentType(result, static_cast<const DocumentType*>(node)); 406 break; 407 case Node::PROCESSING_INSTRUCTION_NODE: 408 appendProcessingInstruction(result, static_cast<const ProcessingInstruction*>(node)->target(), static_cast<const ProcessingInstruction*>(node)->data()); 409 break; 410 case Node::ELEMENT_NODE: 411 appendElement(result, static_cast<Element*>(const_cast<Node*>(node)), namespaces); 412 break; 413 case Node::CDATA_SECTION_NODE: 414 appendCDATASection(result, static_cast<const CDATASection*>(node)->data()); 415 break; 416 case Node::ATTRIBUTE_NODE: 417 case Node::ENTITY_NODE: 418 case Node::ENTITY_REFERENCE_NODE: 419 case Node::NOTATION_NODE: 420 case Node::XPATH_NAMESPACE_NODE: 421 ASSERT_NOT_REACHED(); 422 break; 423 } 424 } 425 426 // Rules of self-closure 427 // 1. No elements in HTML documents use the self-closing syntax. 428 // 2. Elements w/ children never self-close because they use a separate end tag. 429 // 3. HTML elements which do not have a "forbidden" end tag will close with a separate end tag. 430 // 4. Other elements self-close. 431 bool MarkupAccumulator::shouldSelfClose(const Node* node) 432 { 433 if (node->document()->isHTMLDocument()) 434 return false; 435 if (node->hasChildNodes()) 436 return false; 437 if (node->isHTMLElement() && !elementCannotHaveEndTag(node)) 438 return false; 439 return true; 440 } 441 442 bool MarkupAccumulator::elementCannotHaveEndTag(const Node* node) 443 { 444 if (!node->isHTMLElement()) 445 return false; 446 447 // FIXME: ieForbidsInsertHTML may not be the right function to call here 448 // ieForbidsInsertHTML is used to disallow setting innerHTML/outerHTML 449 // or createContextualFragment. It does not necessarily align with 450 // which elements should be serialized w/o end tags. 451 return static_cast<const HTMLElement*>(node)->ieForbidsInsertHTML(); 452 } 453 454 void MarkupAccumulator::appendEndMarkup(Vector<UChar>& result, const Node* node) 455 { 456 if (!node->isElementNode() || shouldSelfClose(node) || (!node->hasChildNodes() && elementCannotHaveEndTag(node))) 457 return; 458 459 result.append('<'); 460 result.append('/'); 461 append(result, static_cast<const Element*>(node)->nodeNamePreservingCase()); 462 result.append('>'); 463 } 464 465 } 466