/*
 * Copyright (C) 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// How we handle the base tag better.
// Current status:
// Currently, the normal way we handle the base tag is:
// a) For those links which have corresponding local saved files, such as
// savable CSS, JavaScript files, they will be written as relative URLs which
// point to the local saved file. Those links can not be resolved as absolute
// file URLs because, if they were resolved as absolute URLs, then after moving
// the file location from one directory to another directory, the file URLs
// would be dead links.
// b) For those links which do not have corresponding local saved files, such
// as links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// FireFox also uses the above way to handle the base tag.
//
// Problem:
// This way can not handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the base
// URL of the page when loading the page. So when saving the page as
// completed-HTML, we assume that we save "www.yahoo.com" to "c:\yahoo.htm".
// After that, when we load the saved completed-HTML page, the JavaScript will
// insert a base tag <base href="http://www.yahoo.com/"...> into the DOM, so all
// URLs which point to local saved resource files will be resolved as
// "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource
// files to fail to load. Also the page will be rendered ugly since
// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
// files can not be fetched.
// Now FireFox, IE and WebKit based browsers all have this problem.
//
// Solution:
// My solution is that we comment out the old base tag and write a new base
// tag: <base href="." ...> after the previous commented base tag. In WebKit, it
// always uses the latest "href" attribute of the base tag to set the document's
// base URL. Based on this behavior, when we encounter a base tag, we comment it
// out and write a new base tag <base href="."> after the commented base tag.
// The newly added base tag can help the engine to locate the correct base URL
// for correctly loading local saved resource files. Also I think we need to
// inherit the base target value from the document object when appending the
// new base tag.
// If there are multiple base tags in the original document, we will comment
// out all old base tags and append a new base tag after each old base tag
// because we do not know whether those old base tags are original content or
// were added by JavaScript. If they were added by JavaScript, it means that
// when loading the saved page, the script(s) will still insert base tag(s)
// into the DOM, so the newly added base tag(s) can override the incorrect base
// URL and make sure we always load the correct local saved resource files.

#include "config.h"
#include "web/WebPageSerializerImpl.h"

#include "core/HTMLNames.h"
#include "core/dom/Document.h"
#include "core/dom/DocumentType.h"
#include "core/dom/Element.h"
#include "core/editing/markup.h"
#include "core/html/HTMLAllCollection.h"
#include "core/html/HTMLElement.h"
#include "core/html/HTMLFormElement.h"
#include "core/html/HTMLHtmlElement.h"
#include "core/html/HTMLMetaElement.h"
#include "core/loader/DocumentLoader.h"
#include "core/loader/FrameLoader.h"
#include "public/platform/WebVector.h"
#include "web/WebLocalFrameImpl.h"
#include "wtf/text/TextEncoding.h"

namespace blink {

// Maximum length of the data buffer which is used to temporarily save generated
// html content data. This is a soft limit which might be exceeded if a very
// large contiguous string is found in the page.
102 static const unsigned dataBufferCapacity = 65536; 103 104 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, 105 const WTF::TextEncoding& textEncoding, 106 Document* document, 107 const String& directoryName) 108 : url(url) 109 , textEncoding(textEncoding) 110 , document(document) 111 , directoryName(directoryName) 112 , isHTMLDocument(document->isHTMLDocument()) 113 , haveSeenDocType(false) 114 , haveAddedCharsetDeclaration(false) 115 , skipMetaElement(0) 116 , isInScriptOrStyleTag(false) 117 , haveAddedXMLProcessingDirective(false) 118 , haveAddedContentsBeforeEnd(false) 119 { 120 } 121 122 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( 123 const Element* element, SerializeDomParam* param, bool* needSkip) 124 { 125 StringBuilder result; 126 127 *needSkip = false; 128 if (param->isHTMLDocument) { 129 // Skip the open tag of original META tag which declare charset since we 130 // have overrided the META which have correct charset declaration after 131 // serializing open tag of HEAD element. 132 ASSERT(element); 133 if (isHTMLMetaElement(*element)) { 134 const HTMLMetaElement& meta = toHTMLMetaElement(*element); 135 // Check whether the META tag has declared charset or not. 136 String equiv = meta.httpEquiv(); 137 if (equalIgnoringCase(equiv, "content-type")) { 138 String content = meta.content(); 139 if (content.length() && content.contains("charset", false)) { 140 // Find META tag declared charset, we need to skip it when 141 // serializing DOM. 142 param->skipMetaElement = element; 143 *needSkip = true; 144 } 145 } 146 } else if (isHTMLHtmlElement(*element)) { 147 // Check something before processing the open tag of HEAD element. 148 // First we add doc type declaration if original document has it. 149 if (!param->haveSeenDocType) { 150 param->haveSeenDocType = true; 151 result.append(createMarkup(param->document->doctype())); 152 } 153 154 // Add MOTW declaration before html tag. 
155 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. 156 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url)); 157 } else if (isHTMLBaseElement(*element)) { 158 // Comment the BASE tag when serializing dom. 159 result.appendLiteral("<!--"); 160 } 161 } else { 162 // Write XML declaration. 163 if (!param->haveAddedXMLProcessingDirective) { 164 param->haveAddedXMLProcessingDirective = true; 165 // Get encoding info. 166 String xmlEncoding = param->document->xmlEncoding(); 167 if (xmlEncoding.isEmpty()) 168 xmlEncoding = param->document->encodingName(); 169 if (xmlEncoding.isEmpty()) 170 xmlEncoding = UTF8Encoding().name(); 171 result.appendLiteral("<?xml version=\""); 172 result.append(param->document->xmlVersion()); 173 result.appendLiteral("\" encoding=\""); 174 result.append(xmlEncoding); 175 if (param->document->xmlStandalone()) 176 result.appendLiteral("\" standalone=\"yes"); 177 result.appendLiteral("\"?>\n"); 178 } 179 // Add doc type declaration if original document has it. 180 if (!param->haveSeenDocType) { 181 param->haveSeenDocType = true; 182 result.append(createMarkup(param->document->doctype())); 183 } 184 } 185 return result.toString(); 186 } 187 188 String WebPageSerializerImpl::postActionAfterSerializeOpenTag( 189 const Element* element, SerializeDomParam* param) 190 { 191 StringBuilder result; 192 193 param->haveAddedContentsBeforeEnd = false; 194 if (!param->isHTMLDocument) 195 return result.toString(); 196 // Check after processing the open tag of HEAD element 197 if (!param->haveAddedCharsetDeclaration 198 && isHTMLHeadElement(*element)) { 199 param->haveAddedCharsetDeclaration = true; 200 // Check meta element. WebKit only pre-parse the first 512 bytes 201 // of the document. If the whole <HEAD> is larger and meta is the 202 // end of head part, then this kind of pages aren't decoded correctly 203 // because of this issue. 
So when we serialize the DOM, we need to 204 // make sure the meta will in first child of head tag. 205 // See http://bugs.webkit.org/show_bug.cgi?id=16621. 206 // First we generate new content for writing correct META element. 207 result.append(WebPageSerializer::generateMetaCharsetDeclaration( 208 String(param->textEncoding.name()))); 209 210 param->haveAddedContentsBeforeEnd = true; 211 // Will search each META which has charset declaration, and skip them all 212 // in PreActionBeforeSerializeOpenTag. 213 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) { 214 param->isInScriptOrStyleTag = true; 215 } 216 217 return result.toString(); 218 } 219 220 String WebPageSerializerImpl::preActionBeforeSerializeEndTag( 221 const Element* element, SerializeDomParam* param, bool* needSkip) 222 { 223 String result; 224 225 *needSkip = false; 226 if (!param->isHTMLDocument) 227 return result; 228 // Skip the end tag of original META tag which declare charset. 229 // Need not to check whether it's META tag since we guarantee 230 // skipMetaElement is definitely META tag if it's not 0. 231 if (param->skipMetaElement == element) { 232 *needSkip = true; 233 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) { 234 ASSERT(param->isInScriptOrStyleTag); 235 param->isInScriptOrStyleTag = false; 236 } 237 238 return result; 239 } 240 241 // After we finish serializing end tag of a element, we give the target 242 // element a chance to do some post work to add some additional data. 243 String WebPageSerializerImpl::postActionAfterSerializeEndTag( 244 const Element* element, SerializeDomParam* param) 245 { 246 StringBuilder result; 247 248 if (!param->isHTMLDocument) 249 return result.toString(); 250 // Comment the BASE tag when serializing DOM. 251 if (isHTMLBaseElement(*element)) { 252 result.appendLiteral("-->"); 253 // Append a new base tag declaration. 
254 result.append(WebPageSerializer::generateBaseTagDeclaration( 255 param->document->baseTarget())); 256 } 257 258 return result.toString(); 259 } 260 261 void WebPageSerializerImpl::saveHTMLContentToBuffer( 262 const String& result, SerializeDomParam* param) 263 { 264 m_dataBuffer.append(result); 265 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, 266 param, 267 DoNotForceFlush); 268 } 269 270 void WebPageSerializerImpl::encodeAndFlushBuffer( 271 WebPageSerializerClient::PageSerializationStatus status, 272 SerializeDomParam* param, 273 FlushOption flushOption) 274 { 275 // Data buffer is not full nor do we want to force flush. 276 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity) 277 return; 278 279 String content = m_dataBuffer.toString(); 280 m_dataBuffer.clear(); 281 282 CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables); 283 284 // Send result to the client. 285 m_client->didSerializeDataForFrame(param->url, 286 WebCString(encodedContent.data(), encodedContent.length()), 287 status); 288 } 289 290 void WebPageSerializerImpl::openTagToString(Element* element, 291 SerializeDomParam* param) 292 { 293 bool needSkip; 294 StringBuilder result; 295 // Do pre action for open tag. 296 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip)); 297 if (needSkip) 298 return; 299 // Add open tag 300 result.append('<'); 301 result.append(element->nodeName().lower()); 302 // Go through all attributes and serialize them. 
303 AttributeCollection attributes = element->attributes(); 304 AttributeCollection::iterator end = attributes.end(); 305 for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) { 306 result.append(' '); 307 // Add attribute pair 308 result.append(it->name().toString()); 309 result.appendLiteral("=\""); 310 if (!it->value().isEmpty()) { 311 const String& attrValue = it->value(); 312 313 // Check whether we need to replace some resource links 314 // with local resource paths. 315 const QualifiedName& attrName = it->name(); 316 if (element->hasLegalLinkAttribute(attrName)) { 317 // For links start with "javascript:", we do not change it. 318 if (attrValue.startsWith("javascript:", false)) { 319 result.append(attrValue); 320 } else { 321 // Get the absolute link 322 WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element); 323 String completeURL = subFrame ? subFrame->frame()->document()->url() : 324 param->document->completeURL(attrValue); 325 // Check whether we have local files for those link. 326 if (m_localLinks.contains(completeURL)) { 327 if (!param->directoryName.isEmpty()) { 328 result.appendLiteral("./"); 329 result.append(param->directoryName); 330 result.append('/'); 331 } 332 result.append(m_localLinks.get(completeURL)); 333 } else { 334 result.append(completeURL); 335 } 336 } 337 } else { 338 if (param->isHTMLDocument) 339 result.append(m_htmlEntities.convertEntitiesInString(attrValue)); 340 else 341 result.append(m_xmlEntities.convertEntitiesInString(attrValue)); 342 } 343 } 344 result.append('\"'); 345 } 346 347 // Do post action for open tag. 348 String addedContents = postActionAfterSerializeOpenTag(element, param); 349 // Complete the open tag for element when it has child/children. 350 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) 351 result.append('>'); 352 // Append the added contents generate in post action of open tag. 
353 result.append(addedContents); 354 // Save the result to data buffer. 355 saveHTMLContentToBuffer(result.toString(), param); 356 } 357 358 // Serialize end tag of an specified element. 359 void WebPageSerializerImpl::endTagToString(Element* element, 360 SerializeDomParam* param) 361 { 362 bool needSkip; 363 StringBuilder result; 364 // Do pre action for end tag. 365 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip)); 366 if (needSkip) 367 return; 368 // Write end tag when element has child/children. 369 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) { 370 result.appendLiteral("</"); 371 result.append(element->nodeName().lower()); 372 result.append('>'); 373 } else { 374 // Check whether we have to write end tag for empty element. 375 if (param->isHTMLDocument) { 376 result.append('>'); 377 // FIXME: This code is horribly wrong. WebPageSerializerImpl must die. 378 if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) { 379 // We need to write end tag when it is required. 380 result.appendLiteral("</"); 381 result.append(element->nodeName().lower()); 382 result.append('>'); 383 } 384 } else { 385 // For xml base document. 386 result.appendLiteral(" />"); 387 } 388 } 389 // Do post action for end tag. 390 result.append(postActionAfterSerializeEndTag(element, param)); 391 // Save the result to data buffer. 392 saveHTMLContentToBuffer(result.toString(), param); 393 } 394 395 void WebPageSerializerImpl::buildContentForNode(Node* node, 396 SerializeDomParam* param) 397 { 398 switch (node->nodeType()) { 399 case Node::ELEMENT_NODE: 400 // Process open tag of element. 401 openTagToString(toElement(node), param); 402 // Walk through the children nodes and process it. 403 for (Node *child = node->firstChild(); child; child = child->nextSibling()) 404 buildContentForNode(child, param); 405 // Process end tag of element. 
406 endTagToString(toElement(node), param); 407 break; 408 case Node::TEXT_NODE: 409 saveHTMLContentToBuffer(createMarkup(node), param); 410 break; 411 case Node::ATTRIBUTE_NODE: 412 case Node::DOCUMENT_NODE: 413 case Node::DOCUMENT_FRAGMENT_NODE: 414 // Should not exist. 415 ASSERT_NOT_REACHED(); 416 break; 417 // Document type node can be in DOM? 418 case Node::DOCUMENT_TYPE_NODE: 419 param->haveSeenDocType = true; 420 default: 421 // For other type node, call default action. 422 saveHTMLContentToBuffer(createMarkup(node), param); 423 break; 424 } 425 } 426 427 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, 428 bool recursiveSerialization, 429 WebPageSerializerClient* client, 430 const WebVector<WebURL>& links, 431 const WebVector<WebString>& localPaths, 432 const WebString& localDirectoryName) 433 : m_client(client) 434 , m_recursiveSerialization(recursiveSerialization) 435 , m_framesCollected(false) 436 , m_localDirectoryName(localDirectoryName) 437 , m_htmlEntities(false) 438 , m_xmlEntities(true) 439 { 440 // Must specify available webframe. 441 ASSERT(frame); 442 m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame); 443 // Make sure we have non 0 client. 444 ASSERT(client); 445 // Build local resources map. 446 ASSERT(links.size() == localPaths.size()); 447 for (size_t i = 0; i < links.size(); i++) { 448 KURL url = links[i]; 449 ASSERT(!m_localLinks.contains(url.string())); 450 m_localLinks.set(url.string(), localPaths[i]); 451 } 452 453 ASSERT(m_dataBuffer.isEmpty()); 454 } 455 456 void WebPageSerializerImpl::collectTargetFrames() 457 { 458 ASSERT(!m_framesCollected); 459 m_framesCollected = true; 460 461 // First, process main frame. 462 m_frames.append(m_specifiedWebLocalFrameImpl); 463 // Return now if user only needs to serialize specified frame, not including 464 // all sub-frames. 465 if (!m_recursiveSerialization) 466 return; 467 // Collect all frames inside the specified frame. 
468 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) { 469 WebLocalFrameImpl* currentFrame = m_frames[i]; 470 // Get current using document. 471 Document* currentDoc = currentFrame->frame()->document(); 472 // Go through sub-frames. 473 RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all(); 474 475 for (unsigned i = 0; Element* element = all->item(i); ++i) { 476 if (!element->isHTMLElement()) 477 continue; 478 WebLocalFrameImpl* webFrame = 479 WebLocalFrameImpl::fromFrameOwnerElement(element); 480 if (webFrame) 481 m_frames.append(webFrame); 482 } 483 } 484 } 485 486 bool WebPageSerializerImpl::serialize() 487 { 488 if (!m_framesCollected) 489 collectTargetFrames(); 490 491 bool didSerialization = false; 492 KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url(); 493 494 for (unsigned i = 0; i < m_frames.size(); ++i) { 495 WebLocalFrameImpl* webFrame = m_frames[i]; 496 Document* document = webFrame->frame()->document(); 497 const KURL& url = document->url(); 498 499 if (!url.isValid() || !m_localLinks.contains(url.string())) 500 continue; 501 502 didSerialization = true; 503 504 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding(); 505 String directoryName = url == mainURL ? m_localDirectoryName : ""; 506 507 SerializeDomParam param(url, textEncoding, document, directoryName); 508 509 Element* documentElement = document->documentElement(); 510 if (documentElement) 511 buildContentForNode(documentElement, ¶m); 512 513 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, ¶m, ForceFlush); 514 } 515 516 ASSERT(m_dataBuffer.isEmpty()); 517 m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished); 518 return didSerialization; 519 } 520 521 } // namespace blink 522